## Examples

### `Block`

In [1]:
from pathlib import Path
from src.blocks import BlockAnalyzer, Block, block_decomposition, Decomposer

In [4]:
# Build a Block from keyword arguments and call its to_positional_string() method.
Block(K=[1,2,3],i=2,j=7,label="ACGTAC").to_positional_string()

PositionalString(b='ACGTAC', i=2, j=7)

In [2]:
# Load the list of blocks stored in a JSON file.
# NOTE(review): _load_list_blocks is underscore-prefixed (private by convention);
# confirm it is meant to be called from outside BlockAnalyzer.
path_blocks = Path("out/blocks/Cluster_10644.json")
list_blocks = BlockAnalyzer()._load_list_blocks(path_blocks)

In [3]:
# Show the blocks loaded from path_blocks.
list_blocks

[Block(K=(1, 2), i=0, j=75, label='CACGATTCTCCCTTTGAGTTGATGAGGTTTCAGGGAAAAGGATAGCTGATTCTCCGCTTTTGCAAGTATGAAAGGC'),
 Block(K=(0, 1, 2), i=0, j=15, label='CACGATTCTCCCTTTG'),
 Block(K=(0, 1), i=17, j=81, label='GTTGATGAGGTTTCAGGGAAAAGGATAGCTGATTCTCCGCTTTTGCAAGTATGAAAGGCGAAAAA'),
 Block(K=(0, 1, 2), i=17, j=75, label='GTTGATGAGGTTTCAGGGAAAAGGATAGCTGATTCTCCGCTTTTGCAAGTATGAAAGGC'),
 Block(K=(0, 1, 2), i=77, j=81, label='AAAAA')]

In [4]:
# Apply a Decomposer instance to the loaded blocks; the result is kept in new_blocks.
decomposer=Decomposer()
new_blocks = decomposer(list_blocks)

In [5]:
# Number of blocks after vs. before applying the decomposer.
len(new_blocks), len(list_blocks)

(9, 5)

___
## Sub alignments

In [3]:
import random

# Seed the generator so that Restart & Run All always builds the same vector
# (and therefore the same splits in the cells that follow).
random.seed(0)
vec = [random.choice([True, False]) for _ in range(10)]

In [4]:
# Show the generated boolean vector.
vec

[True, True, True, True, False, True, False, False, True, True]

In [62]:
def split_vec_by_consecutive_values(vec):
    """Split ``vec`` into maximal runs of equal consecutive values.

    Parameters
    ----------
    vec : sequence
        Indexable sequence supporting ``len`` whose items are compared
        pairwise with their neighbour.

    Returns
    -------
    list of tuple of int
        One ``(start, end)`` pair per run, in order of appearance. Both
        indices are inclusive, so ``vec[start:end + 1]`` gives the values of
        the run. An empty ``vec`` yields an empty list.
    """
    # Nothing to split: do not report a spurious (0, 0) run for empty input.
    if len(vec) == 0:
        return []

    splits = []
    start = 0
    for pos in range(1, len(vec)):
        # A change of value closes the current run at the previous position.
        if vec[pos] != vec[pos - 1]:
            splits.append((start, pos - 1))
            start = pos

    # Close the last run, which always ends at the final position.
    splits.append((start, len(vec) - 1))
    return splits

# Compute the (start, end) index pairs of the runs of equal values in vec.
splits = split_vec_by_consecutive_values(vec)

# Both bounds are inclusive, hence the end+1 when slicing.
# Note: start and end stay bound to the last pair once the loop finishes.
for start,end in splits:
    print(vec[start:end+1])

[True, True, True, True]
[False]
[True]
[False, False]
[True, True]


In [64]:
# Length of the last split: start and end still hold the values bound by the
# final iteration of the loop in the previous cell.
end-start+1

2

## Suffix tree and maximal repeats

In [2]:
import json
from Bio import AlignIO
from suffix_tree import Tree

In [3]:
filename = "../data/Cluster_4991.fa"
align = AlignIO.read(filename, "fasta")
n_cols = align.get_alignment_length()
n_seqs = len(align)

# Drop duplicated sequences while keeping the order of first appearance in the
# alignment. Going through a set would order the strings differently from run
# to run (str hashes are randomized per process), which changes the index each
# sequence receives in the tree built below.
seqs = list(dict.fromkeys(str(record.seq) for record in align))
n_unique_seqs = len(seqs)

# Each sequence is fed as (position, character) pairs, keyed by its index in seqs.
tree = Tree({num: enumerate(seq) for num, seq in enumerate(seqs)})
blocks = [path for (c, path) in tree.maximal_repeats()]

# For every repeat keep its first position, its last position and its characters.
decoded_blocks = [
    (
        b[0][0],   # position of the first symbol
        b[-1][0],  # position of the last symbol
        "".join(c[1] for c in b if isinstance(c, tuple)),  # characters of the (position, character) pairs
    )
    for b in blocks
]

In [4]:
# NOTE(review): this bare expression is not the last statement of the cell, so
# its value is presumably not displayed; move it to its own cell if a display
# was intended.
decoded_blocks
# Print the elements of the first entry of blocks, one per line.
b = blocks[0]
for c in b:
    print(c)

(0, 'C')
(1, 'C')
(2, 'T')
(3, 'T')
(4, 'T')
(5, 'A')
(6, 'A')
(7, 'T')
(8, 'T')


## Intersection of blocks
- number of blocks that intersect
- intersections between blocks

In [11]:
import json 
from pathlib import Path
from src.blocks import Block, BlockAnalyzer

# Read the serialized blocks and rebuild one Block per stored entry, passing
# the items of each entry as positional arguments.
path_blocks = Path("output/max_blocks/Cluster_7464.json")

with path_blocks.open() as fp:
    blocks = [Block(*args) for args in json.load(fp)]
blocks

[Block(K=(0, 1), i=0, j=71, label='CTTGTTGTCTGATTATTGATTTTTCGCGAAACCATTTGATCATATGACAAGATGTGTATCCACCTTAACTTA'),
 Block(K=(0, 1, 2), i=1, j=23, label='TTGTTGTCTGATTATTGATTTTT'),
 Block(K=(0, 1, 2), i=25, j=59, label='GCGAAACCATTTGATCATATGACAAGATGTGTATC'),
 Block(K=(0, 1, 2), i=61, j=71, label='ACCTTAACTTA'),
 Block(K=(0, 2), i=61, j=79, label='ACCTTAACTTAATGATTTT'),
 Block(K=(0, 2), i=81, j=81, label='A'),
 Block(K=(0, 2), i=84, j=142, label='AAAATCATTAGGGGATTCATCAGGACTACGCCCCCTCATATCACATGGAAGGTTTATCT')]

In [12]:
# Calling the analyzer on the list of blocks produces the summary shown below.
block_analyzer = BlockAnalyzer()
block_analyzer(blocks)

{'number_of_blocks': 7, 'blocks_with_overlap': 2, 'inter_between_blocks': 5}

In [14]:
# Matrix returned by the underscore-prefixed helper _matrix_inter_blocks.
# NOTE(review): presumably entry [r, c] > 0 marks an intersection between
# blocks r and c — confirm against the BlockAnalyzer implementation.
matrix_inter_blocks = block_analyzer._matrix_inter_blocks(blocks)

In [19]:
import numpy as np 
# Indices of the rows (sums along axis=1) and of the columns (sums along
# axis=0) of the matrix whose total is positive.
rows = np.where(matrix_inter_blocks.sum(axis=1)>0)[0]
cols = np.where(matrix_inter_blocks.sum(axis=0)>0)[0]

In [21]:
# Indices that appear in rows or in cols.
set(rows) | set(cols)

{0, 1, 2, 3, 4}

In [23]:
# Row and column indices of every positive entry of the matrix.
# Note: this rebinds rows and cols, replacing the values computed in an earlier cell.
rows, cols = np.where(matrix_inter_blocks > 0)

In [24]:
# Indices that appear in rows or in cols.
set(rows) | set(cols)

{0, 1, 2, 3, 4}