## Analysis: old experiment vs new experiment
Algorithm for computing maximal blocks has changed


In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# time max blocks
def load_times(name_msa):
    path_times = [f"output/block_decomposition/{name_msa}.txt",f"output/max_blocks/{name_msa}.txt"]
    times = {}

    for path_time in path_times:
        with open(path_time) as fp:
            for line in fp.readlines():
                name, time = line.replace("\n","").split("\t")
                times[name] = float(time)
    
    return times


Index(['Unnamed: 0', 'timestamp_x', 'path_msa', 'path_blocks_x', 'n_seqs_x',
       'n_unique_seqs_x', 'n_cols_x', 'n_max_blocks_x', 't_x',
       'blocks_with_overlap_x', 'inter_between_blocks_x', 'timestamp_y',
       'path_blocks_y', 'n_seqs_y', 'n_unique_seqs_y', 'n_cols_y',
       'n_max_blocks_y', 't_y', 'blocks_with_overlap_y',
       'inter_between_blocks_y', 'diff-max-sub', 'name_msa'],
      dtype='object')

### old experiments
- `y`: maximal blocks from sub-alignments
- `x`: maximal blocks from the MSA

In [6]:
old_experiments = pd.read_csv("../data/output-old-experiments.tsv", sep="\t")
old_experiments["name_msa"] = old_experiments["path_msa"].apply(lambda path_msa: Path(path_msa).stem)
old_experiments.columns

Index(['Unnamed: 0', 'timestamp_x', 'path_msa', 'path_blocks_x', 'n_seqs_x',
       'n_unique_seqs_x', 'n_cols_x', 'n_max_blocks_x', 't_x',
       'blocks_with_overlap_x', 'inter_between_blocks_x', 'timestamp_y',
       'path_blocks_y', 'n_seqs_y', 'n_unique_seqs_y', 'n_cols_y',
       'n_max_blocks_y', 't_y', 'blocks_with_overlap_y',
       'inter_between_blocks_y', 'diff-max-sub', 'name_msa'],
      dtype='object')

### regarding maximal blocks
- `total_blocks`: number of maximal blocks in the MSA
- `blocks_with_overlap`: blocks with at least one overlap
- `inter_between_blocks`: number of intersections between pairs of blocks (that has to be decomposed)

In [3]:
# maximal blocks
stats_max_blocks = pd.concat(
                            [pd.read_csv(csv, sep="\t", index_col=False) for csv in Path("output/max_blocks/stats").rglob("*.tsv")],
                            axis=0
                            )
stats_max_blocks["name_msa"] = stats_max_blocks["filename"].apply(lambda filename: Path(filename).stem)
stats_max_blocks


Unnamed: 0,timestamp,filename,total_blocks,blocks_with_overlap,inter_between_blocks,name_msa
0,Fri Dec 9 21:23:50 2022,output/max_blocks/Cluster_13405.json,2,0,0,Cluster_13405
0,Fri Dec 9 21:23:51 2022,output/max_blocks/Cluster_13063.json,4,3,4,Cluster_13063
0,Fri Dec 9 21:23:53 2022,output/max_blocks/Cluster_8584.json,1,0,0,Cluster_8584
0,Fri Dec 9 21:23:19 2022,output/max_blocks/Cluster_4991.json,128,127,4214,Cluster_4991
0,Fri Dec 9 21:23:49 2022,output/max_blocks/Cluster_13250.json,2,0,0,Cluster_13250
...,...,...,...,...,...,...
0,Fri Dec 9 21:23:51 2022,output/max_blocks/Cluster_7464.json,7,2,5,Cluster_7464
0,Fri Dec 9 21:23:52 2022,output/max_blocks/GC00006096.json,2,0,0,GC00006096
0,Fri Dec 9 21:23:28 2022,output/max_blocks/GC00005757_2.json,53,51,328,GC00005757_2
0,Fri Dec 9 21:23:28 2022,output/max_blocks/Cluster_13342.json,7,5,12,Cluster_13342


### regarding times
The computation of maximal blocks is as follow: 
1. create a suffix tree as suggested in Alanko's paper
2. find maximal repeats, which will be positional strings for us
3. map the positional strings back to the MSA to identify the maximal blocks
4. decompose the set of maximal blocks to find the set $\mathcal{B}$

then, the reported times are: 
- `t_pos_string`: time to compute positional strings (step 1)
- `t_max_blocks`: time to find maximal blocks (step 2)
- `t_decomp_blocks`: time to decompose pairs of blocks (step 3)

In [12]:
# block decomposition
stats_block_decomp = pd.concat(
                            [pd.read_csv(csv, sep="\t", index_col=False) for csv in Path("output/block_decomposition/stats").rglob("*.tsv")],
                            axis=0
                            )
stats_block_decomp["name_msa"] = stats_block_decomp["filename"].apply(lambda filename: Path(filename).stem)
stats_block_decomp[["t_decomp_blocks","t_pos_string","t_max_blocks"]] = stats_block_decomp["name_msa"].apply(
    lambda name_msa: pd.Series(load_times(name_msa)))

# to avoid confusion with the total of maximal blocks in stats_max_blocks
stats_block_decomp.rename({"total_blocks": "size_set_B"}, axis=1, inplace=True)

In [13]:
stats_block_decomp.query("name_msa == 'Cluster_13063'")

Unnamed: 0,timestamp,filename,size_set_B,name_msa,t_decomp_blocks,t_pos_string,t_max_blocks
0,Fri Dec 9 21:23:51 2022,output/block_decomposition/Cluster_13063.json,7,Cluster_13063,0.000731,0.0219,0.0


In [21]:
## compare
cols_old = ["name_msa","n_unique_seqs_x","n_seqs_x","n_cols_x","t_x","t_y"] # old experiments
cols_smb = ["name_msa","total_blocks"]#"blocks_with_overlap","inter_between_blocks"] # stats max blocks
cols_sbd = ["name_msa","size_set_B","t_pos_string","t_max_blocks","t_decomp_blocks"]

In [22]:
stats_new_exp = pd.merge(stats_max_blocks[cols_smb],stats_block_decomp[cols_sbd], on="name_msa")

In [23]:
stats_new_exp

Unnamed: 0,name_msa,total_blocks,size_set_B,t_pos_string,t_max_blocks,t_decomp_blocks
0,Cluster_13405,2,2,0.0103,0.0000,0.000324
1,Cluster_13063,4,7,0.0219,0.0000,0.000731
2,Cluster_8584,1,1,0.0521,0.0000,0.000356
3,Cluster_4991,128,1293,0.7913,0.0043,0.417854
4,Cluster_13250,2,2,0.0164,0.0000,0.000484
...,...,...,...,...,...,...
164,Cluster_7464,7,13,0.0700,0.0000,0.000942
165,GC00006096,2,2,0.0566,0.0000,0.000433
166,GC00005757_2,53,262,0.3399,0.0003,0.022772
167,Cluster_13342,7,15,0.0231,0.0002,0.001554


In [27]:
stats_old_exp = old_experiments[cols_old].rename({
    "t_x": "t_max_blocks_msa",
    "t_y": "t_max_blocks_sub",
    "n_unique_seqs_x": "n_unique_seqs",
    "n_seqs_x": "n_seqs_msa",
    "n_cols_x": "n_cols_msa"

}, axis=1)

In [28]:
comparison = pd.merge(stats_old_exp, stats_new_exp, on="name_msa")

In [32]:
# time to compute maximal blocks
# old time / new time
comparison["t_max_blocks_old_over_new"] = comparison.apply(lambda row: row["t_max_blocks_msa"] / (row["t_max_blocks"] + row["t_pos_string"]) ,axis=1)  

In [38]:
comparison.describe()

Unnamed: 0,n_unique_seqs,n_seqs_msa,n_cols_msa,t_max_blocks_msa,t_max_blocks_sub,total_blocks,size_set_B,t_pos_string,t_max_blocks,t_decomp_blocks,t_max_blocks_old_over_new
count,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0
mean,5.621302,70.710059,128.976331,20.332148,41.860547,26.615385,195.390533,0.175944,0.000692,0.040332,93.371789
std,4.637941,72.176734,58.024108,28.531255,91.060377,36.649401,426.182698,0.216977,0.001467,0.12278,38.294691
min,2.0,2.0,31.0,0.291457,0.030802,1.0,1.0,0.0077,0.0,0.000293,31.406536
25%,2.0,10.0,72.0,2.420653,0.647055,4.0,5.0,0.0353,0.0,0.000506,56.855859
50%,4.0,34.0,136.0,7.811453,5.150988,11.0,26.0,0.0875,0.0001,0.002259,94.294423
75%,7.0,143.0,183.0,25.880495,32.42943,33.0,137.0,0.2322,0.0007,0.014839,124.1235
max,24.0,308.0,216.0,180.123356,622.211261,197.0,2695.0,1.1603,0.0125,1.055623,188.505544


In [42]:
max_ncols = comparison["n_cols_msa"].max()
comparison.query(f"`n_cols_msa`=={max_ncols}")

Unnamed: 0,name_msa,n_unique_seqs,n_seqs_msa,n_cols_msa,t_max_blocks_msa,t_max_blocks_sub,total_blocks,size_set_B,t_pos_string,t_max_blocks,t_decomp_blocks,t_max_blocks_old_over_new
75,Cluster_4804,7,46,216,56.757239,5.285068,29,157,0.3984,0.0003,0.019331,142.355753
84,GC00001613,24,307,216,180.123356,94.750111,197,2695,1.1603,0.0125,1.055623,153.584035
87,GC00003934_5,2,2,216,11.536539,19.635494,13,13,0.0612,0.0,0.000745,188.505544


In [41]:
max_nrows = comparison["n_unique_seqs"].max()
comparison.query(f"`n_unique_seqs`=={max_nrows}")

Unnamed: 0,name_msa,n_unique_seqs,n_seqs_msa,n_cols_msa,t_max_blocks_msa,t_max_blocks_sub,total_blocks,size_set_B,t_pos_string,t_max_blocks,t_decomp_blocks,t_max_blocks_old_over_new
84,GC00001613,24,307,216,180.123356,94.750111,197,2695,1.1603,0.0125,1.055623,153.584035
