In [843]:
import pandas as pd
import numpy as np
import pickle
from Bio import SeqIO
from Bio import Phylo
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import matplotlib.pyplot as plt

# Remove recombinations from MSA

+ input: MSA (fa, xmfa), tree, predicted importations 
+ output 1: MSA without recombination
    + fasta
    + phylip
+ output 2: table of protein ids assigned to importations 


# Results for the test sets

## MSA lengths
+ rso-test (fjat) = 3,825,198
+ phy2 = 2,987,190
+ phy4 = 3,219,942
+ phy3 = 3,459,843 (xmfa: 6,947,843)

(for ASM359060v in phy2 dataset the summed length is 116,425)

## Summed recombinations length
+ rso = 92,925
+ phy2 = 290,218
+ phy4 = 58,063
+ phy3 = 120,259

(phy4 strain PSI07 is from Indonesia! very different)

In [844]:
# in
recombis = '/Users/devseeva/Desktop/work/sm_workflow/snakefiles/outputs/rso_test_cfml/rso_test_cfml.importation_status.txt'
path2tree = '/Users/devseeva/Desktop/work/sm_workflow/snakefiles/outputs/rso_test_cfml/rso_test_cfml.labelled_tree.newick'
xm_msa_in = "/Users/devseeva/Desktop/work/sm_workflow/snakefiles/rso_test_concat.xmfa"
fa_msa_in = "/Users/devseeva/Desktop/work/sm_workflow/snakefiles/rso_test_concat.fasta"
accDB = '/Users/devseeva/Desktop/work/sm_workflow/snakefiles/outputs/MSA_ACC_DB_rso_test.pkl'
protDB = '/Users/devseeva/Desktop/work/sm_workflow/snakefiles/outputs/GENOME_PROTEIN_DB_rso_test.pkl'
# out
fa_msa_out = "/Users/devseeva/Desktop/work/sm_workflow/snakefiles/rso_test_concat_no_RR.fasta"
ph_msa_out = "/Users/devseeva/Desktop/work/sm_workflow/snakefiles/rso_test_concat_no_RR.phylip"
updated_recombis = '/Users/devseeva/Desktop/ffffffff.csv'

#recombis = recombis.replace("rso_test", 'phy3_test')
#path2tree = path2tree.replace("rso_test", 'phy3_test')
#msa_in = msa_in.replace("rso_test", 'phy3_test')
#fa_msa_in = fa_msa_in.replace("rso_test", 'phy3_test')
#msa_out = msa_out.replace("rso_test", 'phy3_test')

# Replace the inner nodes with the leaves 

In [845]:
imp_stat = pd.read_csv(recombis, sep='\t')
imp_stat

Unnamed: 0,Node,Beg,End
0,ASM188753v,124124,124179
1,ASM188753v,1020070,1020087
2,ASM188753v,1232622,1232635
3,ASM188753v,1777589,1777601
4,ASM188753v,2133380,2133392
...,...,...,...
799,NODE_9,7647056,7647090
800,NODE_9,7647510,7647589
801,NODE_9,7647970,7648150
802,NODE_9,7649140,7649177


In [846]:
for tree in Phylo.parse(path2tree, 'newick'):
    parents = {}
    # iterate through inner nodes 
    for clade in tree.get_nonterminals(order="level"):
        parents[clade.name] = []
        # get leaves for each inner node
        for child in clade.get_terminals():
            parents[clade.name].append(child.name)
        #parents[clade.name] = ";".join(parents[clade.name])
parents

{'NODE_11': ['ASM1330633',
  'ASM1330693',
  'ASM1330643',
  'ASM1330623',
  'ASM1337573',
  'ASM188753v'],
 'NODE_10': ['ASM1330633',
  'ASM1330693',
  'ASM1330643',
  'ASM1330623',
  'ASM1337573'],
 'NODE_9': ['ASM1330633', 'ASM1330693', 'ASM1330643', 'ASM1330623'],
 'NODE_8': ['ASM1330633', 'ASM1330693', 'ASM1330643'],
 'NODE_7': ['ASM1330633', 'ASM1330693']}

In [847]:
new_len = len(imp_stat)

for p in parents:

    is_node = imp_stat["Node"].str.startswith(p)
    num_child = len(parents[p]) 
    print(p,'has',len(imp_stat[is_node]),'recombinations and',num_child,'children')
    new_len = new_len + len(imp_stat[is_node]) * (num_child - 1)
    
    #imp_stat = imp_stat.append([imp_stat[is_node]]*(num_child-1), ignore_index=True)
    print()
    
    # for each child: duplicate the parent row
    for child in parents[p]:       
        node_imp = imp_stat[is_node].replace({p:child}) 
        imp_stat = imp_stat.append([node_imp], ignore_index=True)
        is_node = imp_stat["Node"].str.startswith(p)

# remove the remaining parent rows

imp_stat = imp_stat.loc[~imp_stat["Node"].str.contains("NODE_")]
print('DF length after parent nodes expantion =', len(imp_stat), '; Must be =', new_len)
assert(new_len == len(imp_stat))

NODE_11 has 0 recombinations and 6 children

NODE_10 has 0 recombinations and 5 children

NODE_9 has 96 recombinations and 4 children

NODE_8 has 137 recombinations and 3 children

NODE_7 has 149 recombinations and 2 children

DF length after parent nodes expantion = 1515 ; Must be = 1515


# Sort and sum the importations

In [848]:
imp_stat = imp_stat.sort_values(by=['Beg', 'End'])
#imp_stat

In [849]:
beg = imp_stat["Beg"].values.copy()
end = imp_stat["End"].values.copy()

In [850]:
ranges2del = []
for i in range(len(imp_stat)):
    ranges2del.extend(range(beg[i]-1, end[i]))
ranges2del = list(dict.fromkeys(ranges2del))
print('Summed recombination length:', len(ranges2del))

Summed recombination length: 92925


# Fasta vs. XMFA positions mapping list

* list of updated positions 
* index represent Fasta sequence positions 
* values represent XMFA sequence positions 

In [851]:
# parse the genes MSAs into vector of the alignments' lengths
aln_length = []
aln = open(xm_msa_in, 'r')

for l in aln:
    
    if l.startswith('>'):
        c = 0
    elif l.startswith('='):
        aln_length.append(c-1) # minus one char for the \n symbol
    else:
        c = c + len(l)
        
print('Found genes MSAs / LCBs:', len(aln_length))    
aln.close()

Found genes MSAs / LCBs: 3842


In [852]:
# inser 1000 count after each gene MSA
index_mapping = list(range(0,aln_length[0]))
sum_lengths = aln_length[0]

for i in range(1,len(aln_length)):  
    index_mapping.extend(range(sum_lengths + i*1000, sum_lengths + aln_length[i] + i*1000))
    sum_lengths = sum_lengths + aln_length[i]
    

In [853]:
print('Fasta length', sum(aln_length), '\nXMFA length', sum(aln_length)+(len(aln_length)-1)*1000)
# index = fasta file length
assert(sum(aln_length) == len(index_mapping))
# values = xmfa file length
assert(sum(aln_length)+(len(aln_length)-1)*1000 == index_mapping[-1]+1) 

Fasta length 3825198 
XMFA length 7666198


# Get the fasta position of each insertion from the xmfa position

+ use ranges to improve the running time of .index(i, start, end)

In [854]:
fa_beg = []
fa_end = []
previous_beg = 0
previous_end = 0

for r_i in range(len(imp_stat)):
    
    # -1 for 0-start indexing
    r_beg = beg[r_i] - 1
    r_end = end[r_i] - 1
    #print(r_beg, r_end)
    
    # USE PREVIOUS_BEG FOR BOTH BECAUSE OF THE SORTING ISSUE
    r_beg_index = index_mapping.index(r_beg, previous_beg, r_beg+1)
    r_end_index = index_mapping.index(r_end, previous_beg, r_end+1)
    
    previous_beg = r_beg_index
    previous_end = r_end_index
    
    fa_beg.append(r_beg_index)
    fa_end.append(r_end_index)
    
    #print("xmfa - fa:",rr, rr_index)

imp_stat['FA Beg'] = fa_beg
imp_stat['FA End'] = fa_end

In [855]:
imp_stat['debug_len_must'] = imp_stat['End'] - imp_stat['Beg']
imp_stat["debug_len_is"] = imp_stat['FA End'] - imp_stat['FA Beg']
assert(len(imp_stat.loc[imp_stat['debug_len_must'] != imp_stat['debug_len_is']]) == 0)
imp_stat

Unnamed: 0,Node,Beg,End,FA Beg,FA End,debug_len_must,debug_len_is
1599,ASM1330633,102393,102447,33392,33446,54,54
1748,ASM1330693,102393,102447,33392,33446,54,54
0,ASM188753v,124124,124179,43123,43178,55,55
1188,ASM1330633,126319,126348,44318,44347,29,29
1325,ASM1330693,126319,126348,44318,44347,29,29
...,...,...,...,...,...,...,...
1324,ASM1330633,7655224,7655293,3817223,3817292,69,69
1461,ASM1330693,7655224,7655293,3817223,3817292,69,69
1598,ASM1330643,7655224,7655293,3817223,3817292,69,69
122,ASM188753v,7657952,7658087,3818951,3819086,135,135


# Delete the recombinations from the fasta file 

In [856]:
fasta_out = open(fa_msa_out, 'w')

for record in SeqIO.parse(fa_msa_in, 'fasta'):
    
    fasta_out.write('>' + record.id + '\n')
    genome_imp_stat = imp_stat.loc[imp_stat['Node'] == record.id]
    beg = genome_imp_stat["FA Beg"].values.copy()
    end = genome_imp_stat["FA End"].values.copy()
    
    ranges2del = []
    for i in range(len(genome_imp_stat)):
        ranges2del.extend(range(beg[i]-1, end[i]))
    ranges2del = list(dict.fromkeys(ranges2del))
    
    print('Genome',record.id,'has',len(ranges2del),'recombinations')
    #print(ranges2del)
    
    print('Number of gaps before',record.seq.count('-'))
    seq = list(record.seq)
    for to_del in ranges2del:
        seq[to_del] = '-'
    seq = ''.join(seq)
    print('Number of gaps after',seq.count('-'))
    print()
    fasta_out.write(seq + '\n')
    
fasta_out.close()

Genome ASM188753v has 17449 recombinations
Number of gaps before 8946
Number of gaps after 26191

Genome ASM1330623 has 26781 recombinations
Number of gaps before 8805
Number of gaps after 35299

Genome ASM1330633 has 55025 recombinations
Number of gaps before 11928
Number of gaps after 66445

Genome ASM1330643 has 41338 recombinations
Number of gaps before 10209
Number of gaps after 51323

Genome ASM1330693 has 55025 recombinations
Number of gaps before 12240
Number of gaps after 66549

Genome ASM1337573 has 8026 recombinations
Number of gaps before 9615
Number of gaps after 17569



# Convert to Phylip

In [857]:
SeqIO.convert(fa_msa_out, "fasta", ph_msa_out, "phylip")

6

# Expand and export the importation_status table

In [890]:
# read Accession Database

with open(accDB, 'rb') as handle:
    acc = pickle.load(handle)
acc.keys()

dict_keys(['ASM188753v', 'ASM1330623', 'ASM1330633', 'ASM1330643', 'ASM1330693', 'ASM1337573'])

In [891]:
# read Protein Database

prot_db = pd.read_pickle(protDB)

rename_g = {}
for col in prot_db.columns:
    rename_g[col] = col.split('.1_')[1][:10]
    
prot_db = prot_db.rename(columns=rename_g)
prot_db

Unnamed: 0,ASM1330693,ASM1330623,ASM1330643,ASM188753v,ASM1330633,ASM1337573
WP_071507848.1,"{'chr': 'NZ_CP052126.1', 'location': (0, 1, 2,...",,,,"{'chr': 'NZ_CP052086.1', 'location': (0, 1, 2,...",
WP_013210791.1,"{'chr': 'NZ_CP052126.1', 'location': (1846, 18...",,,,"{'chr': 'NZ_CP052086.1', 'location': (1846, 18...",
WP_173941386.1,"{'chr': 'NZ_CP052126.1', 'location': (3098, 30...",,,,"{'chr': 'NZ_CP052086.1', 'location': (3098, 30...",
WP_173941387.1,"{'chr': 'NZ_CP052126.1', 'location': (6305, 63...",,,,"{'chr': 'NZ_CP052086.1', 'location': (6305, 63...",
WP_086005422.1,"{'chr': 'NZ_CP052127.1', 'location': (666300, ...",,"{'chr': 'NZ_CP052097.1', 'location': (693036, ...","{'chr': 'NZ_CP016555.1', 'location': (1164461,...","{'chr': 'NZ_CP052087.1', 'location': (666300, ...","{'chr': 'NZ_CP056086.1', 'location': (1889533,..."
...,...,...,...,...,...,...
WP_016726112.1,,,,,,"{'chr': 'NZ_CP056086.1', 'location': (1945036,..."
WP_028861767.1,,,,,,"{'chr': 'NZ_CP056086.1', 'location': (1973916,..."
WP_023470131.1,,,,,,"{'chr': 'NZ_CP056086.1', 'location': (1980032,..."
WP_043885778.1,,,,,,"{'chr': 'NZ_CP056086.1', 'location': (1985998,..."


for k in acc.keys():
    g2ranges = imp_stat.loc[imp_stat['Node'] == k]
    
    g_beg = g2ranges["FA Beg"].values.copy()
    g_end = g2ranges["FA End"].values.copy()
    
    print(k)
    for i in range(len(g_beg)):
        print(set(acc[k][g_beg[i]:g_end[i]]))

In [892]:
# assign improtations to proteins

g_beg = imp_stat["FA Beg"].values.copy()
g_end = imp_stat["FA End"].values.copy()
g_node = imp_stat["Node"].values.copy()

import_acc = []
import_products = []
for i in range(len(g_beg)):
    import_acc.append(set(acc[g_node[i]][g_beg[i]:g_end[i]]))
    
    products = []
    for a in import_acc[-1]:
        products.append(prot_db.loc[a, g_node[i]]['product'])
    import_products.append(products)
    
imp_stat['Prot-ACC'] = import_acc
imp_stat['Product'] = import_products

In [894]:
imp_stat.to_csv(updated_recombis, index=False)
imp_stat

Unnamed: 0,Node,Beg,End,FA Beg,FA End,debug_len_must,debug_len_is,Prot-ACC,Product
1599,ASM1330633,102393,102447,33392,33446,54,54,{WP_058907956.1},[[Asp-tRNA(Asn)/Glu-tRNA(Gln) amidotransferase...
1748,ASM1330693,102393,102447,33392,33446,54,54,{WP_058907956.1},[[Asp-tRNA(Asn)/Glu-tRNA(Gln) amidotransferase...
0,ASM188753v,124124,124179,43123,43178,55,55,{WP_011000042.1},[[16S rRNA (cytosine(967)-C(5))-methyltransfer...
1188,ASM1330633,126319,126348,44318,44347,29,29,{WP_058907963.1},[[response regulator]]
1325,ASM1330693,126319,126348,44318,44347,29,29,{WP_058907963.1},[[response regulator]]
...,...,...,...,...,...,...,...,...,...
1324,ASM1330633,7655224,7655293,3817223,3817292,69,69,{WP_173941733.1},[[BapA prefix-like domain-containing protein]]
1461,ASM1330693,7655224,7655293,3817223,3817292,69,69,{WP_173952290.1},[[BapA prefix-like domain-containing protein]]
1598,ASM1330643,7655224,7655293,3817223,3817292,69,69,{WP_119889930.1},[[BapA prefix-like domain-containing protein]]
122,ASM188753v,7657952,7658087,3818951,3819086,135,135,{WP_155773186.1},[[filamentous hemagglutinin N-terminal domain-...


# TODO {'WP_071623447.1'} seq length is less than improtation length?
# TODO enrichment with probabilities 
e.g. numOf(T3 effectors with recombinations)/numOf(all T3 effectors in the genome)

In [896]:
#prot_db.loc["WP_071507848.1","ASM1330693"]