In [18]:
import pandas as pd
import os
import re
from Bio import SeqIO

os.chdir('/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_E104_v1/blastp_on_p')

os.listdir()
hit_df =''
files = os.listdir()
outfmt6 = [x for x in files if x.split(".")[-1] == 'alloutfmt6' and 'anno' in x]
fa_files = [x for x in files if ('protein' in x) and (x.split(".")[-1] == 'fa')]

outfmt6

#The next block should pull in both the initial protein and the blast df.
#The initial protein should become a dataframe that contains proteins sequence name and length.
#This df should be merged with the blast df in a way that proteins without hit should get NA values. 
#Once this is done make two arrays with [p, h], sort this and compare, pull out everything that is identical, and lable it with a new column reverse blast Yes/No.
#Pull out YES and see if they are enriched/depelted in something. NOs need to be checked for high coverage in ph vs h/p mapping and levels of heterozycosity + h on p mapping mappings. 

#read in protein ids for p and h contigs and store names in a list in a dict with unique key id [first part of
#file name].
fa_protein_dict = {}
fa_protein_length_dict = {}
for file in fa_files:
    seq_list = []
    length_list =[]
    for seq in SeqIO.parse(open(file), 'fasta'):
        seq_list.append(seq.id)
        length_list.append(len(seq.seq))
    key_name = file.split('.')[0]
    fa_protein_dict[key_name] = seq_list
    fa_protein_length_dict[key_name] = dict(zip(seq_list, length_list))

#generate df dict of blast output and filter blast output
header = ['Query', 'Target', 'PctID', 'AlnLgth', 'NumMis', 'NumGap', 'StartQuery', 'StopQuery', 'StartTarget',\
              'StopTarget', 'e-value','BitScore']
outfmt6_dict ={}
#match_dict = {} #get best hits in match_dict[p_protein] = h_protein
hit_df = pd.DataFrame(columns=['p_protein', 'h_protein'])
for outfile in outfmt6:
    key_name =  outfile.split('.')[0]
    df = ''
    df = pd.read_csv(outfile, header = None, names = header, sep='\t')
    #add the query length using to the df using the length dict generated before
    df["QLgth"] = df["Query"].apply(lambda x: fa_protein_length_dict[key_name][x]) 
    df["QCov"] = df['AlnLgth']/df['QLgth']*100 #calculate the % coverage for each querry
    df = df[(df['QCov'] > 30) & (df['PctID'] > 50) ] #define paralogous as Query coverage > 30% and PctID > 50
    #this could be more dynamic and the outfmt of blast AlnLngthPct and they greater than 60%
    groups = df.groupby(by='Query')
    #now filter the dataframe by the smallest e-value for each group == Query
    df_filtered = groups.apply(lambda g: g[g['e-value'] == g['e-value'].min()]) 
    df_filtered = df_filtered.reset_index(drop=True)
    #in case there is a blast query that hits the same subject twice with the same minimal e-value
    df_filtered = df_filtered.drop_duplicates(subset=['Query', 'Target'], keep ='last')
    outfmt6_dict[key_name] = df_filtered
    if 'p_ctg' in key_name:
        df_filtered['h_protein'] = df_filtered['Target']
        df_filtered['p_protein'] = df_filtered['Query']
    if 'h_ctg' in key_name:
        df_filtered['h_protein'] = df_filtered['Query']
        df_filtered['p_protein'] = df_filtered['Target']
    hit_df = pd.concat([hit_df, df_filtered.loc[:, ['p_protein', 'h_protein']]])

#duplicates are besties as they are entered twice from both outfmt

bestie_df = hit_df[hit_df.duplicated(keep='first')]

bestie_df.to_csv(list(outfmt6_dict.keys())[0][:-6] + '.besties.txt', sep='\t', header=None, index=None)
bestie_df['p_protein'].to_csv(list(outfmt6_dict.keys())[0] + '.besties.txt', sep='\t', header=None, index=None)
bestie_df['h_protein'].to_csv(list(outfmt6_dict.keys())[1] + '.besties.txt', sep='\t', header=None, index=None)

no_hits ={}
for key in fa_protein_dict.keys():
    if 'p_' in key:
        no_hits[key] = set(fa_protein_dict[key]) - set(bestie_df['p_protein'])
        pd.DataFrame(list(no_hits[key])).to_csv(key + '.p_proteins.no_besties.txt', sep='\t', header=None, index=None)
    if 'h_' in key:
        no_hits[key] = set(fa_protein_dict[key]) - set(bestie_df['h_protein'])
        pd.DataFrame(list(no_hits[key])).to_csv(key + '.h_proteins.no_besties.txt', sep='\t', header=None, index=None)

_len_out = 0
_len_pro = 0
for x in fa_protein_dict.keys():
    _len_pro += len(fa_protein_dict[x])
    _len_out += len(no_hits[x])
_len_out += (len(bestie_df))*2 - bestie_df.duplicated(subset="p_protein", keep='last').sum() \
- bestie_df.duplicated(subset="h_protein", keep='last').sum()
_len_out == _len_pro


True

In [17]:
_len_out = 0
_len_pro = 0
for x in fa_protein_dict.keys():
    _len_pro += len(fa_protein_dict[x])
    _len_out += len(no_hits[x])
    print("Out of %i %i have no reciprocal blast hit for %s making it %.2f percent."%(len(fa_protein_dict[x]),len(no_hits[x]),x, (len(no_hits[x])/len(fa_protein_dict[x])*100) ))
_len_out += (len(bestie_df))*2 - bestie_df.duplicated(subset="p_protein", keep='last').sum() \
- bestie_df.duplicated(subset="h_protein", keep='last').sum()
_len_out == _len_pro

Out of 15949 4809 have no reciprocal blast hit for Pst_E104_v1_p_ctg making it 30.15 percent.
Out of 14321 2974 have no reciprocal blast hit for Pst_E104_v1_h_ctg making it 20.77 percent.


True

In [None]:
#now pull gene sequences for no-besties and do blast on the corresponding other haplotype

In [19]:
def blast_outfmt6_to_bed(x):
    blast_fo = open(x, 'r')
    blast_lines = blast_fo.readlines()
    bed_file_name = x + '.bed'
    bed_fo = open(bed_file_name, 'w+')
    for l in blast_lines:
        content = l.split('\t')
        if int(content[8]) - int(content[9]) < 1:
            print(content[1], int(content[8]) -1, content[9], content[0], content[10], "+", sep="\t", file=bed_fo) 
        else:
            print(content[1], int(content[9]) -1, content[8],  content[0], content[10], "-", sep = "\t", file=bed_fo)
    blast_fo.close()
    bed_fo.close()

In [20]:
from Bio import SeqIO

folder_p = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_E104_v1/blastp_on_p/'

no_besties = [x for x in os.listdir(folder_p) if 'no_besties.txt' in x]
gene_files = [x for x in os.listdir(folder_p) if '.gene.' in x and '.fa' in x]
no_besties.sort()
gene_files.sort()

#simply pulls in the gene sequences of missing besties
for no_b, gene_file in zip(no_besties, gene_files):
    no_bestie_list = pd.read_csv(folder_p+no_b, header=None, sep='\t')[0].tolist()
    no_bestie_list = [x.replace('evm.model', 'evm.TU') for x in no_bestie_list]
    no_bestie_seq = []
    for seq in SeqIO.parse(open(folder_p + gene_file), 'fasta'):
        if seq.id in no_bestie_list:
            no_bestie_seq.append(seq)
    out_f = folder_p + no_b[:-3].replace('protein', 'gene') + 'fa'
    f_handle = open(out_f,'w') #need to generate handle for writing and
    SeqIO.write(no_bestie_seq, f_handle, 'fasta')
    f_handle.close() #closing file afterwards again

gene_files_no_besties = [x for x in os.listdir(folder_p) if 'gene' in x and '.fa' in x and 'no_besties.fa' in x and 'outfmt6' not in x]
blast_db_nt = [x for x in os.listdir(folder_p) if '_ctg.fa' in x and x[-3:] == '.fa']
gene_files_no_besties.sort()
blast_db_nt.sort()

os.chdir(folder_p)

!blastn -db {blast_db_nt[1]} -query {gene_files_no_besties[0]}  -outfmt 6 -evalue 1e-10 -num_threads 1 \
> {gene_files_no_besties[0]}.outfmt6


in_file =gene_files_no_besties[0]+'.outfmt6'
blast_outfmt6_to_bed(in_file)

!blastn -db {blast_db_nt[0]} -query {gene_files_no_besties[1]}  -outfmt 6 -evalue 1e-10 -num_threads 1 \
> {gene_files_no_besties[1]}.outfmt6


in_file =gene_files_no_besties[1]+'.outfmt6'
blast_outfmt6_to_bed(in_file)

In [21]:
no_besties_blast_nt_bed = [x for x in os.listdir(folder_p) if x.endswith('no_besties.fa.outfmt6.bed') ]
no_besties_blast_nt_bed.sort()
outfmt6.sort()

In [22]:
outfmt6

['Pst_E104_v1_h_ctg.anno.RepaseTPSI_filtered.protein.h_on_p.blast.alloutfmt6',
 'Pst_E104_v1_p_ctg.anno.RepaseTPSI_filtered.protein.p_on_h.blast.alloutfmt6']

In [23]:
no_besties_blast_nt_bed

['Pst_E104_v1_h_ctg.h_genes.no_besties.fa.outfmt6.bed',
 'Pst_E104_v1_p_ctg.p_genes.no_besties.fa.outfmt6.bed']

In [30]:
for no_bbb, protein_blast in zip(no_besties_blast_nt_bed,outfmt6):
    no_bbb_df_header = ['Contig', 'start', 'end', 'blast_hit', 'e-value', 'strand']
    no_bbb_df = pd.read_csv(folder_p+no_bbb, header=None, names=no_bbb_df_header,  sep='\t')
    protein_blast_df = pd.read_csv(folder_p+protein_blast, header=None, sep='\t')
    no_bbb_no_protein_blast_df = no_bbb_df[~no_bbb_df['blast_hit'].isin(protein_blast_df[0])]
    groups = no_bbb_no_protein_blast_df.groupby(by='blast_hit')
    #now filter the dataframe by the smallest e-value for each group == blast_hit
    df_filtered = groups.apply(lambda g: g[g['e-value'] == g['e-value'].min()])
    df_filtered = df_filtered.reset_index(drop=True)
    df_filtered.to_csv(folder_p+no_bbb[:-4]+'.filteredbesthits.bed', sep='\t', header=None, index=None)

Unnamed: 0,Contig,start,end,blast_hit,e-value,strand
0,hcontig_000_031,17117,17600,evm.TU.pcontig_000.10,0.0,-
1,hcontig_000_003,43542,44025,evm.TU.pcontig_000.10,0.0,-
2,hcontig_000_003,489890,490460,evm.TU.pcontig_000.113,0.0,+
3,hcontig_000_003,490669,491042,evm.TU.pcontig_000.114,5e-173,+
4,hcontig_017_013,53223,53603,evm.TU.pcontig_000.118,3e-161,-


In [31]:
from pybedtools import BedTool

In [33]:
#maybe better to do this feature 
no_besties_h_p_blast_hits = BedTool('Pst_E104_v1_h_ctg.h_genes.no_besties.fa.outfmt6.filteredbesthits.bed')

In [34]:
no_besties_h_p_blast_hits.head()

pcontig_000	38121	38929	evm.TU.hcontig_000_003.11	0.0	-
 pcontig_006	564497	566065	evm.TU.hcontig_000_003.125	0.0	+
 pcontig_046	26236	27067	evm.TU.hcontig_000_003.137	0.0	+
 pcontig_000	1849105	1849938	evm.TU.hcontig_000_003.137	0.0	-
 pcontig_095	182659	183492	evm.TU.hcontig_000_003.137	0.0	-
 pcontig_115	105346	106177	evm.TU.hcontig_000_003.137	0.0	+
 pcontig_021	331101	331847	evm.TU.hcontig_000_003.137	0.0	+
 pcontig_021	78429	78929	evm.TU.hcontig_000_003.137	0.0	+
 pcontig_000	498148	498837	evm.TU.hcontig_000_003.138	0.0	+
 pcontig_000	493698	494516	evm.TU.hcontig_000_003.139	0.0	+
 

In [37]:
p_genes = BedTool('Pst_E104_v1_p_ctg.gene.RepaseTPSI_filtered.gff3')

In [38]:
p_genes.head()

pcontig_241	EVM	evm.TU.pcontig_241.2	13967	14266	.	+	.	ID=evm.TU.pcontig_241.2;Name=EVM%20prediction%2pcontig_0241.2
 pcontig_241	EVM	evm.TU.pcontig_241.1	4229	4450	.	-	.	ID=evm.TU.pcontig_241.1;Name=EVM%20prediction%2pcontig_0241.1
 pcontig_193	EVM	evm.TU.pcontig_193.6	5386	5862	.	+	.	ID=evm.TU.pcontig_193.6;Name=EVM%20prediction%2pcontig_0193.6
 pcontig_225	EVM	evm.TU.pcontig_225.7	22308	22664	.	+	.	ID=evm.TU.pcontig_225.7;Name=EVM%20prediction%2pcontig_0225.7
 pcontig_225	EVM	evm.TU.pcontig_225.6	18672	21163	.	-	.	ID=evm.TU.pcontig_225.6;Name=EVM%20prediction%2pcontig_0225.6
 pcontig_225	EVM	evm.TU.pcontig_225.1	2067	2309	.	+	.	ID=evm.TU.pcontig_225.1;Name=EVM%20prediction%2pcontig_0225.1
 pcontig_225	EVM	evm.TU.pcontig_225.3	3999	5386	.	+	.	ID=evm.TU.pcontig_225.3;Name=EVM%20prediction%2pcontig_0225.3
 pcontig_225	EVM	evm.TU.pcontig_225.5	17272	18147	.	+	.	ID=evm.TU.pcontig_225.5;Name=EVM%20prediction%2pcontig_0225.5
 pcontig_225	EVM	evm.TU.pcontig_225.2	2658	2883	.	+	.	ID=evm.TU.p

In [88]:
no_besties_h_p_gene_intersect = no_besties_h_p_blast_hits.intersect(p_genes, wb=True, f=1)

In [89]:
no_besties_h_p_gene_intersect.saveas('Test.bed')

<BedTool(Test.bed)>

In [90]:
test_df = pd.read_csv('Test.bed', header=None, sep='\t')

In [91]:
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,pcontig_046,26236,27067,evm.TU.hcontig_000_003.137,0.0,+,pcontig_046,EVM,evm.TU.pcontig_046.8,26162,27116,.,+,.,ID=evm.TU.pcontig_046.8;Name=EVM%20prediction%...
1,pcontig_000,1849105,1849938,evm.TU.hcontig_000_003.137,0.0,-,pcontig_000,EVM,evm.TU.pcontig_000.414,1849068,1850091,.,-,.,ID=evm.TU.pcontig_000.414;Name=EVM%20predictio...
2,pcontig_095,182659,183492,evm.TU.hcontig_000_003.137,0.0,-,pcontig_095,EVM,evm.TU.pcontig_095.46,182622,183765,.,-,.,ID=evm.TU.pcontig_095.46;Name=EVM%20prediction...
3,pcontig_115,105346,106177,evm.TU.hcontig_000_003.137,0.0,+,pcontig_115,EVM,evm.TU.pcontig_115.23,105344,106177,.,+,.,ID=evm.TU.pcontig_115.23;Name=EVM%20prediction...
4,pcontig_021,78429,78929,evm.TU.hcontig_000_003.137,0.0,+,pcontig_021,EVM,evm.TU.pcontig_021.23,78157,79006,.,+,.,ID=evm.TU.pcontig_021.23;Name=EVM%20prediction...


In [92]:
test_df["LenBlast"] = abs(test_df[1] - test_df[2])
test_df["Lenfeature"] = abs(test_df[9]-test_df[10])

In [97]:
test_df['Diff'] = abs(test_df['LenBlast'] -test_df['Lenfeature'])

In [102]:
test_df[(test_df['Diff'] == 0)]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,LenBlast,Lenfeature,Diff
176,pcontig_001,441768,443358,evm.TU.hcontig_001_123.9,0.0,+,pcontig_001,EVM,evm.TU.pcontig_001.110,441769,443359,.,+,.,ID=evm.TU.pcontig_001.110;Name=EVM%20predictio...,1590,1590,0
212,pcontig_149,52646,53547,evm.TU.hcontig_002_011.98,0.0,+,pcontig_149,EVM,evm.TU.pcontig_149.15,52647,53548,.,+,.,ID=evm.TU.pcontig_149.15;Name=EVM%20prediction...,901,901,0
213,pcontig_173,34166,35067,evm.TU.hcontig_002_011.98,0.0,+,pcontig_173,EVM,evm.TU.pcontig_173.12,34167,35068,.,+,.,ID=evm.TU.pcontig_173.12;Name=EVM%20prediction...,901,901,0
214,pcontig_200,18666,19567,evm.TU.hcontig_002_011.98,0.0,+,pcontig_200,EVM,evm.TU.pcontig_200.5,18667,19568,.,+,.,ID=evm.TU.pcontig_200.5;Name=EVM%20prediction%...,901,901,0
215,pcontig_067,362890,363791,evm.TU.hcontig_002_011.98,0.0,+,pcontig_067,EVM,evm.TU.pcontig_067.87,362891,363792,.,+,.,ID=evm.TU.pcontig_067.87;Name=EVM%20prediction...,901,901,0
498,pcontig_033,364554,365450,evm.TU.hcontig_005_043.145,0.0,+,pcontig_033,EVM,evm.TU.pcontig_033.83,364555,365451,.,+,.,ID=evm.TU.pcontig_033.83;Name=EVM%20prediction...,896,896,0
514,pcontig_033,364554,365450,evm.TU.hcontig_005_043.154,0.0,+,pcontig_033,EVM,evm.TU.pcontig_033.83,364555,365451,.,+,.,ID=evm.TU.pcontig_033.83;Name=EVM%20prediction...,896,896,0
925,pcontig_017,326126,327512,evm.TU.hcontig_014_004.47,0.0,+,pcontig_017,EVM,evm.TU.pcontig_017.65,326127,327513,.,+,.,ID=evm.TU.pcontig_017.65;Name=EVM%20prediction...,1386,1386,0
926,pcontig_017,118393,119768,evm.TU.hcontig_014_004.47,0.0,+,pcontig_017,EVM,evm.TU.pcontig_017.31,118394,119769,.,+,.,ID=evm.TU.pcontig_017.31;Name=EVM%20prediction...,1375,1375,0
927,pcontig_033,1118135,1119528,evm.TU.hcontig_014_004.47,0.0,+,pcontig_033,EVM,evm.TU.pcontig_033.257,1118136,1119529,.,+,.,ID=evm.TU.pcontig_033.257;Name=EVM%20predictio...,1393,1393,0


In [95]:
test_df.groupby(by=3).apply(lambda g: g[g['e-value'] == g['e-value'].min()]) 

KeyError: 'e-value'

In [72]:
len(no_besties_h_p_gene_intersect)

3079

In [62]:
len(no_besties_h_p_blast_hits)

16797

In [52]:
no_besties_h_p_gene_intersect[0]

Interval(pcontig_000:38121-38929)