In [4]:
import pandas as pd
import os
import re
from Bio import SeqIO

os.chdir('/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_E104_v1/blastp_on_p')

os.listdir()
hit_df =''
files = os.listdir()
outfmt6 = [x for x in files if x.endswith('blast.alloutfmt6') and 'anno' in x]
print(outfmt6)
fa_files = [x for x in files if x.endswith('RepaseTPSI_filtered.protein.fa')]
print(fa_files)

#The next block should pull in both the initial protein and the blast df.
#The initial protein should become a dataframe that contains proteins sequence name and length.
#This df should be merged with the blast df in a way that proteins without hit should get NA values. 
#Once this is done make two arrays with [p, h], sort this and compare, pull out everything that is identical, and lable it with a new column reverse blast Yes/No.
#Pull out YES and see if they are enriched/depelted in something. NOs need to be checked for high coverage in ph vs h/p mapping and levels of heterozycosity + h on p mapping mappings. 

#read in protein ids for p and h contigs and store names in a list in a dict with unique key id [first part of
#file name].
fa_protein_dict = {}
fa_protein_length_dict = {}
for file in fa_files:
    seq_list = []
    length_list =[]
    for seq in SeqIO.parse(open(file), 'fasta'):
        seq_list.append(seq.id)
        length_list.append(len(seq.seq))
    key_name = file.split('.')[0]
    fa_protein_dict[key_name] = seq_list
    fa_protein_length_dict[key_name] = dict(zip(seq_list, length_list))

#generate df dict of blast output and filter blast output
header = ['Query', 'Target', 'PctID', 'AlnLgth', 'NumMis', 'NumGap', 'StartQuery', 'StopQuery', 'StartTarget',\
              'StopTarget', 'e-value','BitScore']
outfmt6_dict ={} #contains the filtered values
outfmt6_dict_all = {} #contains the unfiltered blast hits e.g. low % identity and low query coverage
#match_dict = {} #get best hits in match_dict[p_protein] = h_protein
hit_df = pd.DataFrame(columns=['p_protein', 'h_protein'])
for outfile in outfmt6:
    key_name =  outfile.split('.')[0]
    df = ''
    df = pd.read_csv(outfile, header = None, names = header, sep='\t')
    #add the query length using to the df using the length dict generated before
    df["QLgth"] = df["Query"].apply(lambda x: fa_protein_length_dict[key_name][x]) 
    df["QCov"] = df['AlnLgth']/df['QLgth']*100 #calculate the % coverage for each querry
    outfmt6_dict_all[key_name] = df
    df = df[(df['QCov'] > 30) & (df['PctID'] > 50) ] #define paralogous as Query coverage > 30% and PctID > 50
    #this could be more dynamic and the outfmt of blast AlnLngthPct and they greater than 60%
    groups = df.groupby(by='Query')
    #now filter the dataframe by the smallest e-value for each group == Query
    df_filtered = groups.apply(lambda g: g[g['e-value'] == g['e-value'].min()]) 
    df_filtered = df_filtered.reset_index(drop=True)
    #in case there is a blast query that hits the same subject twice with the same minimal e-value
    df_filtered = df_filtered.drop_duplicates(subset=['Query', 'Target'], keep ='last')
    outfmt6_dict[key_name] = df_filtered
    if 'p_ctg' in key_name:
        df_filtered['h_protein'] = df_filtered['Target']
        df_filtered['p_protein'] = df_filtered['Query']
    if 'h_ctg' in key_name:
        df_filtered['h_protein'] = df_filtered['Query']
        df_filtered['p_protein'] = df_filtered['Target']
    hit_df = pd.concat([hit_df, df_filtered.loc[:, ['p_protein', 'h_protein']]])

#duplicates are besties as they are entered twice from both outfmt

bestie_df = hit_df[hit_df.duplicated(keep='first')]

bestie_df.to_csv(list(outfmt6_dict.keys())[0][:-6] + '.besties.txt', sep='\t', header=None, index=None)
bestie_df['p_protein'].to_csv(list(outfmt6_dict.keys())[0] + '.besties.txt', sep='\t', header=None, index=None)
bestie_df['h_protein'].to_csv(list(outfmt6_dict.keys())[1] + '.besties.txt', sep='\t', header=None, index=None)


#this is pulling out the no blast hits at all. Should be a subset of no_besties
no_hits ={}
for key in fa_protein_dict.keys():
    if 'p_' in key:
        no_hits[key] = set(fa_protein_dict[key]) - set(outfmt6_dict_all[key]['Query'].unique())
        pd.DataFrame(list(no_hits[key])).to_csv(key + '.p_proteins.no_blast_hit.txt', sep='\t', header=None, index=None)
    if 'h_' in key:
        no_hits[key] = set(fa_protein_dict[key]) - set(outfmt6_dict_all[key]['Query'].unique())
        pd.DataFrame(list(no_hits[key])).to_csv(key + '.h_proteins.no_blast_hit.txt', sep='\t', header=None, index=None)

no_hits_filtered ={}
for key in fa_protein_dict.keys():
    if 'p_' in key:
        no_hits_filtered[key] = set(fa_protein_dict[key]) - set(outfmt6_dict[key]['Query'].unique())
        pd.DataFrame(list(no_hits_filtered[key])).to_csv(key + '.p_proteins.no_filtered_blast_hit.txt', sep='\t', header=None, index=None)
    if 'h_' in key:
        no_hits_filtered[key] = set(fa_protein_dict[key]) - set(outfmt6_dict[key]['Query'].unique())
        pd.DataFrame(list(no_hits_filtered[key])).to_csv(key + '.h_proteins.no_filtered_blast_hit.txt', sep='\t', header=None, index=None)
        

#this is now pulling out the besties
no_bestie ={}
for key in fa_protein_dict.keys():
    if 'p_' in key:
        no_bestie[key] = set(fa_protein_dict[key]) - set(bestie_df['p_protein'])
        pd.DataFrame(list(no_bestie[key])).to_csv(key + '.p_proteins.no_besties.txt', sep='\t', header=None, index=None)
    if 'h_' in key:
        no_bestie[key] = set(fa_protein_dict[key]) - set(bestie_df['h_protein'])
        pd.DataFrame(list(no_bestie[key])).to_csv(key + '.h_proteins.no_besties.txt', sep='\t', header=None, index=None)        
        
_len_out = 0
_len_pro = 0
for x in fa_protein_dict.keys():
    _len_pro += len(fa_protein_dict[x])
    _len_out += len(no_bestie[x])
_len_out += (len(bestie_df))*2 - bestie_df.duplicated(subset="p_protein", keep='last').sum() \
- bestie_df.duplicated(subset="h_protein", keep='last').sum()
_len_out == _len_pro


['Pst_E104_v1_p_ctg.anno.RepaseTPSI_filtered.protein.p_on_h.blast.alloutfmt6', 'Pst_E104_v1_h_ctg.anno.RepaseTPSI_filtered.protein.h_on_p.blast.alloutfmt6']
['Pst_E104_v1_p_ctg.anno.RepaseTPSI_filtered.protein.fa', 'Pst_E104_v1_h_ctg.anno.RepaseTPSI_filtered.protein.fa']


True

In [9]:
_len_out = 0
_len_pro = 0
for x in fa_protein_dict.keys():
    _len_pro += len(fa_protein_dict[x])
    _len_out += len(no_bestie[x])
    print("Out of %i %i have no reciprocal blast hit for %s making it %.2f percent."%(len(fa_protein_dict[x]),len(no_bestie[x]),x, (len(no_bestie[x])/len(fa_protein_dict[x])*100) ))
    print("Out of %i %i have no blast hit at all for %s making it %.2f percent."%(len(fa_protein_dict[x]),len(no_hits[x]),x, (len(no_hits[x])/len(fa_protein_dict[x])*100) ))
    print("Out of %i %i have no blast hit after filtering by AlnLght and QCov for %s making it %.2f percent."%(len(fa_protein_dict[x]),len(no_hits_filtered[x]),x, (len(no_hits_filtered[x])/len(fa_protein_dict[x])*100) ))
_len_out += (len(bestie_df))*2 - bestie_df.duplicated(subset="p_protein", keep='last').sum() \
- bestie_df.duplicated(subset="h_protein", keep='last').sum()
_len_out == _len_pro

Out of 15949 4809 have no reciprocal blast hit for Pst_E104_v1_p_ctg making it 30.15 percent.
Out of 15949 2072 have no blast hit at all for Pst_E104_v1_p_ctg making it 12.99 percent.
Out of 15949 3029 have no blast hit after filtering by AlnLght and QCov for Pst_E104_v1_p_ctg making it 18.99 percent.
Out of 14321 2974 have no reciprocal blast hit for Pst_E104_v1_h_ctg making it 20.77 percent.
Out of 14321 885 have no blast hit at all for Pst_E104_v1_h_ctg making it 6.18 percent.
Out of 14321 1323 have no blast hit after filtering by AlnLght and QCov for Pst_E104_v1_h_ctg making it 9.24 percent.


True

In [10]:
def blast_outfmt6_to_bed(x):
    blast_fo = open(x, 'r')
    blast_lines = blast_fo.readlines()
    bed_file_name = x + '.bed'
    bed_fo = open(bed_file_name, 'w+')
    for l in blast_lines:
        content = l.split('\t')
        if int(content[8]) - int(content[9]) < 1:
            print(content[1], int(content[8]) -1, content[9], content[0], content[10], "+", sep="\t", file=bed_fo) 
        else:
            print(content[1], int(content[9]) -1, content[8],  content[0], content[10], "-", sep = "\t", file=bed_fo)
    blast_fo.close()
    bed_fo.close()

In [11]:
from Bio import SeqIO
import os
import pandas as pd
import re

folder_p = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_E104_v1/blastp_on_p/'

no_besties = [x for x in os.listdir(folder_p) if x.endswith('no_besties.txt')]
gene_files = [x for x in os.listdir(folder_p) if '.gene.' in x and '.fa' in x]
no_besties.sort()
gene_files.sort()
no_bestie_dict = {}

#simply pulls in the gene sequences of missing besties
for no_b, gene_file in zip(no_besties, gene_files):
    no_bestie_list = pd.read_csv(folder_p+no_b, header=None, sep='\t')[0].tolist()
    key = no_b.split('.')[0]
    no_bestie_dict[key] = no_bestie_list
    no_bestie_list = [x.replace('evm.model', 'evm.TU') for x in no_bestie_list]
    no_bestie_seq = []
    for seq in SeqIO.parse(open(folder_p + gene_file), 'fasta'):
        if seq.id in no_bestie_list:
            no_bestie_seq.append(seq)
    out_f = folder_p + no_b[:-3].replace('protein', 'gene') + 'fa'
    f_handle = open(out_f,'w') #need to generate handle for writing and
    SeqIO.write(no_bestie_seq, f_handle, 'fasta')
    f_handle.close() #closing file afterwards again

gene_files_no_besties = [x for x in os.listdir(folder_p) if x.endswith('_genes.no_besties.fa')]
blast_db_nt = [x for x in os.listdir(folder_p) if x.endswith('_ctg.fa')]
gene_files_no_besties.sort()
blast_db_nt.sort()

os.chdir(folder_p)

print('blastn -db %s -query -%s > %s.outfmt6' %(blast_db_nt[1], gene_files_no_besties[0],gene_files_no_besties[0]))
!blastn -db {blast_db_nt[1]} -query {gene_files_no_besties[0]}  -outfmt 6 -evalue 1e-10 -num_threads 1 \
> {gene_files_no_besties[0]}.outfmt6


in_file =gene_files_no_besties[0]+'.outfmt6'
blast_outfmt6_to_bed(in_file)

print('blastn -db %s -query -%s > %s.outfmt6' %(blast_db_nt[0], gene_files_no_besties[1],gene_files_no_besties[1]))
!blastn -db {blast_db_nt[0]} -query {gene_files_no_besties[1]}  -outfmt 6 -evalue 1e-10 -num_threads 1 \
> {gene_files_no_besties[1]}.outfmt6


in_file =gene_files_no_besties[1]+'.outfmt6'
blast_outfmt6_to_bed(in_file)

blastn -db Pst_E104_v1_p_ctg.fa -query -Pst_E104_v1_h_ctg.h_genes.no_besties.fa > Pst_E104_v1_h_ctg.h_genes.no_besties.fa.outfmt6
blastn -db Pst_E104_v1_h_ctg.fa -query -Pst_E104_v1_p_ctg.p_genes.no_besties.fa > Pst_E104_v1_p_ctg.p_genes.no_besties.fa.outfmt6


In [12]:
no_besties_blast_nt_bed = [x for x in os.listdir(folder_p) if x.endswith('no_besties.fa.outfmt6.bed') ]
no_besties_blast_nt_bed.sort()
outfmt6.sort()

In [13]:
outfmt6

['Pst_E104_v1_h_ctg.anno.RepaseTPSI_filtered.protein.h_on_p.blast.alloutfmt6',
 'Pst_E104_v1_p_ctg.anno.RepaseTPSI_filtered.protein.p_on_h.blast.alloutfmt6']

In [14]:
no_besties_blast_nt_bed

['Pst_E104_v1_h_ctg.h_genes.no_besties.fa.outfmt6.bed',
 'Pst_E104_v1_p_ctg.p_genes.no_besties.fa.outfmt6.bed']

In [15]:
#this needs to include some folder tracking of gene.no_besties.fa that hits nothing significant 
#no_bbb in - no_bbb out = no_hits at all
no_gene_hits = {}
for no_bbb, protein_blast in zip(no_besties_blast_nt_bed,outfmt6):
    no_bbb_no_protein_blast_df =''
    no_bbb_df_header = ['Contig', 'start', 'end', 'blast_query', 'e-value', 'strand']
    no_bbb_df = pd.read_csv(folder_p+no_bbb, header=None, names=no_bbb_df_header,  sep='\t')
    protein_blast_df = pd.read_csv(folder_p+protein_blast, header=None, sep='\t')
    no_bbb_df['protein_id'] = no_bbb_df['blast_query'].str.replace('evm.TU', 'evm.model')
    #this below is most likely correct ignores the fact that some no_bbb genes might have hit nothing
    #at all on the gene level
    no_bbb_no_protein_blast_df = no_bbb_df[~no_bbb_df['protein_id'].isin(protein_blast_df[0])]
    #these are the no_besties that didn't hit anything at the gene level
    key =''
    key = no_bbb.split('.')[0]
    no_gene_hits[key] = set(no_bestie_dict[key]) - set(no_bbb_df['protein_id'].unique())
    pd.DataFrame(list(no_gene_hits[key])).to_csv(key + '.gene.no_genome_blast_hit.txt', sep='\t', header=None, index=None)
    blast_p_no_bestie =''
    blast_p_no_bestie = len(no_bbb_df[no_bbb_df['protein_id'].isin(protein_blast_df[0])]['blast_query'].unique())
    print('This %i out of %i no_besties of %s had a blast hit which was not RBH' % \
          (blast_p_no_bestie, len(no_bestie_dict[key]),no_bbb.split('.')[0]))
    print('This %i out of %i no_besties of %s have no blast hit gene vs. other haplome' % \
         (len(no_gene_hits[key]),len(no_bestie_dict[key]),no_bbb.split('.')[0]))
    print("No gene hits that have a protein hit", len(set(no_gene_hits[key])- set(no_hits[key])), key)
    groups = no_bbb_no_protein_blast_df.groupby(by='blast_query')
    #now filter the dataframe by the smallest e-value for each group == blast_hit
    df_filtered = groups.apply(lambda g: g[g['e-value'] == g['e-value'].min()])
    df_filtered = df_filtered.reset_index(drop=True)
    df_filtered.iloc[:,0:6].to_csv(folder_p+no_bbb[:-4]+'.filteredbesthits.bed', sep='\t', header=None, index=None)

This 1993 out of 2974 no_besties of Pst_E104_v1_h_ctg had a blast hit which was not RBH
This 171 out of 2974 no_besties of Pst_E104_v1_h_ctg have no blast hit gene vs. other haplome
No gene hits that have a protein hit 96 Pst_E104_v1_h_ctg
This 2105 out of 4809 no_besties of Pst_E104_v1_p_ctg had a blast hit which was not RBH
This 1708 out of 4809 no_besties of Pst_E104_v1_p_ctg have no blast hit gene vs. other haplome
No gene hits that have a protein hit 632 Pst_E104_v1_p_ctg


In [16]:
#all primary proteins no hit need to be split up into pwh and pwoh
p_contig_list = []
h_contig_list = []
for seq in SeqIO.parse('Pst_E104_v1_h_ctg.fa', 'fasta'):
    h_contig_list.append(seq.id)
for seq in SeqIO.parse('Pst_E104_v1_p_ctg.fa', 'fasta'):
    p_contig_list.append(seq.id)

In [17]:
pwh_set = set([x[0:11].replace('h','p') for x in h_contig_list])
pwoh_set = set(p_contig_list) - pwh_set
print("P_contigs with h_contig are %i and without %i" % (len(pwh_set), len(pwoh_set)))

P_contigs with h_contig are 102 and without 74


In [18]:
fa_protein_dict['Pst_E104_v1_p_ctg_pwh']= [x for x in fa_protein_dict['Pst_E104_v1_p_ctg'] if x.split('.')[2] in pwh_set]
fa_protein_dict['Pst_E104_v1_p_ctg_pwoh']= [x for x in fa_protein_dict['Pst_E104_v1_p_ctg'] if x.split('.')[2] in pwoh_set]
print(len(fa_protein_dict['Pst_E104_v1_p_ctg_pwh']), len(fa_protein_dict['Pst_E104_v1_p_ctg_pwoh']), len (fa_protein_dict['Pst_E104_v1_p_ctg']))

15306 643 15949


In [19]:
p_txt = [x for x in os.listdir(folder_p) if x.split('.')[0] == 'Pst_E104_v1_p_ctg' and x.endswith('.txt')]

In [21]:
def pwh_filter (x):
    p_contig = x.split('.')[2]
    if p_contig in pwh_set:
        return 1
    else:
        return 0

In [22]:
#filter and summarize the p results based on pwh and pwoh 
for x in p_txt:
    df_p = pd.read_csv(x, header=None, sep='\t')
    df_p.head()
    df_p['pwh'] = df_p[0].apply(pwh_filter)
    df_p[df_p['pwh'] == 1].to_csv(x[:-4]+'pwh.txt', sep ='\t', header=None, index=None)
    df_p[df_p['pwh'] == 0].to_csv(x[:-4]+'pwoh.txt', sep ='\t', header=None, index=None)
    print ('For pwh:')
    print('For this condition %s %i proteins out of %i (%.2f) are affected for pwh'% \
          (x, sum(df_p['pwh']),len(fa_protein_dict['Pst_E104_v1_p_ctg_pwh']), \
           sum(df_p['pwh'])/len(fa_protein_dict['Pst_E104_v1_p_ctg_pwh'])*100 ))
    print ('For pwoh:')
    print('For this condition %s %i proteins out of %i (%i) are affected for pwoh'% \
         (x, len(df_p['pwh']) - sum(df_p['pwh']),len(fa_protein_dict['Pst_E104_v1_p_ctg_pwoh']),\
        (len(df_p['pwh']) - sum(df_p['pwh']))/len(fa_protein_dict['Pst_E104_v1_p_ctg_pwoh'])*100 ))
     

For pwh:
For this condition Pst_E104_v1_p_ctg.p_proteins.no_blast_hitpwh.txt 1823 proteins out of 15306 (11.91) are affected for pwh
For pwoh:
For this condition Pst_E104_v1_p_ctg.p_proteins.no_blast_hitpwh.txt 0 proteins out of 643 (0) are affected for pwoh
For pwh:
For this condition Pst_E104_v1_p_ctg.gene.no_genome_blast_hit.txt 1456 proteins out of 15306 (9.51) are affected for pwh
For pwoh:
For this condition Pst_E104_v1_p_ctg.gene.no_genome_blast_hit.txt 252 proteins out of 643 (39) are affected for pwoh
For pwh:
For this condition Pst_E104_v1_p_ctg.p_proteins.no_blast_hit.txt 1823 proteins out of 15306 (11.91) are affected for pwh
For pwoh:
For this condition Pst_E104_v1_p_ctg.p_proteins.no_blast_hit.txt 249 proteins out of 643 (38) are affected for pwoh
For pwh:
For this condition Pst_E104_v1_p_ctg.besties.txt 14428 proteins out of 15306 (94.26) are affected for pwh
For pwoh:
For this condition Pst_E104_v1_p_ctg.besties.txt 292 proteins out of 643 (45) are affected for pwoh
For

In [34]:
from Bio import SeqIO
import os

os.chdir('/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_E104_v1/blastp_on_p')
len_pwh = 0
len_pwoh = 0
for seq in SeqIO.parse('Pst_E104_v1_p_ctg.fa', 'fasta'):
    if seq.id in pwh_set:
        len_pwh = len_pwh + len(seq.seq)
    if seq.id in pwoh_set:
        len_pwoh = len_pwoh + len(seq.seq)
print("Lenght of pwoh %i, lenght of pwo %i, total length p %i" %(len_pwh,len_pwoh,len_pwh+len_pwoh ))

Lenght of pwoh 79847369, lenght of pwo 4178824, total length p 84026193


In [28]:
len(pwh_set)

102

In [26]:
len(seq.seq)

25036

In [114]:
p_txt

['Pst_E104_v1_p_ctg.gene.no_genome_blast_hit.txt',
 'Pst_E104_v1_p_ctg.p_proteins.no_blast_hit.txt',
 'Pst_E104_v1_p_ctg.besties.txt',
 'Pst_E104_v1_p_ctg.p_proteins.no_filtered_blast_hit.txt',
 'Pst_E104_v1_p_ctg.p_proteins.no_besties.txt']

In [29]:
#maybe keep track of those that have a blast hit but no bestie
#yes save those to file and have a look at those as well e.g. p-values and best hits etc.
len(no_bbb_df[no_bbb_df['protein_id'].isin(protein_blast_df[0])]['blast_query'].unique())

1993

In [50]:
from pybedtools import BedTool

In [51]:
#maybe better to do this feature 
no_besties_h_p_blast_hits = BedTool('Pst_E104_v1_h_ctg.h_genes.no_besties.fa.outfmt6.filteredbesthits.bed')

In [52]:
no_besties_h_p_blast_hits.head()

pcontig_000	38121	38929	evm.TU.hcontig_000_003.11	0.0	-
 pcontig_006	564497	566065	evm.TU.hcontig_000_003.125	0.0	+
 pcontig_000	498148	498837	evm.TU.hcontig_000_003.138	0.0	+
 pcontig_000	493698	494516	evm.TU.hcontig_000_003.139	0.0	+
 pcontig_000	647841	649077	evm.TU.hcontig_000_003.176	0.0	-
 pcontig_008	304179	305415	evm.TU.hcontig_000_003.176	0.0	+
 pcontig_023	757954	758439	evm.TU.hcontig_000_003.198	9.999999999999999e-108	-
 pcontig_000	948664	949090	evm.TU.hcontig_000_003.235	0.0	-
 pcontig_018	829342	829768	evm.TU.hcontig_000_003.235	0.0	-
 pcontig_046	202834	203260	evm.TU.hcontig_000_003.235	0.0	-
 

In [53]:
p_genes = BedTool('Pst_E104_v1_p_ctg.gene.RepaseTPSI_filtered.gff3')

In [54]:
p_genes.head()

pcontig_241	EVM	evm.TU.pcontig_241.2	13967	14266	.	+	.	ID=evm.TU.pcontig_241.2;Name=EVM%20prediction%2pcontig_0241.2
 pcontig_241	EVM	evm.TU.pcontig_241.1	4229	4450	.	-	.	ID=evm.TU.pcontig_241.1;Name=EVM%20prediction%2pcontig_0241.1
 pcontig_193	EVM	evm.TU.pcontig_193.6	5386	5862	.	+	.	ID=evm.TU.pcontig_193.6;Name=EVM%20prediction%2pcontig_0193.6
 pcontig_225	EVM	evm.TU.pcontig_225.7	22308	22664	.	+	.	ID=evm.TU.pcontig_225.7;Name=EVM%20prediction%2pcontig_0225.7
 pcontig_225	EVM	evm.TU.pcontig_225.6	18672	21163	.	-	.	ID=evm.TU.pcontig_225.6;Name=EVM%20prediction%2pcontig_0225.6
 pcontig_225	EVM	evm.TU.pcontig_225.1	2067	2309	.	+	.	ID=evm.TU.pcontig_225.1;Name=EVM%20prediction%2pcontig_0225.1
 pcontig_225	EVM	evm.TU.pcontig_225.3	3999	5386	.	+	.	ID=evm.TU.pcontig_225.3;Name=EVM%20prediction%2pcontig_0225.3
 pcontig_225	EVM	evm.TU.pcontig_225.5	17272	18147	.	+	.	ID=evm.TU.pcontig_225.5;Name=EVM%20prediction%2pcontig_0225.5
 pcontig_225	EVM	evm.TU.pcontig_225.2	2658	2883	.	+	.	ID=evm.TU.p

In [55]:
no_besties_h_p_gene_intersect = no_besties_h_p_blast_hits.intersect(p_genes, wb=True, f=1)

In [56]:
no_besties_h_p_gene_intersect.saveas('Test.bed')

<BedTool(Test.bed)>

In [57]:
test_df = pd.read_csv('Test.bed', header=None, sep='\t')

In [58]:
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,pcontig_000,647841,649077,evm.TU.hcontig_000_003.176,0.0,-,pcontig_000,EVM,evm.TU.pcontig_000.162,647842,649110,.,-,.,ID=evm.TU.pcontig_000.162;Name=EVM%20predictio...
1,pcontig_008,304179,305415,evm.TU.hcontig_000_003.176,0.0,+,pcontig_008,EVM,evm.TU.pcontig_008.42,303889,305415,.,+,.,ID=evm.TU.pcontig_008.42;Name=EVM%20prediction...
2,pcontig_072,306840,307338,evm.TU.hcontig_000_003.305,0.0,-,pcontig_072,EVM,evm.TU.pcontig_072.58,305375,307338,.,-,.,ID=evm.TU.pcontig_072.58;Name=EVM%20prediction...
3,pcontig_000,1800902,1802027,evm.TU.hcontig_000_003.400,0.0,-,pcontig_000,EVM,evm.TU.pcontig_000.404,1799927,1802027,.,-,.,ID=evm.TU.pcontig_000.404;Name=EVM%20predictio...
4,pcontig_000,1782634,1783737,evm.TU.hcontig_000_003.400,0.0,-,pcontig_000,EVM,evm.TU.pcontig_000.400,1781648,1783737,.,-,.,ID=evm.TU.pcontig_000.400;Name=EVM%20predictio...


In [59]:
test_df["LenBlast"] = abs(test_df[1] - test_df[2])
test_df["Lenfeature"] = abs(test_df[9]-test_df[10])

In [60]:
test_df['Diff'] = abs(test_df['LenBlast'] -test_df['Lenfeature'])

In [61]:
test_df[(test_df['Diff'] == 0)]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,LenBlast,Lenfeature,Diff
28,pcontig_001,441768,443358,evm.TU.hcontig_001_123.9,0.0,+,pcontig_001,EVM,evm.TU.pcontig_001.110,441769,443359,.,+,.,ID=evm.TU.pcontig_001.110;Name=EVM%20predictio...,1590,1590,0


In [95]:
test_df.groupby(by=3).apply(lambda g: g[g['e-value'] == g['e-value'].min()]) 

KeyError: 'e-value'

In [72]:
len(no_besties_h_p_gene_intersect)

3079

In [62]:
len(no_besties_h_p_blast_hits)

16797

In [52]:
no_besties_h_p_gene_intersect[0]

Interval(pcontig_000:38121-38929)