This is a parser to get all annotations in a single file for kinfin analysis. Pull in files from the following two folders:

/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/enrichment_analysis/pa_26062017
and make a effector tablist for effectors into the same folder from

/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/enrichment_analysis/lists

The layout of functional annotation file is as follows.

 \#protein_id GO IPR SignalP_EUK Pfam Effector Merops KEGG

with the following characteriztics

protein_id is simple the id used in clustering as well
GO is a list of GO terms as GO:XX;GO:XX
IPR is the ; sperated list of IPRterm:count for now this count will be one in all cases
The same is true for all other categories as well.

In [1]:
import pandas as pd
import os
import re
from Bio import SeqIO
import pysam
from Bio.SeqRecord import SeqRecord
from pybedtools import BedTool
import numpy as np
import pybedtools
import time
import sys
import matplotlib.pyplot as plt
import subprocess
import shutil
from collections import Counter

In [2]:
genome = 'Pst_104E_v12_p_ctg'
LIST_FOLDER = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/enrichment_analysis/lists'
ANNOTATION_FOLDER = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/enrichment_analysis/pa_26062017'
BASE_FOLDER = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12'
ASSEMBLY_FOLDER = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/032017_assembly'
KINFIN_FOLDER = os.path.join(BASE_FOLDER, 'KinFin')
if not os.path.exists(KINFIN_FOLDER):
    os.mkdir(KINFIN_FOLDER)
#out file name for kinfin
kinfin_out_fn = os.path.join(KINFIN_FOLDER, genome + '.functional_annotation.txt')

In [3]:
#make an initial dataframe that has the protein_id has index
protein_fn = [os.path.join(ASSEMBLY_FOLDER, x) for x in os.listdir(ASSEMBLY_FOLDER) if genome in x\
             and  x.endswith('protein.fa')][0]

In [4]:
#now generate a list of ids and length
protein_id_list = []
protein_len_list = []
for seq in SeqIO.parse(protein_fn, 'fasta'):
    protein_id_list.append(seq.id)
    protein_len_list.append(len(seq.seq))
#make a dataframe out of it
kinfin_fa_df = pd.concat([pd.Series(protein_id_list, name='#protein_id'),\
                          pd.Series(protein_len_list, name='protein_len')], axis=1)

In [5]:
#get the GO list and add to the dataframe
GO_fn = [os.path.join(ANNOTATION_FOLDER, x) for x in os.listdir(ANNOTATION_FOLDER) \
        if x.startswith(genome) and x.endswith('GO_combined.tablist')][0]
GO_df = pd.read_csv(GO_fn, header=None, sep='\t', names=['#protein_id', 'GO'])
#merge the GO_df and the kinfin_df
kinfin_fa_df = pd.merge(kinfin_fa_df, GO_df, how='outer', on='#protein_id')

In [6]:
#now pull in all the annotations list file names into a dictionary
anno_keys = ['merops', 'Pfam', 'busco', 'iprscan', 'dbCAN', 'KEGG_combined', 'OGs','SignalP3']
anno_fn_dict = {}
for key in anno_keys:
    #print(key)
    anno_fn_dict[key] = [os.path.join(ANNOTATION_FOLDER, x) for x in os.listdir(ANNOTATION_FOLDER)\
                        if x.startswith(genome) and key in x][0]

In [7]:
def kf_count(_comma_string):
    """
    Quick function that confered a tablist ; seperated id list to a kinfin suitable occurance count.
    """
    _list = _comma_string.split(';')
    _dict = Counter(_list)
    _new_string = ''
    for x in _dict.keys():
        _new_string = '%s;%s:%i'% (_new_string,x, _dict[x])
    return _new_string[1:]

In [8]:
len(kinfin_fa_df)

15928

In [9]:
#now loop throught those anno_fn_dict and pull in the dataframes
kinfin_fa_df
for key in anno_keys:
    #if key in ['merops', ]
    column_id = key.upper()
    tmp_df = pd.read_csv(anno_fn_dict[key], sep='\t', header=None, names=['#protein_id', key])
    #remove the ":" which a left over from funnanotate
    if ":" in tmp_df.loc[0,key]:
        tmp_df[column_id] = tmp_df[key].apply(lambda x: x[x.rindex(':')+1:])
    else:
        tmp_df[column_id] = tmp_df[key]
    #transform the tablist to a counter list sperated by ':'
    tmp_df[column_id] = tmp_df[column_id].apply(lambda x: kf_count(x))
    kinfin_fa_df = pd.merge(kinfin_fa_df, tmp_df.loc[:, ['#protein_id', column_id]], how='outer', on="#protein_id")


In [10]:
len(kinfin_fa_df)

15928

This fixes it for now. Now pull in the effector list and the haustoria expression list as well.

In [11]:
#now get haustoria expressed genes and effector genes (effectorP and in planta upregulated secreted genes)
#for the primary contig cluster 15 is haustoria expressed in for the haplogtigs cluster 15
p_effectorP_fn = os.path.join(LIST_FOLDER, 'Pst_104E_v12_p_effector.list')
p_haustoria_fn = os.path.join(LIST_FOLDER, 'Pst_104E_v12_cluster_8.list')
#now add the those to the dataframe
p_effectorP_df = pd.read_csv(p_effectorP_fn, sep='\t', header=None, names=['#protein_id'])
p_effectorP_df['EFFECTOR'] = 'Effector:1'
kinfin_fa_df = pd.merge(kinfin_fa_df, p_effectorP_df, how='outer', on="#protein_id")
#now add the haustoria expression
p_haustoria_df = pd.read_csv(p_haustoria_fn, sep='\t', header=None, names=['#protein_id'])
p_haustoria_df["SHAUSTORIA"] = 'Haustoria:1'
kinfin_fa_df = pd.merge(kinfin_fa_df, p_haustoria_df, how='outer', on="#protein_id")

In [12]:
#now write out two versions one with length included and one without
kinfin_fa_df.fillna('None', inplace=True)

In [13]:
len(kinfin_fa_df)

15928

In [14]:
len(p_effectorP_df['EFFECTOR'])

1572

In [15]:
kinfin_long_fn = kinfin_out_fn.replace('.txt', '_wlength.txt')
kinfin_fa_df.to_csv(kinfin_long_fn, sep='\t', index=None)
#now write out kinfin version
kinfin_fa_df.drop('protein_len', 1).to_csv(kinfin_out_fn, sep='\t', index=None)

In [16]:
kinfin_fa_df.head()

Unnamed: 0,#protein_id,protein_len,GO,MEROPS,PFAM,BUSCO,IPRSCAN,DBCAN,KEGG_COMBINED,OGS,SIGNALP3,EFFECTOR,SHAUSTORIA
0,evm.model.pcontig_041.101,370,,,,,,,,0ZGCE@NOG:1;0PZ4N@fuNOG:1;14B0R@opiNOG:1;095NX...,,,
1,evm.model.pcontig_018.256,534,GO:0000271;GO:0003674;GO:0003824;GO:0004610;GO...,,PF00408:1;PF02878:1,EOG092R06LD:1,IPR005844:1;IPR016055:6;IPR016657:2;IPR016066:...,,map01110:1;map00520:1,KOG2537@euNOG:1;12PKN@opiNOG:1;0PHNA@fuNOG:1;0...,,,
2,evm.model.pcontig_018.216,951,GO:0000003;GO:0003006;GO:0005575;GO:0005622;GO...,,PF03635:1,EOG092R022P:1,IPR005378:5,,,091W6@basNOG:1;0XNXC@NOG:1;KOG1107@euNOG:1;0PG...,,,
3,evm.model.pcontig_014.344,1191,GO:0004190;GO:0006508,,PF00077:1,,IPR001995:1;IPR021109:2;IPR018061:1;IPR001969:1,,,0PJX6@fuNOG:1;COG2801@NOG:1;KOG0017@euNOG:1;13...,,,
4,evm.model.pcontig_022.278,702,GO:0000086;GO:0000166;GO:0000278;GO:0000322;GO...,,PF16579:1;PF00069:1,,IPR011009:1;IPR008271:1;IPR028375:2;IPR032270:...,,map04151:1;map04150:1;map04113:1,0XNQ0@NOG:1;0927K@basNOG:1;0PGMD@fuNOG:1;12NJJ...,,,


Now add some more analysis in regards to several aspects to this dataframe. For example protein length, annotation % and so on.

In [17]:
kinfin_fa_df.protein_len.mean()

394.5913485685585

In [18]:
n_effectors = kinfin_fa_df[kinfin_fa_df['EFFECTOR'] == 'Effector:1'].count()[0]
print(n_effectors)

1572


In [19]:
len(p_effectorP_df)

1572

In [20]:
kinfin_fa_df[kinfin_fa_df['EFFECTOR'] != 'None']['protein_len'].mean()

251.66284987277353

In [21]:
annotation_list = ['GO', 'IPRSCAN','PFAM', 'OGS', 'KEGG_COMBINED','DBCAN',  'MEROPS','SIGNALP3',  ]

In [22]:
n_effectors = kinfin_fa_df[(kinfin_fa_df['EFFECTOR'] != 'None')].count()[0]
print('Effectors with length %i in %i' % (kinfin_fa_df[(kinfin_fa_df['EFFECTOR'] != 'None')]\
     ['protein_len'].mean(), n_effectors))
for x in annotation_list:

    number_of_annotations = kinfin_fa_df[(kinfin_fa_df['EFFECTOR'] != 'None')&(kinfin_fa_df[x] != 'None')].count()[0]
    print('%i/%0.2f' % (number_of_annotations, round(number_of_annotations/n_effectors*100, 2)))

for x in annotation_list:

    number_of_annotations = kinfin_fa_df[(kinfin_fa_df['EFFECTOR'] != 'None')&(kinfin_fa_df[x] != 'None')].count()[0]
    print('%i/%0.2f' % (number_of_annotations, round(number_of_annotations/n_effectors*100, 2)))


Effectors with length 251 in 1572
145/9.22
193/12.28
165/10.50
206/13.10
43/2.74
31/1.97
10/0.64
1572/100.00
145/9.22
193/12.28
165/10.50
206/13.10
43/2.74
31/1.97
10/0.64
1572/100.00


In [23]:
print("All genes")
print('All proteins with length %i in %i' % (kinfin_fa_df\
     ['protein_len'].mean(), 15928))
for x in annotation_list:
    number_of_annotations = kinfin_fa_df[(kinfin_fa_df[x] != 'None')].count()[0]
    print('%i/%0.2f' % (number_of_annotations, round(number_of_annotations/15928*100, 2)))


All genes
All proteins with length 394 in 15928
5949/37.35
6678/41.93
5950/37.36
7679/48.21
2712/17.03
245/1.54
272/1.71
2430/15.26


In [24]:
kinfin_fa_df.head()

Unnamed: 0,#protein_id,protein_len,GO,MEROPS,PFAM,BUSCO,IPRSCAN,DBCAN,KEGG_COMBINED,OGS,SIGNALP3,EFFECTOR,SHAUSTORIA
0,evm.model.pcontig_041.101,370,,,,,,,,0ZGCE@NOG:1;0PZ4N@fuNOG:1;14B0R@opiNOG:1;095NX...,,,
1,evm.model.pcontig_018.256,534,GO:0000271;GO:0003674;GO:0003824;GO:0004610;GO...,,PF00408:1;PF02878:1,EOG092R06LD:1,IPR005844:1;IPR016055:6;IPR016657:2;IPR016066:...,,map01110:1;map00520:1,KOG2537@euNOG:1;12PKN@opiNOG:1;0PHNA@fuNOG:1;0...,,,
2,evm.model.pcontig_018.216,951,GO:0000003;GO:0003006;GO:0005575;GO:0005622;GO...,,PF03635:1,EOG092R022P:1,IPR005378:5,,,091W6@basNOG:1;0XNXC@NOG:1;KOG1107@euNOG:1;0PG...,,,
3,evm.model.pcontig_014.344,1191,GO:0004190;GO:0006508,,PF00077:1,,IPR001995:1;IPR021109:2;IPR018061:1;IPR001969:1,,,0PJX6@fuNOG:1;COG2801@NOG:1;KOG0017@euNOG:1;13...,,,
4,evm.model.pcontig_022.278,702,GO:0000086;GO:0000166;GO:0000278;GO:0000322;GO...,,PF16579:1;PF00069:1,,IPR011009:1;IPR008271:1;IPR028375:2;IPR032270:...,,map04151:1;map04150:1;map04113:1,0XNQ0@NOG:1;0927K@basNOG:1;0PGMD@fuNOG:1;12NJJ...,,,


In [25]:
haustoria_secreted_proteins = kinfin_fa_df[(kinfin_fa_df['SHAUSTORIA'] != 'None')].count()[0]
print('Haustoria secreted proteins with length %i in %i' % (kinfin_fa_df[(kinfin_fa_df['SHAUSTORIA'] != 'None')]\
     ['protein_len'].mean(), haustoria_secreted_proteins))
for x in annotation_list:
    number_of_annotations = kinfin_fa_df[(kinfin_fa_df['SHAUSTORIA'] != 'None')&(kinfin_fa_df[x] != 'None')].count()[0]
    print('%i/%0.2f' % (number_of_annotations, round(number_of_annotations/haustoria_secreted_proteins*100, 2)))


Haustoria secreted proteins with length 410 in 308
24/7.79
25/8.12
23/7.47
22/7.14
6/1.95
3/0.97
3/0.97
308/100.00


In [26]:
busco_proteins = kinfin_fa_df[(kinfin_fa_df['BUSCO'] != 'None')].count()[0]
print('BUSCOs with length %i in %i' % (kinfin_fa_df[(kinfin_fa_df['BUSCO'] != 'None')]\
     ['protein_len'].mean(), busco_proteins))
for x in annotation_list:
    #print(x)
    number_of_annotations = kinfin_fa_df[(kinfin_fa_df['BUSCO'] != 'None')&(kinfin_fa_df[x] != 'None')].count()[0]
    print('%i/%0.2f' % (number_of_annotations, round(number_of_annotations/busco_proteins*100, 2)))


BUSCOs with length 606 in 1444
1386/95.98
1399/96.88
1352/93.63
1433/99.24
811/56.16
14/0.97
54/3.74
27/1.87


In [27]:
tmp_df = kinfin_fa_df.loc[:,['#protein_id','GO', 'IPRSCAN','PFAM', 'OGS', 'KEGG_COMBINED','DBCAN',  'MEROPS']].copy()

un_annotated_proteins = tmp_df[(tmp_df.GO == 'None') & (tmp_df.IPRSCAN == 'None') & (tmp_df.PFAM == 'None') & (tmp_df.OGS == 'None') & (tmp_df.KEGG_COMBINED == 'None') \
      & (tmp_df.DBCAN == 'None') & (tmp_df.MEROPS == 'None')]['#protein_id']
tmp_df = kinfin_fa_df.loc[:,['#protein_id','GO', 'IPRSCAN','PFAM', 'OGS', 'KEGG_COMBINED','DBCAN',  'MEROPS']].copy()

un_annotated_proteins = tmp_df[(tmp_df.GO == 'None') & (tmp_df.IPRSCAN == 'None') & (tmp_df.PFAM == 'None') & (tmp_df.OGS == 'None') & (tmp_df.KEGG_COMBINED == 'None') \
      & (tmp_df.DBCAN == 'None') & (tmp_df.MEROPS == 'None')]['#protein_id']

print('This is the number of unannotated proteins %i and %0.2f pct' % (len(un_annotated_proteins), \
                                                                   len(un_annotated_proteins)/15928*100))

This is the number of unannotated proteins 7590 and 47.65 pct


In [28]:
#now for BUSCOS
tmp_df = kinfin_fa_df.loc[:,['#protein_id','GO', 'IPRSCAN','PFAM', 'OGS', 'KEGG_COMBINED','DBCAN',  'MEROPS']].copy()

un_annotated_proteins = tmp_df[(kinfin_fa_df['BUSCO'] != 'None')&(tmp_df.GO == 'None') & (tmp_df.IPRSCAN == 'None') & (tmp_df.PFAM == 'None') & (tmp_df.OGS == 'None') & (tmp_df.KEGG_COMBINED == 'None') \
      & (tmp_df.DBCAN == 'None') & (tmp_df.MEROPS == 'None') ]['#protein_id']

print('This is the number of unannotated BUSCOs %i and %0.2f pct' % (len(un_annotated_proteins), \
                                                                   len(un_annotated_proteins)/busco_proteins*100))

This is the number of unannotated BUSCOs 2 and 0.14 pct


In [29]:
#now for BUSCOS
tmp_df = kinfin_fa_df.loc[:,['#protein_id','GO', 'IPRSCAN','PFAM', 'OGS', 'KEGG_COMBINED','DBCAN',  'MEROPS']].copy()

un_annotated_proteins = tmp_df[(kinfin_fa_df['EFFECTOR'] != 'None')&(tmp_df.GO == 'None') & (tmp_df.IPRSCAN == 'None') & (tmp_df.PFAM == 'None') & (tmp_df.OGS == 'None') & (tmp_df.KEGG_COMBINED == 'None') \
      & (tmp_df.DBCAN == 'None') & (tmp_df.MEROPS == 'None') ]['#protein_id']

print('This is the number of unannotated EFFECTOR %i and %0.2f pct' % (len(un_annotated_proteins), \
                                                                   len(un_annotated_proteins)/n_effectors*100))

This is the number of unannotated EFFECTOR 1307 and 83.14 pct
