This is a parser to get all annotations in a single file for kinfin analysis. Pull in files from the following two folders:

/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/enrichment_analysis/pa_26062017
and write an effector tablist into the same folder, built from the lists in

/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/enrichment_analysis/lists

The layout of functional annotation file is as follows.

 \#protein_id GO IPR SignalP_EUK Pfam Effector Merops KEGG

with the following characteristics

protein_id is simply the id that is also used in the clustering
GO is a list of GO terms as GO:XX;GO:XX
IPR is the ;-separated list of IPRterm:count; for now this count will be one in all cases
The same is true for all other categories as well.

In [1]:
import pandas as pd
import os
import re
from Bio import SeqIO
import pysam
from Bio.SeqRecord import SeqRecord
from pybedtools import BedTool
import numpy as np
import pybedtools
import time
import sys
import matplotlib.pyplot as plt
import subprocess
import shutil
from collections import Counter

In [2]:
# prefix used to select this genome's input files and to name the KinFin output
genome = 'Pst_104E_v12_h_ctg'
# every input folder is derived from BASE_FOLDER so only this one path needs
# editing when the data moves; the resulting strings equal the former literals
BASE_FOLDER = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12'
LIST_FOLDER = os.path.join(BASE_FOLDER, 'enrichment_analysis', 'lists')
ANNOTATION_FOLDER = os.path.join(BASE_FOLDER, 'enrichment_analysis', 'pa_26062017')
# the assembly folder sits next to BASE_FOLDER, one level up
ASSEMBLY_FOLDER = os.path.join(os.path.dirname(BASE_FOLDER), '032017_assembly')
KINFIN_FOLDER = os.path.join(BASE_FOLDER, 'KinFin')
# exist_ok makes the cell safe to re-run and avoids the check-then-create race;
# makedirs also creates missing parent folders
os.makedirs(KINFIN_FOLDER, exist_ok=True)
#out file name for kinfin
kinfin_out_fn = os.path.join(KINFIN_FOLDER, genome + '.functional_annotation.txt')
#define the effector and haustoria expressed gene lists
#for the haplotigs cluster 15 is haustoria expressed
p_effectorP_fn = os.path.join(LIST_FOLDER, 'Pst_104E_v12_h_effector.list')
p_haustoria_fn = os.path.join(LIST_FOLDER, 'Pst_104E_v12_h_cluster_15.list')

In [3]:
# locate the protein fasta of this genome; its ids become the key column of the dataframe
protein_candidates = []
for fn in os.listdir(ASSEMBLY_FOLDER):
    if genome in fn and fn.endswith('protein.fa'):
        protein_candidates.append(os.path.join(ASSEMBLY_FOLDER, fn))
# first match wins; an IndexError here means no matching file was found
protein_fn = protein_candidates[0]

In [4]:
# collect (id, sequence length) for every record of the protein fasta in one pass
id_len_pairs = [(record.id, len(record.seq)) for record in SeqIO.parse(protein_fn, 'fasta')]
protein_id_list = [protein_id for protein_id, _ in id_len_pairs]
protein_len_list = [protein_len for _, protein_len in id_len_pairs]
# two-column frame that all annotation tables are merged onto later
kinfin_fa_df = pd.DataFrame(id_len_pairs, columns=['#protein_id', 'protein_len'])

In [5]:
# find this genome's combined GO tablist among the annotation files
go_candidates = [fn for fn in os.listdir(ANNOTATION_FOLDER)
                 if fn.startswith(genome) and fn.endswith('GO_combined.tablist')]
GO_fn = os.path.join(ANNOTATION_FOLDER, go_candidates[0])
# the file is read without a header row: first column protein id, second column GO terms
GO_df = pd.read_csv(GO_fn, sep='\t', header=None, names=['#protein_id', 'GO'])
# outer join keeps ids that occur in only one of the two tables
kinfin_fa_df = kinfin_fa_df.merge(GO_df, how='outer', on='#protein_id')

In [6]:
# map every annotation key to the first file of this genome whose name contains the key
anno_keys = ['merops', 'Pfam', 'busco', 'iprscan', 'SignalP_EUK', 'dbCAN', 'KEGG_combined', 'OGs']
annotation_files = os.listdir(ANNOTATION_FOLDER)
anno_fn_dict = {}
for key in anno_keys:
    key_matches = [fn for fn in annotation_files if fn.startswith(genome) and key in fn]
    # an IndexError here means no file matched this key
    anno_fn_dict[key] = os.path.join(ANNOTATION_FOLDER, key_matches[0])

In [7]:
def kf_count(_comma_string):
    """
    Convert a ';'-separated tablist id string into a KinFin occurrence-count string.

    Every distinct id is emitted once as 'id:count', in order of first
    appearance, and the entries are joined by ';' (e.g. 'a;b;a' -> 'a:2;b:1').
    Empty tokens, as produced by an empty input or by leading, trailing or
    doubled ';', are skipped so they do not turn into a bogus ':1' entry.

    Parameters
    ----------
    _comma_string : str
        ids separated by ';' (despite the parameter name, not by commas).

    Returns
    -------
    str
        ';'-joined 'id:count' entries; '' when the input holds no ids.
    """
    _counts = Counter(_id for _id in _comma_string.split(';') if _id)
    return ';'.join('%s:%i' % (_id, _n) for _id, _n in _counts.items())

In [8]:
#now loop through the anno_fn_dict entries and merge each tablist into kinfin_fa_df
#NOTE: this cell rebinds kinfin_fa_df, so re-running it without re-running the
#cells above merges the same columns a second time
for key in anno_keys:
    column_id = key.upper()
    tmp_df = pd.read_csv(anno_fn_dict[key], sep='\t', header=None, names=['#protein_id', key])
    #remove the "prefix:" which is a left over from funannotate;
    #the first row decides whether this tablist carries such a prefix
    if ":" in tmp_df.loc[0, key]:
        #rfind returns -1 for a row without ':', so that row is kept unchanged
        #instead of raising ValueError as rindex would
        tmp_df[column_id] = tmp_df[key].apply(lambda x: x[x.rfind(':') + 1:])
    else:
        tmp_df[column_id] = tmp_df[key]
    #transform the ';' separated tablist entry into 'id:count' pairs
    tmp_df[column_id] = tmp_df[column_id].apply(kf_count)
    kinfin_fa_df = pd.merge(kinfin_fa_df, tmp_df.loc[:, ['#protein_id', column_id]], how='outer', on="#protein_id")


In [9]:
# show which annotation file was picked for each key
anno_fn_dict

{'KEGG_combined': '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/enrichment_analysis/pa_26062017/Pst_104E_v12_h_ctg.KEGG_combined.tablist',
 'OGs': '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/enrichment_analysis/pa_26062017/Pst_104E_v12_h_ctg.OGs.tablist',
 'Pfam': '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/enrichment_analysis/pa_26062017/Pst_104E_v12_h_ctg.Pfam.tablist',
 'SignalP_EUK': '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/enrichment_analysis/pa_26062017/Pst_104E_v12_h_ctg.SignalP_EUK.tablist',
 'busco': '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/enrichment_analysis/pa_26062017/Pst_104E_v12_h_ctg.busco.tablist',
 'dbCAN': '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/enrichment_analysis/pa_26062017/Pst_104E_v12_h_ctg.dbCAN.tablist',
 'iprscan': '/home/benjamin/genome_assembly/PST79/FALCON/p_a

This fixes it for now. Now pull in the effector list and the haustoria expression list as well.

In [10]:
# flag effector candidates: every id in the effector list gets the KinFin entry 'Effector:1'
p_effectorP_df = pd.read_csv(
    p_effectorP_fn, sep='\t', header=None, names=['#protein_id']
).assign(EFFECTOR='Effector:1')
kinfin_fa_df = kinfin_fa_df.merge(p_effectorP_df, how='outer', on="#protein_id")
# flag haustoria expressed genes the same way
# NOTE(review): the column name 'SHAUSTORIA' looks like a typo for 'HAUSTORIA';
# it is kept because it ends up in the header of the written files - confirm before renaming
p_haustoria_df = pd.read_csv(
    p_haustoria_fn, sep='\t', header=None, names=['#protein_id']
).assign(SHAUSTORIA='Haustoria:1')
kinfin_fa_df = kinfin_fa_df.merge(p_haustoria_df, how='outer', on="#protein_id")

In [11]:
# mark every missing value (ids without a given annotation) with the string 'None'
kinfin_fa_df = kinfin_fa_df.fillna('None')

In [12]:
# show the path the KinFin annotation table is written to
kinfin_out_fn

'/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/KinFin/Pst_104E_v12_h_ctg.functional_annotation.txt'

In [13]:
#write out two versions: one with the protein length included and one without
#the "_wlength" name is derived from the extension only, so a '.txt' elsewhere
#in the path is left untouched
_out_root, _out_ext = os.path.splitext(kinfin_out_fn)
kinfin_long_fn = _out_root + '_wlength' + _out_ext
kinfin_fa_df.to_csv(kinfin_long_fn, sep='\t', index=False)
#now write out kinfin version
#axis is passed by keyword: the positional form drop('protein_len', 1) was
#removed in pandas 2.0
kinfin_fa_df.drop('protein_len', axis=1).to_csv(kinfin_out_fn, sep='\t', index=False)

Now add some more analysis in regards to several aspects to this dataframe. For example protein length, annotation % and so on.



In [18]:
# mean protein length over all rows of the table
kinfin_fa_df['protein_len'].mean()

391.4988478458208

In [19]:
# list the columns of the merged annotation table
kinfin_fa_df.columns

Index(['#protein_id', 'protein_len', 'GO', 'MEROPS', 'PFAM', 'BUSCO',
       'IPRSCAN', 'SIGNALP_EUK', 'DBCAN', 'KEGG_COMBINED', 'OGS', 'EFFECTOR',
       'SHAUSTORIA'],
      dtype='object')

In [29]:
# number of rows flagged as effector (EFFECTOR is 'None' for all other rows);
# summing the boolean mask replaces .count()[0], whose positional lookup on a
# label-indexed Series is deprecated in recent pandas
n_effectors = (kinfin_fa_df['EFFECTOR'] != 'None').sum()
print(n_effectors)

1388


In [28]:
# mean protein length of the rows flagged as effector
kinfin_fa_df.loc[kinfin_fa_df['EFFECTOR'] != 'None', 'protein_len'].mean()

240.97910662824208

In [30]:
# annotation columns of kinfin_fa_df that the coverage summaries below iterate over
annotation_list = [
    'GO',
    'MEROPS',
    'PFAM',
    'BUSCO',
    'IPRSCAN',
    'SIGNALP_EUK',
    'DBCAN',
    'KEGG_COMBINED',
    'OGS',
]

In [31]:
# per annotation type: number and fraction of effector rows that carry an annotation
# the effector mask does not change inside the loop, so it is built once
is_effector = kinfin_fa_df['EFFECTOR'] != 'None'
for x in annotation_list:
    print(x)
    # summing the combined mask replaces .count()[0], whose positional lookup
    # on a label-indexed Series is deprecated in recent pandas
    number_of_annotations = (is_effector & (kinfin_fa_df[x] != 'None')).sum()
    print(number_of_annotations)
    print(number_of_annotations/n_effectors)

GO
44
0.0317002881844
MEROPS
0
0.0
PFAM
7
0.00504322766571
BUSCO
0
0.0
IPRSCAN
8
0.00576368876081
SIGNALP_EUK
71
0.0511527377522
DBCAN
1
0.000720461095101
KEGG_COMBINED
23
0.0165706051873
OGS
164
0.118155619597


Need to redo signalp annotation with older version of signalp e.g. v3 and not v4

In [33]:
# per annotation type: number and percentage of rows that carry an annotation
# NOTE(review): 14321 is presumably the total number of proteins of this genome;
# confirm and consider deriving it from the data instead of hard-coding it
N_PROTEINS_TOTAL = 14321
for x in annotation_list:
    print(x)
    # summing the boolean mask replaces .count()[0], whose positional lookup
    # on a label-indexed Series is deprecated in recent pandas
    number_of_annotations = (kinfin_fa_df[x] != 'None').sum()
    print(number_of_annotations)
    print(number_of_annotations/N_PROTEINS_TOTAL*100)

GO
3948
27.567907269
MEROPS
14
0.0977585364151
PFAM
396
2.76517003003
BUSCO
1293
9.02869911319
IPRSCAN
435
3.03749738147
SIGNALP_EUK
132
0.921723343342
DBCAN
9
0.0628447734097
KEGG_COMBINED
2331
16.2767963131
OGS
6919
48.3136652468


### 