This is a parser that collects all annotations into a single file for KinFin analysis. It pulls in files from the following two folders:

/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/enrichment_analysis/pa_26062017
and writes an effector tablist into that same folder, built from the lists in

/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/enrichment_analysis/lists

The layout of functional annotation file is as follows.

 \#protein_id GO IPR SignalP_EUK Pfam Effector Merops KEGG

with the following characteristics:

protein_id is simply the ID that is also used in the clustering.
GO is a list of GO terms formatted as GO:XX;GO:XX.
IPR is a ;-separated list of IPRterm:count entries; for now the count is 1 in all cases.
The same is true for all other categories as well.

In [3]:
import pandas as pd
import os
import re
from Bio import SeqIO
import pysam
from Bio.SeqRecord import SeqRecord
from pybedtools import BedTool
import numpy as np
import pybedtools
import time
import sys
import matplotlib.pyplot as plt
import subprocess
import shutil

In [6]:
# Genome prefix shared by the input file names and the KinFin output file.
genome = 'Pst_104E_v12_p_ctg'
# Root of the Pst_104E_v12 analysis tree; the list, annotation and KinFin
# folders are all derived from it so the long path is spelled out only once.
BASE_FOLDER = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12'
LIST_FOLDER = os.path.join(BASE_FOLDER, 'enrichment_analysis', 'lists')
ANNOTATION_FOLDER = os.path.join(BASE_FOLDER, 'enrichment_analysis', 'pa_26062017')
# The assembly folder sits next to BASE_FOLDER (not inside it), so it keeps
# its own literal path.
ASSEMBLY_FOLDER = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/032017_assembly'
KINFIN_FOLDER = os.path.join(BASE_FOLDER, 'KinFin')
# exist_ok=True replaces the check-then-create pattern, which could fail if
# the folder appeared between the check and the mkdir call.
os.makedirs(KINFIN_FOLDER, exist_ok=True)
# out file name for kinfin
kinfin_out_fn = os.path.join(KINFIN_FOLDER, genome + '.functional_annotation.txt')

In [9]:
# Locate the protein FASTA of this genome inside the assembly folder. Every
# directory entry that contains the genome name and ends with 'protein.fa'
# is a candidate; the first one found is used.
protein_candidates = []
for entry in os.listdir(ASSEMBLY_FOLDER):
    if genome in entry and entry.endswith('protein.fa'):
        protein_candidates.append(os.path.join(ASSEMBLY_FOLDER, entry))
protein_fn = protein_candidates[0]

In [30]:
# Read the protein FASTA once and keep an (id, sequence length) pair per record.
fasta_records = [(record.id, len(record.seq)) for record in SeqIO.parse(protein_fn, 'fasta')]
protein_id_list = [record_id for record_id, _ in fasta_records]
protein_len_list = [record_len for _, record_len in fasta_records]
# Put the two named columns side by side: one row per protein.
id_column = pd.Series(protein_id_list, name='#protein_id')
len_column = pd.Series(protein_len_list, name='protein_len')
kinfin_fa_df = pd.concat([id_column, len_column], axis=1)

In [31]:
# Find the combined GO tablist of this genome in the annotation folder; the
# first entry with the genome prefix and the 'GO_combined.tablist' suffix wins.
go_candidates = [
    os.path.join(ANNOTATION_FOLDER, entry)
    for entry in os.listdir(ANNOTATION_FOLDER)
    if entry.startswith(genome) and entry.endswith('GO_combined.tablist')
]
GO_fn = go_candidates[0]
# The tablist has no header row: column one is the protein id, column two the GO terms.
GO_df = pd.read_csv(GO_fn, sep='\t', header=None, names=['#protein_id', 'GO'])
# Outer merge keeps proteins without GO terms as well as ids that only occur in the GO file.
kinfin_fa_df = kinfin_fa_df.merge(GO_df, how='outer', on='#protein_id')

In [37]:
# Map every annotation key to the path of its list file in ANNOTATION_FOLDER.
anno_keys = ['merops', 'Pfam', 'busco', 'iprscan', 'SignalP_EUK', 'dbCAN', 'KEGG_combined', 'OG']
# List the folder and apply the genome-prefix filter once instead of once per key.
genome_annotation_files = [x for x in os.listdir(ANNOTATION_FOLDER) if x.startswith(genome)]
anno_fn_dict = {}
for key in anno_keys:
    # The key is matched as a plain substring of the file name and the first
    # hit is taken; a key without any matching file raises IndexError.
    # NOTE(review): short keys such as 'OG' could match unrelated file names -
    # confirm against the folder content.
    anno_fn_dict[key] = [os.path.join(ANNOTATION_FOLDER, x) for x in genome_annotation_files
                         if key in x][0]

In [70]:
# Loop through anno_fn_dict and merge the annotation tables into a copy of the
# base table. Only the first key is processed for now (anno_keys[:1]).
ext_df = kinfin_fa_df.copy()
for key in anno_keys[:1]:
    # The original 'if' line had no colon and no indented body (SyntaxError);
    # the merops handling below is now the body of that condition.
    if key in ['merops', ]:
        column_id = key.upper()
        tmp_df = pd.read_csv(anno_fn_dict[key], sep='\t', header=None, names=['#protein_id', key])
        # Keep only the text after the last ':' and append the ':1' count.
        tmp_df[column_id] = tmp_df[key].apply(lambda x: x[x.rindex(':')+1:] + ":1")
        ext_df = pd.merge(ext_df, tmp_df.loc[:, ['#protein_id', column_id]], how='outer', on="#protein_id")

In [73]:
# Inspect the last rows of the extended table.
ext_df.tail()

Unnamed: 0,#protein_id,protein_len,GO,MEROPS
15923,evm.model.pcontig_058.74,724,GO:0003676,
15924,evm.model.pcontig_013.115,436,,
15925,evm.model.pcontig_005.99,565,,
15926,evm.model.pcontig_130.4,399,GO:0000139;GO:0003674;GO:0005488;GO:0005515;GO...,
15927,evm.model.pcontig_002.134,287,GO:0016491,


In [53]:
# Scratch check of the slice used for the MEROPS column: everything after the last ':'.
# NOTE(review): `test` is not defined in any cell visible here.
test[test.rindex(':')+1:]

'MER034960'

In [32]:
# Inspect the first rows of kinfin_fa_df.
kinfin_fa_df.head()

Unnamed: 0,#protein_id,protein_len,GO
0,evm.model.pcontig_041.101,370,
1,evm.model.pcontig_018.256,534,GO:0000271;GO:0003674;GO:0003824;GO:0004610;GO...
2,evm.model.pcontig_018.216,951,GO:0000003;GO:0003006;GO:0005575;GO:0005622;GO...
3,evm.model.pcontig_014.344,1191,GO:0004190;GO:0006508
4,evm.model.pcontig_022.278,702,GO:0000086;GO:0000166;GO:0000278;GO:0000322;GO...


In [None]:
# Collect the ids of all proteins that are in the final assembly: pick the
# first '*anno.protein.fa' file of the genome and read the id of every record.
protein_fa_file = [entry for entry in os.listdir(BASE_A_PATH)
                   if p_genome in entry and entry.endswith('anno.protein.fa')][0]
protein_fa_path = os.path.join(BASE_A_PATH, protein_fa_file)
p_protein_list = [record.id for record in SeqIO.parse(protein_fa_path, 'fasta')]

In [None]:
# Column names applied to the eggNOG table when it is read without a header row.
# NOTE(review): 'COG', 'cat' and 'eggNOG', 'annot' are four separate names here;
# they may have been meant as the two columns 'COG cat' and 'eggNOG annot' -
# confirm against the input file before changing.
eggnog_blast_header = [
    'query_name',
    'seed_eggNOG_ortholog',
    'seed_ortholog_evalue',
    'seed_ortholog_score',
    'predicted_gene_name',
    'GO_terms',
    'KEGG_pathways',
    'Annotation_tax_scope',
    'OGs',
    'bestOG|evalue|score',
    'COG',
    'cat',
    'eggNOG',
    'annot',
]

In [None]:
# Show the list of column names.
eggnog_blast_header

In [None]:
# Load the eggNOG table: tab separated, first three lines skipped, no header
# row in the file, column names taken from eggnog_blast_header.
eggnog_blast_path = os.path.join(BASE_FOLDER, EGGNOG_BLAST_FILE)
eggnog_blast_df = pd.read_csv(
    eggnog_blast_path,
    sep='\t',
    header=None,
    names=eggnog_blast_header,
    skiprows=3,
)

In [None]:
# Show the row at position 1 (the second row) with all of its columns.
eggnog_blast_df.iloc[1,:]

In [None]:
# Replace every missing value with 0, modifying the frame in place; the
# `!= 0` filters in the export loop further down use 0 as "no annotation".
eggnog_blast_df.fillna(0, inplace =True)

In [None]:
# Show the column labels of the eggNOG table.
eggnog_blast_df.columns

In [None]:
# Show the selected annotation columns.
# NOTE(review): in the visible cells DBs is only assigned in the following
# cell, so running top to bottom this line raises NameError - confirm cell order.
DBs

In [None]:
# Pick all annotation columns, one at a time, and write each out as a tab file
# and as an annotations file. The latter can be used to annotate gff files
# using gag.py.
DBs = [x for x in eggnog_blast_df.columns.tolist()[4:] if x not in ['GO_terms', 'KEGG_pathways', 'OGs']]
# Constant middle column of the three-column annotations files.
eggnog_blast_df['note'] = 'note'
for db in DBs:
    # Rows whose value is 0 carry no annotation for this column (missing values
    # were filled with 0). Filter once and reuse the result for both files.
    annotated = eggnog_blast_df[eggnog_blast_df[db] != 0]
    if annotated.empty:
        # Nothing to write for this column; no file is created.
        continue
    annotated[['query_name', db]].to_csv(
        os.path.join(OUT_PATH, db+'_terms.tab'), sep='\t', header=False, index=False)
    annotated[['query_name', 'note', db]].to_csv(
        os.path.join(OUT_PATH, 'annotations.' +db+'.txt'), sep='\t', header=False, index=False)
    
    

In [None]:
working_dict = {}
# Turn the comma separated GO string of every row into a list of GO terms.
# Entries that are not strings (e.g. a numeric fill value) come back as NaN.
eggnog_blast_df['GO_terms'] = eggnog_blast_df.GO_terms.str.split(',')

# Show only the rows that actually carry GO terms. Comparing against the
# string 'NaN' never matches a missing value, so it selected every row.
eggnog_blast_df[eggnog_blast_df['GO_terms'].notna()]

In [None]:
# Show the GO_terms column.
eggnog_blast_df['GO_terms']

In [None]:
# Process the pathway annotations in interpro_df and write them out as tab
# files and annotation files, once for all proteins and once restricted to
# the proteins in p_protein_list.

# Keep the rows whose Pathway_IDs mention one of the three pathway databases.
# This has to be a regex alternation: the Python expression
# 'KEGG' or 'MetaCyc' or 'Reactome' evaluates to just 'KEGG'. Rows without a
# string value count as "no match" (na=False).
has_pathway = interpro_df.Pathway_IDs.str.contains('KEGG|MetaCyc|Reactome', na=False)

# One set of raw Pathway_IDs strings per protein, built from the filtered
# rows. Previously the filtered groupby was overwritten by a different
# groupby object, which made the filter above dead code.
interpro_by_protein_KEGG = interpro_df[has_pathway].groupby('Updated_Protein_ID')
interpro_by_protein_KEGG = interpro_by_protein_KEGG.Pathway_IDs.apply(set)

# remove everything whose only entry is the 0 placeholder
interpro_by_protein_KEGG = interpro_by_protein_KEGG[~(interpro_by_protein_KEGG == {0})]

interpro_by_protein_KEGG_dict = dict(zip(interpro_by_protein_KEGG.index, interpro_by_protein_KEGG))

# Flatten to two parallel lists: one (protein id, pathway term) pair per position.
ALL_KEGG_LIST = []
ALL_PROTEIN_INDEX_LIST = []
for protein_id, raw_entries in interpro_by_protein_KEGG_dict.items():
    unique_terms = set()
    for entry in raw_entries:
        # skip the 0 placeholder
        if entry != 0:
            # One entry may bundle several terms separated by '|'; split()
            # returns a one-element list when there is no separator.
            unique_terms.update(entry.split('|'))
    # Sort so the written files are identical from run to run; plain set
    # order of strings is not stable between interpreter sessions.
    ordered_terms = sorted(unique_terms)
    ALL_KEGG_LIST += ordered_terms
    ALL_PROTEIN_INDEX_LIST += [protein_id] * len(ordered_terms)

# Building from a dict keeps both named columns even when the lists are empty.
KEGG_df = pd.DataFrame({'Updated_Protein_ID': ALL_PROTEIN_INDEX_LIST, 'DB_ID': ALL_KEGG_LIST},
                       columns=['Updated_Protein_ID', 'DB_ID'])

# Rows that belong to proteins in p_protein_list; computed once, used for both filtered files.
in_p_proteins = KEGG_df.Updated_Protein_ID.isin(p_protein_list)

KEGG_df.to_csv(os.path.join(OUT_PATH_P_ALL, 'Pathway_terms_ipr_all.tab'), sep='\t', header=False, index=False)
KEGG_df[in_p_proteins].to_csv(os.path.join(OUT_PATH_P, 'Pathway_terms_ipr.tab'), sep='\t', header=False, index=False)
#write out annotations
KEGG_df['Transfer_ID'] = 'note'
annotation_columns = ['Updated_Protein_ID', 'Transfer_ID', 'DB_ID']
KEGG_df[annotation_columns].to_csv(
    os.path.join(OUT_PATH_P_ALL, 'annotations.Pathway_all.txt'), sep='\t', header=False, index=False)
KEGG_df.loc[in_p_proteins, annotation_columns].to_csv(
    os.path.join(OUT_PATH_P, 'annotations.Pathway.txt'), sep='\t', header=False, index=False)

In [None]:
# Pull out all the GO terms per protein and write them out as tab files and
# annotation files, once for all proteins and once restricted to the
# proteins in p_protein_list.

# One set of raw GO_terms values per protein.
interpro_by_protein_GO = interpro_by_protein.GO_terms.apply(set)

#remove everything whose only entry is the 0 placeholder
interpro_by_protein_GO = interpro_by_protein_GO[~(interpro_by_protein_GO == {0})]

interpro_by_protein_GO_dict = dict(zip(interpro_by_protein_GO.index, interpro_by_protein_GO))

# Flatten to two parallel lists: one (protein id, GO term) pair per position.
ALL_GO_LIST = []
ALL_PROTEIN_INDEX_LIST = []
for protein_id, raw_entries in interpro_by_protein_GO_dict.items():
    unique_terms = set()
    for entry in raw_entries:
        # skip the 0 placeholder
        if entry != 0:
            # One entry may bundle several terms separated by '|'; split()
            # returns a one-element list when there is no separator.
            unique_terms.update(entry.split('|'))
    # Sort so the written files are identical from run to run; plain set
    # order of strings is not stable between interpreter sessions.
    ordered_terms = sorted(unique_terms)
    ALL_GO_LIST += ordered_terms
    ALL_PROTEIN_INDEX_LIST += [protein_id] * len(ordered_terms)

# Building from a dict keeps both named columns even when the lists are empty.
GO_df = pd.DataFrame({'Updated_Protein_ID': ALL_PROTEIN_INDEX_LIST, 'DB_ID': ALL_GO_LIST},
                     columns=['Updated_Protein_ID', 'DB_ID'])

# Rows that belong to proteins in p_protein_list; computed once, used for both filtered files.
in_p_proteins = GO_df.Updated_Protein_ID.isin(p_protein_list)

GO_df.to_csv(os.path.join(OUT_PATH_P_ALL, 'GO_terms_ipr_all.tab'), sep='\t', header=False, index=False)
GO_df[in_p_proteins].to_csv(os.path.join(OUT_PATH_P, 'GO_terms_ipr.tab'), sep='\t', header=False, index=False)
#write out annotations
GO_df['Transfer_ID'] = 'note'
annotation_columns = ['Updated_Protein_ID', 'Transfer_ID', 'DB_ID']
GO_df[annotation_columns].to_csv(
    os.path.join(OUT_PATH_P_ALL, 'annotations.GO_all.txt'), sep='\t', header=False, index=False)
GO_df.loc[in_p_proteins, annotation_columns].to_csv(
    os.path.join(OUT_PATH_P, 'annotations.GO.txt'), sep='\t', header=False, index=False)

In [None]:
# Collect the remaining annotation files (busco, dbCAN, merops, swissprot):
# entries that start with 'annotations' and end with one of the suffixes below.
wanted_suffixes = ('busco.txt', 'dbCAN.txt', 'merops.txt', 'swissprot.txt')
annotation_files = []
for entry in os.listdir(ANNOTATION_FILE_BASE_PATH):
    if entry.startswith('annotations') and entry.endswith(wanted_suffixes):
        annotation_files.append(os.path.join(ANNOTATION_FILE_BASE_PATH, entry))

In [None]:
# For every annotation file write four outputs: a two-column tab file and a
# three-column annotation file, each once restricted to the proteins in
# p_protein_list (OUT_PATH_P) and once unfiltered (OUT_PATH_P_ALL).
for anno in annotation_files:
    _tmp_df = pd.read_csv(anno, header=None, sep='\t', names=['Protein_ID','Transfer_ID', 'DB_ID'])
    anno_file_name = os.path.basename(anno)
    # Second dot-separated part of the file name, e.g. 'busco' for 'annotations.busco.txt'.
    anno_midfix = anno_file_name.split('.')[1]
    _tmp_df['Updated_Protein_ID'] = protein_id_conversion(_tmp_df)
    # Rows of proteins in p_protein_list; computed once and reused for both filtered files.
    in_p_proteins = _tmp_df.Updated_Protein_ID.isin(p_protein_list)
    tab_columns = ['Updated_Protein_ID', 'DB_ID']
    annotation_columns = ['Updated_Protein_ID', 'Transfer_ID', 'DB_ID']
    #write out the filtered tab file
    _tmp_df.loc[in_p_proteins, tab_columns].to_csv(
        os.path.join(OUT_PATH_P, anno_midfix+'_terms.tab'), sep='\t', header=False, index=False)
    #write out the filtered annotation file
    _tmp_df.loc[in_p_proteins, annotation_columns].to_csv(
        os.path.join(OUT_PATH_P, anno_file_name), sep='\t', header=False, index=False)
    #write out the unfiltered tab file
    _tmp_df[tab_columns].to_csv(
        os.path.join(OUT_PATH_P_ALL, anno_midfix+'_terms_all.tab'), sep='\t', header=False, index=False)
    #write out the unfiltered annotation file
    _tmp_df[annotation_columns].to_csv(
        os.path.join(OUT_PATH_P_ALL, anno_file_name.replace('.txt', '_all.txt')), sep='\t', header=False, index=False)