Idea is to read in the annotation file of emapper.py and pull out the annotations.
This was run like:
/home/benjamin/anaconda3/envs/funannotate/eggnog-mapper-0.99.2/emapper.py -i /home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/032017_assembly/Pst_104E_v12_h_ctg.anno.protein.fa -d euk --output Pst_104E_v12_h_ctg --cpu 12


In [1]:
import pandas as pd
import os
import re
from Bio import SeqIO
import pysam
from Bio.SeqRecord import SeqRecord
from pybedtools import BedTool
import numpy as np
import pybedtools
import time
import matplotlib.pyplot as plt
import sys
import subprocess
import shutil

In [2]:
BASE_FOLDER = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12/protein_annotation/Pst_104E_v12_p_ctg/eggnog-mapper'
EGGNOG_BLAST_FILE = 'Pst_104E_v12_p_ctg.emapper.annotations'
EGGNOG_DIAMON_FILE = 'Pst_104E_v12_p_ctg_diamond.emapper.annotations'
BASE_AA_PATH = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12'
BASE_A_PATH = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/032017_assembly'
p_genome = 'Pst_104E_v12_p_ctg'
OUT_PATH = os.path.join(BASE_AA_PATH, 'protein_annotation',p_genome, 'eggnog-mapper', 'parsed')
if not os.path.exists(OUT_PATH):
    os.mkdir(OUT_PATH)


In [3]:
#pull out all proteins that are in the final assembly
p_protein_list = []
protein_fa_file = [x for x in os.listdir(BASE_A_PATH) if p_genome in x and x.endswith('anno.protein.fa')][0]
for protein in SeqIO.parse(os.path.join(BASE_A_PATH, protein_fa_file) , 'fasta'):
    p_protein_list.append(protein.id)

In [4]:
eggnog_blast_header = 'query_name seed_eggNOG_ortholog seed_ortholog_evalue seed_ortholog_score predicted_gene_name \
GO_terms KEGG_pathways Annotation_tax_scope OGs bestOG|evalue|score COG cat eggNOG annot'.split(' ')

In [5]:
eggnog_blast_header

['query_name',
 'seed_eggNOG_ortholog',
 'seed_ortholog_evalue',
 'seed_ortholog_score',
 'predicted_gene_name',
 'GO_terms',
 'KEGG_pathways',
 'Annotation_tax_scope',
 'OGs',
 'bestOG|evalue|score',
 'COG',
 'cat',
 'eggNOG',
 'annot']

In [25]:
eggnog_blast_df = pd.read_csv(os.path.join(BASE_FOLDER, EGGNOG_BLAST_FILE), sep ='\t', header=None, names=eggnog_blast_header, skiprows=3)

In [26]:
eggnog_blast_df.iloc[1,:]

query_name                                      evm.model.pcontig_018.256
seed_eggNOG_ortholog                                        5297.EFP79055
seed_ortholog_evalue                                                    0
seed_ortholog_score                                                1060.4
predicted_gene_name                                                  PCM1
GO_terms                GO:0000271,GO:0003674,GO:0003824,GO:0004610,GO...
KEGG_pathways                                           map00520,map01110
Annotation_tax_scope                                            fuNOG[21]
OGs                     092IQ@basNOG,0PHNA@fuNOG,12PKN@opiNOG,COG1109@...
bestOG|evalue|score                                  092IQ|2.1e-258|862.0
COG                                                                     G
cat                                       Phosphoacetylglucosamine mutase
eggNOG                                                                NaN
annot                                 

In [8]:
eggnog_blast_df.fillna(0, inplace =True)

In [9]:
eggnog_blast_df.columns

Index(['query_name', 'seed_eggNOG_ortholog', 'seed_ortholog_evalue',
       'seed_ortholog_score', 'predicted_gene_name', 'GO_terms',
       'KEGG_pathways', 'Annotation_tax_scope', 'OGs', 'bestOG|evalue|score',
       'COG', 'cat', 'eggNOG', 'annot'],
      dtype='object')

In [12]:
#pick all annotation columns. One at a time and write them out as tab file or annotations files. The later can be used
#to annotate gff files using gag.py
DBs = [x for x in eggnog_blast_df.columns.tolist()[4:] if x not in ['GO_terms','KEGG_pathways', 'OGs','bestOG|evalue|score' ] ]
eggnog_blast_df['note'] = 'note'
for db in DBs:
    if len(eggnog_blast_df[eggnog_blast_df[db] !=0]) > 0:
        eggnog_blast_df[eggnog_blast_df[db] !=0].loc[:,['query_name',db]]\
        .to_csv(os.path.join(OUT_PATH, db+'_terms.tab'), sep='\t', header =None, index = None)
        eggnog_blast_df[eggnog_blast_df[db] !=0].loc[:,['query_name', 'note', db]]\
        .to_csv(os.path.join(OUT_PATH, 'annotations.' +db+'.txt'), sep='\t', header =None, index = None)
    
    

In [29]:
#now look after the remaining columns which have potentially multiple entries as values eg. GOterm, GOterm, GOterm
#those values need to be split into a list and this made into a new dataframe before writting it out again.
eggnog_blast_df = pd.read_csv(os.path.join(BASE_FOLDER, EGGNOG_BLAST_FILE), sep ='\t', header=None, names=eggnog_blast_header, skiprows=3)
#['GO_terms','KEGG_pathways', 'OGs']
for db in ['GO_terms','KEGG_pathways', 'OGs']:
    eggnog_blast_df[db] = eggnog_blast_df[db].str.split(',')
    all_term_list = []
    all_query_list = []
    eggnog_blast_df[db].fillna(0, inplace=True)
    for key in eggnog_blast_df[eggnog_blast_df[db] != 0 ].index:
        for term in eggnog_blast_df.loc[key,db]:
            all_term_list.append(term)
            all_query_list.append(eggnog_blast_df.loc[key,'query_name'])
    tmp_df = pd.DataFrame([all_query_list, all_term_list]).T
    tmp_df.rename(columns={0:'query_name', 1:'DB_ID'}, inplace=True)
    tmp_df.to_csv(os.path.join(OUT_PATH, db+'_terms.tab'), sep='\t', header =None, index = None)
    tmp_df['note'] = 'note'
    tmp_df.loc[:, ['query_name', 'note', 'DB_ID']].to_csv(os.path.join(OUT_PATH, 'annotations.' +db+'.txt'), sep='\t', header =None, index = None)

In [40]:
#combine KEGG annotations, and GO terms
egg_GO_file = [os.path.join(OUT_PATH, x) for x in os.listdir(OUT_PATH) if x == 'annotations.GO_terms.txt'][0] 
egg_KEGG_file = [os.path.join(OUT_PATH, x) for x in os.listdir(OUT_PATH) if x == 'annotations.KEGG_pathways.txt'][0]
interpro_GO_file = [os.path.join(BASE_AA_PATH, 'protein_annotation',p_genome, x)\
                      for x in os.listdir(os.path.join(BASE_AA_PATH, 'protein_annotation',p_genome)) if x == 'annotations.GO.txt'][0]
interpro_KEGG_file = [os.path.join(BASE_AA_PATH, 'protein_annotation',p_genome, x) \
                    for x in os.listdir(os.path.join(BASE_AA_PATH, 'protein_annotation',p_genome)) if x == 'annotations.Pathway.txt'][0]

In [41]:
GO_df = pd.concat([pd.read_csv(egg_GO_file, header=None, sep='\t'), pd.read_csv(interpro_GO_file, header=None, sep='\t')])

In [45]:
GO_df.drop_duplicates().to_csv(os.path.join(OUT_PATH, 'annotations.GO_combined.txt'), header=None, index =None, sep ='\t')

In [62]:
interpro_pathway_df = pd.read_csv(interpro_KEGG_file, header = None, sep='\t', names=['query_name','note', 'DB'])

In [63]:
interpro_pathway_df.head()

Unnamed: 0,query_name,note,DB
0,evm.model.pcontig_003.285,note,Reactome: R-HSA-163210
1,evm.model.pcontig_056.114,note,Reactome: R-HSA-72163
2,evm.model.pcontig_007.264,note,Reactome: R-HSA-2151209
3,evm.model.pcontig_007.264,note,Reactome: R-HSA-5628897
4,evm.model.pcontig_007.264,note,Reactome: R-HSA-1632852


In [64]:
interpro_KEGG_df = interpro_pathway_df[interpro_pathway_df.DB.str.contains('KEGG')]

In [81]:
interpro_KEGG_df['new_KEGG'] = 'map'+interpro_KEGG_df.DB.str.extract(r'KEGG: ([0-9]+)+')

  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [87]:
KEGG_df = pd.concat([interpro_KEGG_df.loc[:,['query_name', 'note', 'new_KEGG']],\
                     pd.read_csv(egg_KEGG_file, header=None, sep='\t', names=['query_name','note', 'new_KEGG'])])

In [89]:
KEGG_df.drop_duplicates().to_csv(os.path.join(OUT_PATH, 'annotations.KEGG_combined.txt'), header=None, index =None, sep ='\t')

In [90]:
interpro_pathway_df[~interpro_pathway_df.DB.str.contains('KEGG')].to_csv(os.path.join(BASE_AA_PATH, 'protein_annotation',p_genome,\
                                                                                     'annotations.Pathway_no_KEGG.txt'), header=None, index=None, sep='\t')