The idea is to read in the .tsv output file of InterProScan and pull out the annotations per protein (here: the GO terms), given that InterProScan was run with the options shown below.
InterProScan was run like this:
$INTPRO/interproscan.sh -i ../../../v91_cns_gcoords_curs_ph_ctg_p_ctg.evm.all.protein.fa -iprlookup -goterms -pa
and produced the following file:
v91_cns_gcoords_curs_ph_ctg_p_ctg.evm.all.protein.intpro.fa.tsv


In [1]:
import pandas as pd
import os
import re
from Bio import SeqIO
import pysam
from Bio.SeqRecord import SeqRecord
from pybedtools import BedTool
import numpy as np
import pybedtools
import time
import matplotlib.pyplot as plt
import sys
import subprocess
import shutil

In [2]:
# Input: folder that holds the InterProScan results and the name of the TSV file in it.
BASE_FOLDER = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/funnotate/Pst_79p/07112016_Pst79_p_funannotate/annotate_misc/iprscan'
INTERPRO_TSV_FILE = 'v91_cns_gcoords_curs_ph_ctg_p_ctg.evm.all.protein.intpro.fa.tsv'
# Output: annotation tables are written below BASE_AA_PATH/protein_annotation.
BASE_AA_PATH = '/home/benjamin/genome_assembly/PST79/FALCON/p_assemblies/v9_1/Pst_104E_v12'
OUT_PATH = os.path.join(BASE_AA_PATH, 'protein_annotation')
# makedirs(exist_ok=True) is safe on re-runs and also creates missing parent
# folders; the previous exists()/mkdir() pair raised when BASE_AA_PATH was absent.
os.makedirs(OUT_PATH, exist_ok=True)

In [3]:
# Genome being processed; the suffix ('p_ctg' or 'h_ctg') selects the ID renaming scheme later on.
p_genome = 'Pst_104E_v12_p_ctg'
# Per-genome output folder below OUT_PATH.
OUT_PATH_P = os.path.join(OUT_PATH, p_genome)
# makedirs(exist_ok=True) is idempotent on re-runs and does not fail if the
# parent folder has not been created yet.
os.makedirs(OUT_PATH_P, exist_ok=True)

In [4]:
# Column names for the InterProScan TSV, in file order (the file itself has no header row).
interpro_header = [
    'Protein_ID',
    'MD5',
    'Length',
    'DB',
    'DB_accession',
    'DB_description',
    'Start_position',
    'Stop_position',
    'e-value',
    'Match Status',
    'date',
    'InterPro_ID',
    'InterPro_description',
    'GO_terms',
    'Pathway_IDs',
]

In [5]:
# Load the InterProScan table; it is tab-separated without a header row,
# so the column names are supplied from interpro_header.
interpro_df = pd.read_csv(
    os.path.join(BASE_FOLDER, INTERPRO_TSV_FILE),
    sep='\t',
    header=None,
    names=interpro_header,
)

In [10]:
# Rename protein IDs depending on whether the genome is the primary (p) or the
# haplotig (h) assembly. The contig number (and, for h, the haplotig number)
# plus the gene number are captured from the original ID and re-assembled.
# - expand=True is passed explicitly so str.extract always returns a DataFrame
#   with one column per capture group.
# - The dot in front of the gene number is escaped so it only matches a literal '.'.
if p_genome.endswith('p_ctg'):
    rename_df = interpro_df.Protein_ID.str.extract(r'000([0-9]*)F_quiver\.([0-9]*)', expand=True)
    rename_df['Updated_Protein_ID'] = 'evm.model.pcontig_' + rename_df[0] + '.' + rename_df[1]
elif p_genome.endswith('h_ctg'):
    rename_df = interpro_df.Protein_ID.str.extract(r'000([0-9]*)F_([0-9]*)_quiver\.([0-9]*)', expand=True)
    # NOTE(review): the haplotig branch also uses the 'pcontig_' prefix - confirm this is intended.
    rename_df['Updated_Protein_ID'] = 'evm.model.pcontig_' + rename_df[0] + '_' + rename_df[1] +'.' + rename_df[2]
else:
    # Without this branch an unexpected genome name surfaced as a NameError on rename_df.
    raise ValueError("p_genome must end with 'p_ctg' or 'h_ctg', got %r" % (p_genome,))
# IDs that do not match the pattern end up as NaN in the new column.
interpro_df['Updated_Protein_ID'] = rename_df['Updated_Protein_ID']

  app.launch_new_instance()


In [11]:
# Preview the first five rows, including the new Updated_Protein_ID column.
interpro_df.head(n=5)

Unnamed: 0,Protein_ID,MD5,Length,DB,DB_accession,DB_description,Start_position,Stop_position,e-value,Match Status,date,InterPro_ID,InterPro_description,GO_terms,Pathway_IDs,Updated_Protein_ID
0,evm.model.000004F_quiver.189,299f11c027c8d3d5760103bf57d10fdc,350,Gene3D,G3DSA:3.40.50.720,,9,197,2.4e-71,T,09-11-2016,IPR016040,NAD(P)-binding domain,0,,evm.model.pcontig_004.189
1,evm.model.000004F_quiver.189,299f11c027c8d3d5760103bf57d10fdc,350,Pfam,PF07479,NAD-dependent glycerol-3-phosphate dehydrogena...,201,346,1.3e-42,T,09-11-2016,IPR006109,"Glycerol-3-phosphate dehydrogenase, NAD-depend...",GO:0004367|GO:0005975|GO:0055114,KEGG: 00564+1.1.1.94|MetaCyc: PWY-5667|MetaCyc...,evm.model.pcontig_004.189
2,evm.model.000004F_quiver.189,299f11c027c8d3d5760103bf57d10fdc,350,PANTHER,PTHR11728:SF8,,1,346,1e-182,T,09-11-2016,,,0,,evm.model.pcontig_004.189
3,evm.model.000004F_quiver.189,299f11c027c8d3d5760103bf57d10fdc,350,Gene3D,G3DSA:1.10.1040.10,,201,350,2.7e-53,T,09-11-2016,IPR013328,"6-phosphogluconate dehydrogenase, domain 2",GO:0016491|GO:0055114,,evm.model.pcontig_004.189
4,evm.model.000004F_quiver.189,299f11c027c8d3d5760103bf57d10fdc,350,PRINTS,PR00077,NAD-dependent glycerol-3-phosphate dehydrogena...,76,103,1.0000000000000001e-67,T,09-11-2016,IPR006168,"Glycerol-3-phosphate dehydrogenase, NAD-dependent",GO:0004367|GO:0006072|GO:0009331|GO:0055114,KEGG: 00564+1.1.1.94|MetaCyc: PWY-5667|MetaCyc...,evm.model.pcontig_004.189


In [12]:
# Mark hits without GO annotation with the placeholder 0 (filtered out again below).
# Assign the result back instead of calling fillna(inplace=True) on the column:
# the in-place call acts on an intermediate object and does not reliably update
# interpro_df (it is a no-op under pandas Copy-on-Write).
interpro_df['GO_terms'] = interpro_df['GO_terms'].fillna(0)

In [13]:
# Group all InterProScan hits by the renamed protein ID.
interpro_by_protein = interpro_df.groupby(by='Updated_Protein_ID')

In [14]:
# Collapse the GO_terms entries of each protein into a set (one set per protein ID).
interpro_by_protein_GO = interpro_by_protein['GO_terms'].apply(set)

In [15]:
# Drop proteins without any GO term, i.e. those whose set holds only the placeholder 0.
only_placeholder = interpro_by_protein_GO == {0}
interpro_by_protein_GO = interpro_by_protein_GO[~only_placeholder]

In [16]:
# Map each protein ID to its set of raw GO_terms entries.
interpro_by_protein_GO_dict = {
    protein_id: go_entries
    for protein_id, go_entries in zip(interpro_by_protein_GO.index, interpro_by_protein_GO)
}

In [17]:
# Flatten the per-protein GO sets into two parallel lists:
# ALL_PROTEIN_INDEX_LIST[i] is the protein ID that ALL_GO_LIST[i] belongs to.
ALL_GO_LIST = []
ALL_PROTEIN_INDEX_LIST = []
for key, go_entries in interpro_by_protein_GO_dict.items():
    new_value = set()
    for entry in go_entries:
        # Skip the placeholder 0 and any other non-string value (e.g. NaN),
        # which would otherwise fail on the string operations below.
        if not isinstance(entry, str):
            continue
        # One entry may hold several terms joined by '|'; split() also
        # returns the entry unchanged when there is no separator.
        new_value.update(entry.split('|'))
    # Sort the unique terms so the output order is the same on every run
    # (plain set iteration order of strings is not).
    new_value = sorted(new_value)
    ALL_GO_LIST += new_value
    ALL_PROTEIN_INDEX_LIST += [key] * len(new_value)


In [19]:
# Two-column table: column 0 is the protein ID, column 1 one GO term of that protein.
protein_go_pairs = list(zip(ALL_PROTEIN_INDEX_LIST, ALL_GO_LIST))
GO_df = pd.DataFrame(protein_go_pairs, columns=range(2))

In [21]:
# Write the protein/GO-term pairs as tab-separated text, without header row or index column.
go_out_file = os.path.join(OUT_PATH_P, 'GO_terms_interpro.tab')
GO_df.to_csv(go_out_file, sep='\t', header=False, index=False)