In [1]:
viruses = '''AG−891−A17

AG−892−P18

AG−893−J23

AG−894−C07

AG−895−P08

AG−897−A15  

AG−903−F19

AG−903−I06

AG−904−O13

AG−907−C19

AG−907−I10

AG−908−F15

AG-909-A05

AG−910−E05

AG−912−O18

AG−913−C05

AG−913−C17'''.replace("−","-").split()

In [2]:
import os
import os.path as op
import pandas as pd
import numpy as np
from nb_tools import write_fa_record, readfa, run_mica
import gzip
from collections import Counter

gzopen = lambda i: gzip.open(i, 'r') if i.endswith(".gz") else open(i)

In [3]:
wd = "./outputs/vir_tests/"

In [4]:
anns = [op.join("/mnt/scgc/simon/simonsproject/bats248_annotations/gff/{i}.gff".format(i=i)) for i in viruses]

In [5]:
ofa = "./outputs/vir_tests/mp_allorfs.faa"
cfa = "./outputs/vir_tests/mp_allorfs_cdhit9.faa"
clstr = './outputs/vir_tests/mp_allorfs_cdhit9.faa.clstr'

In [3]:
micaout = './outputs/vir_tests/mp_allorfs_cdhit9_mica.out'

In [10]:
def id_virus_orfs(micaout, 
                  keywords1 = 'phage,virus,prophage,terminase,t4-like,lambda-like,mu-like,capsid,tail,fiber,lambdoid,portal,tail,virion,lysis,podovirus,podo-like,head,baseplate,myovirus,siphovirus,structural', 
                  keywords2 = 'integrase,transposase'):
    ''' takes mica output and returns dataframe with 1 if hits had viral signal, 0 if not
    Args:
        micaout (path): path to mica output file
        keywords1 (string): comma separated list of keywords to search for to confirm that orf is viral
        keywords2 (string): comma separated list of secondary keywords to search for to confirm that orf is viral
    Returns:
        pandas DataFrame of ORF classifications, 1 if viral, 0 if not (p1 for keywords1, p2 for kewords2)
    
    '''
    keywords1 = keywords1.split(",")
    keywords2 = keywords2.split(",")
    
    def search_phrases(string, keylist):
        for i, k in enumerate(keylist):
            if k in string:
                return True
            else:
                if i == len(keylist)-1:
                    return False
                else:
                    continue
    cnames = "qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen salltitles".split()
    df = pd.read_csv(micaout, sep="\t", names=cnames)

    p1_true = df[[search_phrases(i, keywords1) for i in df['salltitles']]]['qseqid'].unique()
    p2_true = df[[search_phrases(i, keywords2) for i in df['salltitles']]]['qseqid'].unique()
    qseqs = df['qseqid'].unique()

    newdf = pd.DataFrame(data = {'orf':qseqs,
                                'p1':[1 if q in p1_true else 0 for q in list(qseqs)],
                                'p2':[1 if q in p2_true else 0 for q in list(qseqs)]})
    return newdf

In [11]:
mica_phage_ids = id_virus_orfs(micaout)

In [12]:
mica_phage_ids

Unnamed: 0,orf,p1,p2
0,AG-891-A17_00001,0,0
1,AG-891-A17_00002,0,0
2,AG-891-A17_00003,0,0
3,AG-891-A17_00004,0,0
4,AG-891-A17_00005,0,0
5,AG-891-A17_00006,0,0
6,AG-891-A17_00007,0,0
7,AG-891-A17_00008,0,0
8,AG-891-A17_00009,0,0
9,AG-891-A17_00010,0,0


In [10]:
print(Counter(mica_phage_ids['p1']))
print(Counter(mica_phage_ids['p2']))

Counter({0: 8435, 1: 960})
Counter({0: 9376, 1: 19})


Good.

In [11]:
from nb_tools import cluster_map

In [12]:
cm = cluster_map(clstr)

In [13]:
cm_swap = {}

for c in cm: 
    for k in cm[c]: cm_swap[k] = c

In [14]:
cm_swap

{'AG-907-I10_00217': 'AG-892-P18_00259',
 'AG-909-A05_00329': 'AG-892-P18_00022',
 'AG-904-O13_00759': 'AG-895-P08_00425',
 'AG-904-O13_00703': 'AG-895-P08_00133',
 'AG-904-O13_01022': 'AG-895-P08_00729',
 'AG-904-O13_00629': 'AG-895-P08_00354',
 'AG-904-O13_00190': 'AG-895-P08_00297',
 'AG-903-F19_00353': 'AG-891-A17_00076',
 'AG-897-A15_00314': 'AG-893-J23_01205',
 'AG-904-O13_00194': 'AG-895-P08_00301',
 'AG-903-F19_00282': 'AG-891-A17_00340',
 'AG-907-I10_00029': 'AG-892-P18_00310',
 'AG-909-A05_00232': 'AG-892-P18_00457',
 'AG-909-A05_00887': 'AG-892-P18_00341',
 'AG-907-I10_00135': 'AG-892-P18_00131',
 'AG-907-I10_00183': 'AG-892-P18_00084',
 'AG-904-O13_00868': 'AG-891-A17_00701',
 'AG-904-O13_00472': 'AG-895-P08_00104',
 'AG-897-A15_00323': 'AG-893-J23_01271',
 'AG-897-A15_00320': 'AG-893-J23_01503',
 'AG-904-O13_00851': 'AG-891-A17_00476',
 'AG-897-A15_00249': 'AG-893-J23_00435',
 'AG-904-O13_00706': 'AG-895-P08_00130',
 'AG-909-A05_00832': 'AG-892-P18_00683',
 'AG-897-A15_000

In [15]:
names = ['contig','version','type','start','stop', 'dot','strand','val','comments']
adf = pd.read_csv(anns[0], comment='#', sep="\t", names=names)

In [16]:
with open(anns[0]) as ih:
    contigs = []
    orfs = []
    for l in ih:
        if l.startswith("#"): continue
        if len(l.split("\t")) != 9: continue
        contigs.append(l.split("\t")[0])
        orfs.append(l.split("\t")[-1].split(";")[0].replace("ID=",''))
adf = pd.DataFrame(data={'contig':contigs,'orf_id':orfs})
            

In [17]:
p1s = []
p2s = []
refs = []
for i,l in adf.iterrows():
    if l['orf_id'] in list(mica_phage_ids['orf']):
        rep = l['orf_id']
        ref = 1
    else:
        rep = cm_swap.get(l['orf_id'], None)
        ref = 2

    if rep is not None:
        mp_ids = mica_phage_ids[mica_phage_ids['orf'] == rep]
        p1 = mp_ids['p1'].values[0]
        p2 = mp_ids['p2'].values[0]

    else:
        mp_ids = pd.Series()
        p1 = 0
        p2=0
        ref = 1
        
    p1s.append(p1)
    p2s.append(p2)
    refs.append(ref)

In [18]:
jdf = pd.concat([adf, pd.DataFrame(data={'p1':p1s, 'p2':p2s, 'ref':refs})], axis=1)
contig_sum = pd.concat([jdf.groupby(['contig'])['p1'].sum(), jdf.groupby(['contig'])['p2'].sum(), jdf.groupby(['contig'])['orf_id'].count()], axis=1)

In [19]:
contig_sum

Unnamed: 0_level_0,p1,p2,orf_id
contig,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AG-891-A17_NODE_1,3,0,166
AG-891-A17_NODE_10,0,0,38
AG-891-A17_NODE_11,0,0,20
AG-891-A17_NODE_12,0,0,22
AG-891-A17_NODE_13,0,0,29
AG-891-A17_NODE_14,0,0,22
AG-891-A17_NODE_15,0,0,13
AG-891-A17_NODE_16,0,0,14
AG-891-A17_NODE_17,0,0,11
AG-891-A17_NODE_18,0,0,14
