In [2]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import os.path as op
import os
import glob
import itertools
from nb_tools import id_virus_orfs, readfa, swap_cluster_map

In [7]:
# Input locations for this analysis (absolute paths on the cluster filesystem).
# micas: every gzipped ``*.out.gz`` result file in the blastout directory.
micas = glob.glob("/mnt/scgc/simon/simonsproject/bats248_vs/blastout/*.out.gz")
# clstr: cluster file parsed by map_clstr_raw below (presumably CD-HIT output at 90% identity, per the file name).
clstr = "/mnt/scgc/simon/simonsproject/bats248_annotations/bats248_all_orfs_cdhit_c90.fasta.clstr"
# input_fa: fasta of all ORFs; NOTE(review): not referenced again in the visible cells.
input_fa = "/mnt/scgc/simon/simonsproject/bats248_annotations/bats248_all_orfs.fasta"

In [8]:
len(micas)

812

Construct one large data table that includes all ORF assignments.

In [12]:
# One table per result file, in the same order as `micas`.
tbls = [id_virus_orfs(mica_path) for mica_path in micas]

In [14]:
cluster_calls = pd.concat(tbls)
cluster_calls.to_csv("/mnt/scgc/simon/simonsproject/bats248_vs/phage_hits.csv", index=False)

Now calculate the number of phage genes per contig per genome...

In [3]:
# Identifiers of the SAGs to start with, one per line. Most IDs in the pasted
# text use a non-ASCII minus character instead of an ASCII hyphen, so that
# character is normalized to "-" first; split() then breaks on whitespace and
# discards the stray trailing spaces.
to_start =  '''AG−891−A17
AG−892−P18
AG−893−J23
AG−894−C07
AG−895−P08
AG−897−A15  
AG−903−F19
AG−903−I06
AG−904−O13
AG−907−C19
AG−907−I10
AG−908−F15
AG-909-A05
AG−910−E05
AG−912−O18
AG−913−C05
AG−913−C17'''.replace("−","-").split()

In [13]:
!head {clstr}

>Cluster 0
0	4505aa, >AG-891-C06_00787... at 1:4505:8885:13389/97.54%
1	2378aa, >AG-895-B10_00908... at 1:2378:11012:13389/96.38%
2	694aa, >AG-895-B10_01107... at 1:694:7969:8662/96.97%
3	2756aa, >AG-897-L14_00730... at 1:2756:10634:13389/99.38%
4	2537aa, >AG-907-L11_00957... at 1:2537:10853:13389/98.74%
5	13389aa, >AG-908-B04_00191... *
>Cluster 1
0	8694aa, >AG-913-A08_00038... *
>Cluster 2


In [9]:
def map_clstr_raw(clstr, singles=False):
    '''Parse a CD-HIT style ``.clstr`` file into {representative: [members]}.

    Lines starting with ">" are cluster headers. Within a cluster, the line
    containing "*" names the representative sequence; every other line names
    a member. Blank lines are ignored.

    Args:
        clstr (path): path to the ``.clstr`` file
        singles (bool): if True, also keep clusters whose representative has
            no other members (stored with an empty member list); if False,
            such clusters are dropped
    Returns:
        collections.defaultdict(list): representative id -> list of member ids
        (the representative itself is not included in its member list)
    '''
    def _seq_id(line):
        # "0\t4505aa, >AG-891-C06_00787... at ..." -> "AG-891-C06_00787"
        return line.split(",")[1].split("...")[0].replace(">", '').replace(" ", "")

    cluster_map = defaultdict(list)
    with open(clstr) as fh:
        # groupby alternates between runs of header lines and runs of entry lines.
        for is_header, group in itertools.groupby(fh, lambda l: l.startswith('>')):
            if is_header:
                continue

            rep_seq = ''
            members = []
            for line in group:
                if not line.strip():
                    # Blank lines (e.g. at the end of the file) carry no entry
                    # and would otherwise break the comma split in _seq_id.
                    continue
                if "*" in line:
                    rep_seq = _seq_id(line)
                else:
                    members.append(_seq_id(line))

            # A group without a representative cannot be keyed; skip it.
            if not rep_seq:
                continue

            if singles or members:
                cluster_map[rep_seq] = members
    return cluster_map

In [11]:
cmap_raw = map_clstr_raw(clstr)

In [12]:
len(cmap_raw)

369008

In [13]:
scm = swap_cluster_map(cmap_raw)

In [15]:
len(scm)

3536875

In [30]:
sag = to_start[0]

In [34]:
sag
gff = "/mnt/scgc/simon/simonsproject/bats248_annotations/gff/{}.gff".format(sag)
faa = "/mnt/scgc/simon/simonsproject/bats248_annotations/faa/{}.faa".format(sag)

In [37]:
from nb_tools import orf_map

In [38]:
om = orf_map(gff)

In [43]:
om['lookup'] = [scm.get(i, i) for i in om['id']]

In [44]:
phage_counts = pd.read_csv("/mnt/scgc/simon/simonsproject/bats248_vs/phage_hits.csv")

In [45]:
len(phage_counts)

720345

In [49]:
sag_pcounts = pd.merge(phage_counts, om, left_on='orf', right_on='lookup', how='right').fillna(0)

In [50]:
omp = sag_pcounts[['contig','id','p1','p2','len']]

In [54]:
from recruitment_for_vs import summarize_by_contig

In [70]:
# Contig-level summaries of the 'p1' and 'p2' columns (see summarize_by_contig).
p1, p2 = [summarize_by_contig(omp, col) for col in ('p1', 'p2')]
# Number of ORF ids recorded for each contig in the ORF map.
orf_count = om.groupby('contig')['id'].count().to_frame('total_orfs')

In [71]:
csum = pd.concat([p1, p2, orf_count], axis=1)

In [72]:
csum['viral_phage_gene_fraction'] = csum['p1'] / csum['total_orfs']
csum['viral2_phage_gene_fraction'] = csum['p2'] / csum['total_orfs']

In [73]:
csum

Unnamed: 0_level_0,p1,p2,total_orfs,viral_phage_gene_fraction,viral2_phage_gene_fraction
contig,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AG-891-A17_NODE_1,3.0,0.0,166,0.018072,0.0
AG-891-A17_NODE_10,0.0,0.0,38,0.0,0.0
AG-891-A17_NODE_11,0.0,0.0,20,0.0,0.0
AG-891-A17_NODE_12,0.0,0.0,22,0.0,0.0
AG-891-A17_NODE_13,0.0,0.0,29,0.0,0.0
AG-891-A17_NODE_14,0.0,0.0,22,0.0,0.0
AG-891-A17_NODE_15,0.0,0.0,13,0.0,0.0
AG-891-A17_NODE_16,0.0,0.0,14,0.0,0.0
AG-891-A17_NODE_17,0.0,0.0,11,0.0,0.0
AG-891-A17_NODE_18,0.0,0.0,14,0.0,0.0


The number of ORF calls differs between the Prodigal version that VS uses and the Prodigal that Prokka uses. The per-contig counts are very similar but, curiously, not identical (e.g., AG-891-A17_NODE_10 has 38 ORFs in the table above but 35 in the VS Prodigal gene calls below).

In [82]:
old_genes = "/mnt/scgc/simon/simonsproject/jb_vs_test/AG-891/AG-891-A17/prodigal/AG-891-A17_contigs_genes.fasta"

In [81]:
!ls {old_vs}

AG-891-A17_contigs.gbk		   AG-891-A17_contigs_proteins.fasta.fai
AG-891-A17_contigs_genes.fasta	   AG-891-A17_contigs.scores
AG-891-A17_contigs_proteins.fasta


In [83]:
# Collect the header of every record in the old gene calls. The context
# manager closes the file handle once readfa has been fully consumed.
with open(old_genes) as fh:
    names = [name for name, _seq in readfa(fh)]

In [87]:
names = ["_".join(i.split()[0].split("_")[:-1]) for i in names]

In [89]:
Counter(names)

Counter({'AG-891-A17_NODE_1': 166,
         'AG-891-A17_NODE_10': 35,
         'AG-891-A17_NODE_11': 21,
         'AG-891-A17_NODE_12': 23,
         'AG-891-A17_NODE_13': 30,
         'AG-891-A17_NODE_14': 22,
         'AG-891-A17_NODE_15': 14,
         'AG-891-A17_NODE_16': 15,
         'AG-891-A17_NODE_17': 13,
         'AG-891-A17_NODE_18': 15,
         'AG-891-A17_NODE_19': 8,
         'AG-891-A17_NODE_2': 103,
         'AG-891-A17_NODE_20': 9,
         'AG-891-A17_NODE_21': 9,
         'AG-891-A17_NODE_22': 10,
         'AG-891-A17_NODE_23': 11,
         'AG-891-A17_NODE_24': 9,
         'AG-891-A17_NODE_25': 11,
         'AG-891-A17_NODE_26': 5,
         'AG-891-A17_NODE_27': 4,
         'AG-891-A17_NODE_28': 4,
         'AG-891-A17_NODE_29': 8,
         'AG-891-A17_NODE_3': 78,
         'AG-891-A17_NODE_30': 6,
         'AG-891-A17_NODE_31': 4,
         'AG-891-A17_NODE_32': 4,
         'AG-891-A17_NODE_33': 6,
         'AG-891-A17_NODE_34': 5,
         'AG-891-A17_NODE_35': 7,


In [99]:
from phage_count_table import phage_contig_table

In [92]:
gffs = glob.glob("/mnt/scgc/simon/simonsproject/bats248_annotations/gff/*.gff")

In [93]:
len(gffs)

6224

In [101]:
def phage_contig_table(clstr_map, gff, phage_hits_df, outfile=None):
    '''Build a per-contig summary of phage hits for the ORFs listed in one gff.

    ORF ids from the gff are translated through the swapped cluster map (ids
    missing from the map are used as-is) so that hits recorded against cd-hit
    cluster seeds can be mapped back to ORFs from an individual genome.

    Args:
        clstr_map (dict): cluster map; swap_cluster_map is applied to it inside
            this function, so pass the un-swapped map
        gff (path): path to gff output file for mapping orfs back to contigs
        phage_hits_df (pandas.DataFrame): phage hits dataframe from summary of
            all Mica results; joined on its 'orf' column
        outfile (path): where to write the output table as csv; nothing is
            written when None
    Returns:
        pandas.DataFrame with the 'p1' and 'p2' summaries, 'total_orfs',
        'viral_phage_gene_fraction' and 'viral2_phage_gene_fraction'
    '''
    swapped = swap_cluster_map(clstr_map)
    orfs = orf_map(gff)
    orfs['lookup'] = [swapped.get(orf_id, orf_id) for orf_id in orfs['id']]

    # Right join keeps every ORF from the gff; ORFs without a hit get 0s.
    merged = pd.merge(phage_hits_df, orfs, left_on='orf', right_on='lookup', how='right')
    hits = merged.fillna(0)[['contig', 'id', 'p1', 'p2', 'len']]

    total_orfs = orfs.groupby('contig')['id'].count().to_frame('total_orfs')
    per_column = [summarize_by_contig(hits, col) for col in ('p1', 'p2')]
    summary = pd.concat(per_column + [total_orfs], axis=1)

    summary['viral_phage_gene_fraction'] = summary['p1'] / summary['total_orfs']
    summary['viral2_phage_gene_fraction'] = summary['p2'] / summary['total_orfs']

    if outfile is not None:
        summary.to_csv(outfile)

    return summary

In [102]:
# to_csv does not create missing directories, so make sure ./outputs exists.
if not op.isdir("./outputs"):
    os.makedirs("./outputs")

for g in gffs[:10]:
    sag = op.basename(g).split('.')[0]
    out_tbl = "./outputs/{}_phage_counts.csv".format(sag)
    # phage_contig_table applies swap_cluster_map to its first argument, so it
    # must receive the raw cluster map; passing the already-swapped `scm`
    # would swap it a second time.
    phage_contig_table(cmap_raw, g, phage_counts, out_tbl)
