In [2]:
import pandas as pd
import os
import os.path as op
from nb_tools import alignment_coverage, create_fasta_index, diamond_view, cluster_map
import gzip

In [3]:
viruses = '''AG−891−A17
AG−892−P18
AG−893−J23
AG−894−C07
AG−895−P08
AG−897−A15  
AG−903−F19
AG−903−I06
AG−904−O13
AG−907−C19
AG−907−I10
AG−908−F15
AG-909-A05
AG−910−E05
AG−912−O18
AG−913−C05
AG−913−C17'''.replace("−","-").split()

In [4]:
linep = "./outputs/vir_tests/mp_allorfs_cdhit90-vs-linep.daa"
pov = "./outputs/vir_tests/mp_allorfs_cdhit90-vs-pov.daa"
fasta = "./outputs/vir_tests/mp_allorfs.faa"

In [5]:
fasta_index = create_fasta_index(fasta)

In [10]:
lpcov = alignment_coverage(linep, fasta_index, "./outputs/vir_tests/linep_vs_mp_allorfs.cov.gz")

In [18]:
!mv {lpcov} ./outputs/vir_tests/linep_vs_mp_allorfs.cov.gz

In [69]:
def mean_coverage(daa, fasta_index, out_cov, prefix = ''):
    if out_cov.endswith(".gz") == False:
        out_cov = "{}.gz".format(out_cov)
    
    if op.exists(out_cov):
        cov = out_cov
    else:
        cov = alignment_coverage(daa, fasta_index, out_cov)
    
    df = pd.read_csv(cov, sep="\t", names=['orf','position','coverage'])
    cov_per_orf = pd.Series(df.groupby('orf')['coverage'].mean(), name='mean_orf_cov_{}'.format(prefix))
    # len_per_orf = pd.Series(df.groupby('orf')['position'].max(), name='orf_len')
    #return pd.concat([cov_per_orf, len_per_orf], axis=1).reset_index()
    return cov_per_orf

In [73]:
cpo = mean_coverage(linep, fasta_index, "./outputs/vir_tests/linep_vs_mp_allorfs.cov.gz", 'linep')

In [117]:
ppo = mean_coverage(pov, fasta_index, "./outputs/vir_tests/pov_vs_mp_allorfs.cov.gz",'pov')

In [39]:
clstr = "./outputs/vir_tests/mp_allorfs_cdhit9.faa.clstr"

cm = cluster_map(clstr, singles=False)

In [75]:
for i,c in enumerate(cm):
    vals = cpo[c]
    print(vals)
    if i > 10:
        break

730.922053232
156.528023599
317.798969072
866.705882353
1228.48913043
13.8625954198
980.907444668
5.92070484581
42.1269035533
389.354609929
623.527446301
0.526315789474


In [11]:
gff = "/mnt/scgc/simon/simonsproject/bats248_annotations/gff/{}.gff".format(viruses[0])

In [37]:
def orf_map(gff):
    gdf = pd.read_csv(gff, comment='#', sep="\t", names = ['contig','app','type','start','stop','dot','strand','val','notes']).dropna()
    gdf['id'] = [i.split(";")[0].replace("ID=",'') for i in gdf['notes']]
    gdf['len'] = gdf['stop'] - gdf['start']
    return gdf[['contig','id','len']]

In [38]:
gdf = orf_map(gff)

In [41]:
def swap_cluster_map(cm):
    cm_swap = {}

    for c in cm: 
        for k in cm[c]: cm_swap[k] = c
    return cm_swap

In [42]:
cm_swap = swap_cluster_map(cm)

In [121]:
mean_cov_linep = []
mean_cov_pov = []

for i, l in gdf.iterrows():
    val = cpo.get([cm_swap.get(l['id'], l['id'])],0)
    pov_val = ppo.get([cm_swap.get(l['id'], l['id'])],0)
    mean_cov_linep.append(val[0])
    mean_cov_pov.append(pov_val[0])

gdf['linep_cov'] = mean_cov_linep
gdf['pov_cov'] = mean_cov_pov

In [122]:
gdf

Unnamed: 0,contig,id,len,linep_cov,pov_cov
0,AG-891-A17_NODE_1,AG-891-A17_00001,1358.0,682.909292,5.657080
1,AG-891-A17_NODE_1,AG-891-A17_00002,824.0,526.981752,1.277372
2,AG-891-A17_NODE_1,AG-891-A17_00003,917.0,693.488525,5.813115
3,AG-891-A17_NODE_1,AG-891-A17_00004,563.0,553.326203,0.973262
4,AG-891-A17_NODE_1,AG-891-A17_00005,716.0,629.172269,2.033613
5,AG-891-A17_NODE_1,AG-891-A17_00006,905.0,108.598007,1.744186
6,AG-891-A17_NODE_1,AG-891-A17_00007,176.0,103.896552,1.896552
7,AG-891-A17_NODE_1,AG-891-A17_00008,1328.0,168.613122,2.244344
8,AG-891-A17_NODE_1,AG-891-A17_00009,1361.0,502.690949,4.454746
9,AG-891-A17_NODE_1,AG-891-A17_00010,593.0,453.954315,2.467005


I think what I need to use next is something similar to Ben's 'compute_fr' function in graphsignals
```
Description:
#' Summarize abundance
#'
#' "Calculate the total number of hits per contig, and normalize that number to 
#' the number of reads in the metagenome, and the length of the contig 
#' (just as you did with Brandon in Swan et al., 2013)."
#'
#' 1. Contig names are defined in the names in the fasta file
#' > names(F)                                  
#' [1] "AAA164A08_contig00001_length59851" "AAAt64A08_contig00002_length17769" "AAA164A08_contig00003_length2465" 
#' [4] "AAA164A08_contig00004_length2005"      
#'                                             
#' 2. Number of  hits per contig are.. in SIM_TABLE        
#'                                             
#' 3. Number of reads in the metagenomes are... in [Similarity_N] under readsn=...
#'                                             
#' 4. Length of the contig is the number of characters in each contig in the fasta file
#' > len <- sapply(F, nchar)                   
#' > len                                       
#' AAA164A08_contig00001_length59851 AAA164A08_contig00002_length17769  AAA164A08_contig00003_length2465 
#'                             59851                            17769                              2465 
#'  AAA164A08_contig00004_length2005           
#'                              2005           
#'
#' 
#'                                   contig_len reads_POV reads_LineP hit_POV hit_LineP
#' AAA164A08_contig00001_length59851      59851   5922080     8279226   19431     10019
#' AAA164A08_contig00002_length17769      17769   5922080     8279226     795       299
#' AAA164A08_contig00003_length2465        2465   5922080     8279226     273        14
#' AAA164A08_contig00004_length2005        2005   5922080     8279226     573        26
#'
#' @param master the names of the contigs that must be present by row
#' @param sim_table the similarity table
#' @param cfg the configuration
#' @param mult the multipliers for the fraction
#' @param a matrix of fr ratios
```  

SIM table: Where/how does he construct that? ... It looks like it's from the tsvs and not the pileups.

read_sim function in graphsignals... default for function is to select the best hit... but does this make sense for these data?

In [11]:
linep_tsv = diamond_view(linep, "./outputs/vir_tests/linep_vs_mp_allorfs.tsv.gz")
pov_tsv = diamond_view(pov, "./outputs/vir_tests/pov_vs_mp_allorfs.tsv.gz")

In [12]:
!ls ./outputs/vir_tests/

linep_vs_mp_allorfs.cov.gz	 mp_allorfs_cdhit9_mica.out
linep_vs_mp_allorfs.tsv.gz	 mp_allorfs.faa
mp_allorfs_cdhit90-vs-linep.daa  mp_allorfs.faa.fai
mp_allorfs_cdhit90-vs-pov.daa	 pov_vs_mp_allorfs.cov.gz
mp_allorfs_cdhit9.faa		 pov_vs_mp_allorfs.tsv.gz
mp_allorfs_cdhit9.faa.clstr


In [1]:
def identity_filter(df, pctid=50.0, best_hit=True):
    df = df[df['pident'] >= pctid]
    df = df.sort_values(by=['qseqid', 'length','bitscore'], ascending=False).drop_duplicates(subset='qseqid', keep='first')
    return df

In [20]:
cnames = "qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore".split()
lpdf = identity_filter(pd.read_csv(linep_tsv, sep="\t", names=cnames), pctid=50)
povdf = identity_filter(pd.read_csv(pov_tsv, sep="\t", names=cnames), pctid=50)

In [24]:
lpsum = pd.Series(lpdf.groupby('sseqid')['qseqid'].count(), name='hit_LineP')
povsum = pd.Series(povdf.groupby('sseqid')['qseqid'].count(), name='hit_POV')

In [32]:
hits = pd.concat([lpsum, povsum], axis=1).reset_index().rename(columns={'index':'orf'})
len(hits)

8389

In [34]:
hits

Unnamed: 0,orf,hit_LineP,hit_POV,reads_POV,reads_LineP
0,AG-891-A17_00001,2099.0,42.0,5922080,8279226
1,AG-891-A17_00002,1160.0,6.0,5922080,8279226
2,AG-891-A17_00003,1592.0,29.0,5922080,8279226
3,AG-891-A17_00004,902.0,3.0,5922080,8279226
4,AG-891-A17_00005,1231.0,10.0,5922080,8279226
5,AG-891-A17_00006,236.0,9.0,5922080,8279226
6,AG-891-A17_00007,122.0,2.0,5922080,8279226
7,AG-891-A17_00008,643.0,18.0,5922080,8279226
8,AG-891-A17_00009,1612.0,32.0,5922080,8279226
9,AG-891-A17_00010,758.0,7.0,5922080,8279226


That was recruitment per ORF... now dig into one SAG to get recruitment per contig...

In [112]:
hits_linep = []
hits_pov = []

for i, l in gdf.iterrows():
    search_id = cm_swap.get(l['id'], l['id'])
    try:
        linep = hits[hits['orf'] == search_id]['hit_LineP'].values[0]
    except:
        linep=0
    try:
        pov = hits[hits['orf'] == search_id]['hit_POV'].values[0]
    except:
        pov = 0
        
    try:  
        hits_linep.append(linep)
        hits_pov.append(pov)
    except:
        print(linep)
        print(pov)

gdf['hit_LineP'] = hits_linep
gdf['hit_POV'] = hits_pov

In [113]:
orf_rec = gdf

In [114]:
orf_rec[:10]

Unnamed: 0,contig,id,len,hit_LineP,hit_POV
0,AG-891-A17_NODE_1,AG-891-A17_00001,1358.0,2099.0,42.0
1,AG-891-A17_NODE_1,AG-891-A17_00002,824.0,1160.0,6.0
2,AG-891-A17_NODE_1,AG-891-A17_00003,917.0,1592.0,29.0
3,AG-891-A17_NODE_1,AG-891-A17_00004,563.0,902.0,3.0
4,AG-891-A17_NODE_1,AG-891-A17_00005,716.0,1231.0,10.0
5,AG-891-A17_NODE_1,AG-891-A17_00006,905.0,236.0,9.0
6,AG-891-A17_NODE_1,AG-891-A17_00007,176.0,122.0,2.0
7,AG-891-A17_NODE_1,AG-891-A17_00008,1328.0,643.0,18.0
8,AG-891-A17_NODE_1,AG-891-A17_00009,1361.0,1612.0,32.0
9,AG-891-A17_NODE_1,AG-891-A17_00010,593.0,758.0,7.0


In [115]:
def summarize_by_contig(df, hitscol):
    return pd.Series(df.groupby('contig')[hitscol].sum(), name=hitscol)

In [116]:
bcdf = pd.concat([summarize_by_contig(orf_rec, 'hit_LineP'), summarize_by_contig(orf_rec, 'hit_POV')], axis=1)
bcdf['reads_POV'] = 5922080
bcdf['reads_LineP'] = 8279226

OK, now we've hypothetically got a SIM table similar to Ben's for SAG AG-891-A17... now to calculate the fr

```
#' Summarize abundance
#'
#' "Calculate the total number of hits per contig, and normalize that number to 
#' the number of reads in the metagenome, and the length of the contig 
#' (just as you did with Brandon in Swan et al., 2013)."
#'
#' 1. Contig names are defined in the names in the fasta file
#' > names(F)                                  
#' [1] "AAA164A08_contig00001_length59851" "AAAt64A08_contig00002_length17769" "AAA164A08_contig00003_length2465" 
#' [4] "AAA164A08_contig00004_length2005"      
#'                                             
#' 2. Number of  hits per contig are.. in SIM_TABLE        
#'                                             
#' 3. Number of reads in the metagenomes are... in [Similarity_N] under readsn=...
#'                                             
#' 4. Length of the contig is the number of characters in each contig in the fasta file
#' > len <- sapply(F, nchar)                   
#' > len                                       
#' AAA164A08_contig00001_length59851 AAA164A08_contig00002_length17769  AAA164A08_contig00003_length2465 
#'                             59851                            17769                              2465 
#'  AAA164A08_contig00004_length2005           
#'                              2005           
#'
#' 
#'                                   contig_len reads_POV reads_LineP hit_POV hit_LineP
#' AAA164A08_contig00001_length59851      59851   5922080     8279226   19431     10019
#' AAA164A08_contig00002_length17769      17769   5922080     8279226     795       299
#' AAA164A08_contig00003_length2465        2465   5922080     8279226     273        14
#' AAA164A08_contig00004_length2005        2005   5922080     8279226     573        26
#'
#' @param master the names of the contigs that must be present by row
#' @param sim_table the similarity table
#' @param cfg the configuration
#' @param mult the multipliers for the fraction
#' @param a matrix of fr ratios
compute_fr <- function(master = MASTERNAMES,
   sim_table = SIM_TABLE,
   cfg = CFG, mult = 1e6){
   
   verify_vector <- function(v, master, missing_value = NA){
      ix <- master %in% names(v)
      if (any(!ix)) v[master[!ix]] <- missing_value
      v
   }
   # we may have vectors short of one or more elements in master
   # so we add the missing ones and populated them with NA
   sim_table <- lapply(sim_table, function(x) lapply(x, verify_vector, master=master))
   
   contig_len <- SUMMARY[['contig_length']]
   contig_names <- names(contig_len)
   N <- length(contig_names)
   ABD <- list()

   sim_sets <- names(sim_table)
   for (isim in sim_sets){
      nsim <- length(sim_table[[isim]])
      sim_names <- names(sim_table[[isim]])
      read_names <- paste0("reads", 1:nsim)
      reads <- as.numeric(unlist(cfg[[isim]][read_names]))
      names(reads) <- sim_names
      reads <- lapply(reads, function(x,n) {x <- rep(x,n) ; names(x) <- contig_names; x}, N) 
      hits <- sim_table[[isim]]
      fr <- lapply(sim_names, function(n) {
         data.frame(hits = hits[[n]][contig_names],
            reads = reads[[n]][contig_names],
            fr = hits[[n]][contig_names]/(reads[[n]][contig_names] * contig_len[contig_names]) * mult)
      })
      names(fr) <- sim_names
      ABD[[isim]] <- data.frame( do.call(cbind, fr),
         row.names = contig_names, stringsAsFactors = FALSE)
   }     
   do.call(cbind, ABD)
}
```

In [117]:
tbl = bcdf

def contig_lengths(gff):
    ''' create a dict with contig names as keys and lengths as values from gff file'''
    outdict = {}
    with open(gff) as ih:
        for l in ih:
            if l.startswith("##sequence-region"):
                vec = l.strip().split()
                outdict[vec[1]] = vec[-1]
    return outdict

def compute_fr(tbl, gff, mult=1e6):
    
    clens = contig_lengths(gff)
    tbl['contig_length'] = [float(clens[i]) for i in tbl['contig']]
    
    hits_cols = [i for i in tbl.columns if 'hit' in i]
    count_cols = ["_".join(["reads",i.split("_")[1]]) for i in hits_cols]
    
    for h, c in zip(hits_cols, count_cols):
        fr = tbl[h]/(tbl[c] * tbl['contig_length']) * mult
        tbl[h.replace("hit_","fr_")] = fr
    return tbl

In [118]:
out_test = compute_fr(bcdf.reset_index().rename(columns={'index':'contig'}), gff)

In [119]:
out_test

Unnamed: 0,contig,hit_LineP,hit_POV,reads_POV,reads_LineP,contig_length,fr_LineP,fr_POV
0,AG-891-A17_NODE_1,192777.0,4612.0,5922080,8279226,147339.0,0.158033,0.005286
1,AG-891-A17_NODE_10,25223.0,728.0,5922080,8279226,32598.0,0.093458,0.003771
2,AG-891-A17_NODE_11,47459.0,1078.0,5922080,8279226,24711.0,0.231974,0.007366
3,AG-891-A17_NODE_12,38205.0,560.0,5922080,8279226,22815.0,0.20226,0.004145
4,AG-891-A17_NODE_13,25833.0,672.0,5922080,8279226,22000.0,0.141828,0.005158
5,AG-891-A17_NODE_14,20013.0,415.0,5922080,8279226,20404.0,0.11847,0.003434
6,AG-891-A17_NODE_15,28917.0,640.0,5922080,8279226,19050.0,0.183345,0.005673
7,AG-891-A17_NODE_16,26117.0,516.0,5922080,8279226,17004.0,0.185516,0.005124
8,AG-891-A17_NODE_17,743.0,67.0,5922080,8279226,13103.0,0.006849,0.000863
9,AG-891-A17_NODE_18,11774.0,115.0,5922080,8279226,12781.0,0.111268,0.001519


fr values aren't lining up... something is different.  Just noticed that Ben takes the best hit from a tsv... which I'm not sure makes sense, but I'll try it per SAG anyway...

In [66]:
h = 'hit_LineP'
c = 'reads_LineP'
clens = contig_lengths(gff)

In [81]:
tbl = bcdf.reset_index().rename(columns={'index':'contig'})
tbl['contig_length'] = [float(clens[i]) for i in tbl['contig']]

In [82]:
mult = 1e6
fr = (tbl[h]/tbl[c] * tbl['contig_length'])
#fr = (tbl[h]/tbl[c])

Testing how these functions perform with viruscope outputs...

In [153]:
bac_tsv = '/mnt/scgc/simon/simonsproject/jb_vs_test/AG-891/AG-891-A17/diamond/LineP-all.tsv.gz'
vir_tsv = '/mnt/scgc/simon/simonsproject/jb_vs_test/AG-891/AG-891-A17/diamond/POV.tsv.gz'

bac_df = identity_filter(pd.read_csv(bac_tsv, names=cnames, sep="\t"))
vir_df = identity_filter(pd.read_csv(vir_tsv, names=cnames, sep="\t"))

bac_sum = pd.Series(bac_df.groupby('sseqid')['qseqid'].count(), name='hit_mg-bac')
vir_sum = pd.Series(vir_df.groupby('sseqid')['qseqid'].count(), name='hit_mg-vir')

orfhits = pd.concat([bac_sum, vir_sum], axis=1).reset_index().rename(columns={'index':'orf'})
orfhits['contig'] = ["_".join(i.split("_")[:-1]) for i in orfhits['orf']]

chits = pd.concat([summarize_by_contig(orfhits, 'hit_mg-bac'), summarize_by_contig(orfhits, 'hit_mg-vir')], axis=1)

chits
chits['reads_mg-vir'] = 5922080
chits['reads_mg-bac'] = 8279226

chits = chits.reset_index()

test_out2 = compute_fr(chits, gff, mult=1e6)

In [171]:
test_out2

Unnamed: 0,contig,hit_LineP,hit_POV,reads_POV,reads_LineP,contig_length,fr_LineP,fr_POV
0,AG-891-A17_NODE_1,196626.0,4734.0,5922080,8279226,147339.0,0.161188,0.005425
1,AG-891-A17_NODE_10,28369.0,771.0,5922080,8279226,32598.0,0.105115,0.003994
2,AG-891-A17_NODE_11,49721.0,1116.0,5922080,8279226,24711.0,0.24303,0.007626
3,AG-891-A17_NODE_12,37227.0,549.0,5922080,8279226,22815.0,0.197082,0.004063
4,AG-891-A17_NODE_13,26633.0,688.0,5922080,8279226,22000.0,0.14622,0.005281
5,AG-891-A17_NODE_14,20605.0,426.0,5922080,8279226,20404.0,0.121974,0.003525
6,AG-891-A17_NODE_15,30209.0,664.0,5922080,8279226,19050.0,0.191537,0.005886
7,AG-891-A17_NODE_16,27029.0,539.0,5922080,8279226,17004.0,0.191995,0.005353
8,AG-891-A17_NODE_17,1640.0,109.0,5922080,8279226,13103.0,0.015118,0.001405
9,AG-891-A17_NODE_18,15298.0,204.0,5922080,8279226,12781.0,0.144571,0.002695


These values look comparable to the previous calculation, but not to Ben's calculations for fr values.

In [179]:
linepdf = identity_filter(pd.read_csv(linep_tsv, names=cnames, sep="\t")).sort_values(by=['qseqid', 'length','bitscore'], ascending=False).drop_duplicates(subset='qseqid', keep='first')
povdf = identity_filter(pd.read_csv(pov_tsv, names=cnames, sep="\t")).sort_values(by=['qseqid', 'length','bitscore'], ascending=False).drop_duplicates(subset='qseqid', keep='first')

In [180]:
lpsum = pd.Series(linepdf.groupby('sseqid')['qseqid'].count(), name='hit_LineP')
povsum = pd.Series(povdf.groupby('sseqid')['qseqid'].count(), name='hit_POV')
orfhits = pd.concat([lpsum, povsum], axis=1).reset_index().rename(columns={'index':'orf'})

# TODO: Here, if contig fasta is a gff, map orfs back to contigs, else, assume that contig is part of orf name.
orfhits['contig'] = ["_".join(i.split("_")[:-1]) for i in orfhits['orf']]
chits = pd.concat([summarize_by_contig(orfhits, 'hit_LineP'), summarize_by_contig(orfhits, 'hit_POV')], axis=1)
chits['reads_POV'] = 5922080
chits['reads_LineP'] = 8279226
chits = chits.reset_index()
test_out3 = compute_fr(chits, gff, mult=1e6)

In [181]:
test_out3

Unnamed: 0,contig,hit_LineP,hit_POV,reads_POV,reads_LineP,contig_length,fr_LineP,fr_POV
0,AG-891-A17_NODE_1,44265.0,4208.0,5922080,8279226,147339.0,0.036287,0.004823
1,AG-891-A17_NODE_10,5766.0,669.0,5922080,8279226,32598.0,0.021365,0.003465
2,AG-891-A17_NODE_11,12122.0,993.0,5922080,8279226,24711.0,0.059251,0.006786
3,AG-891-A17_NODE_12,8555.0,487.0,5922080,8279226,22815.0,0.045291,0.003604
4,AG-891-A17_NODE_13,5673.0,635.0,5922080,8279226,22000.0,0.031146,0.004874
5,AG-891-A17_NODE_14,4447.0,384.0,5922080,8279226,20404.0,0.026325,0.003178
6,AG-891-A17_NODE_15,8548.0,587.0,5922080,8279226,19050.0,0.054198,0.005203
7,AG-891-A17_NODE_16,6930.0,430.0,5922080,8279226,17004.0,0.049226,0.00427
8,AG-891-A17_NODE_17,422.0,89.0,5922080,8279226,13103.0,0.00389,0.001147
9,AG-891-A17_NODE_18,3190.0,163.0,5922080,8279226,12781.0,0.030146,0.002154


This table matches the graph signals fractions output.... So the old version of viruscope was dropping duplicate hits.  Does this make sense considering what viruscope is calculating?  I don't really think so... but this is how the prediction algorithm was trained.

Making a function out of this table making:

In [10]:
def import_diamond_tsv(tsv, pctid=50.0, best_hit=True):
    df = pd.read_csv(tsv, 
                     names="qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore".split(), 
                     sep="\t")
    df = df[df['pident'] >= pctid]
    if best_hit:
        df = df.sort_values(by=['qseqid', 'length','bitscore'], ascending=False).drop_duplicates(subset='qseqid', keep='first')
    return df


def summarize_by_contig(df, hitscol):
    return pd.Series(df.groupby('contig')[hitscol].sum(), name=hitscol)


def contig_lengths(infile):
    ''' create a dict with contig names as keys and lengths as values from gff file'''
    outdict = {}
    if "g" in infile.split(".")[-1]:
        filetype = 'gff'
        print("looks like input contig file is in gff format.")
    elif "f" in infile.split(".")[-1]:
        filetype = 'fasta'
        print("looks like input config fiel is in fasta format.")
    else:
        raise IOError("can't figure out what kind of file contig file is.  Make sure it's either in fasta or gff format.")
    if filetype == 'gff':
        with open(gff) as ih:
            for l in ih:
                if l.startswith("##sequence-region"):
                    vec = l.strip().split()
                    outdict[vec[1]] = vec[-1]
    
    elif filetype == 'fasta':
        for name, seq in read_fasta(open(infile)):
            outdict[name] = len(seq)  
    return outdict


def compute_fr(tbl, clens, mult=1e6):
    '''
    Args:
        tbl: output stats table with mg hit and read counts from diamond recruitment
        clens: dict of contig lengths
        mult: factor to multiply fraction by to make readiable
        
    Outputs:
        pandas DataFrame with mg_fr values calculated
    '''

    tbl['contig_length'] = [float(clens[i]) for i in tbl['contig']]
    
    hits_cols = [i for i in tbl.columns if 'hit' in i]
    count_cols = ["_".join(["reads",i.split("_")[1]]) for i in hits_cols]
    
    for h, c in zip(hits_cols, count_cols):
        fr = tbl[h]/(tbl[c] * tbl['contig_length']) * mult
        tbl[h.replace("hit_","fr_")] = fr
    return tbl

def weighted_score(
    x = [0.901, 0.317, 0.653, 0.423, 0.000, 0.419, 0.299, 0.917, 0.195], 
    lut = {'0': -1e6, '2': 0.2, '3': 0.3, '5': 0.5}):
    
    
    """
    Score a set of values according to a weighted look-up-table
    
    Final score is cumulative in the sense that a score is the sum of 
    of a values score and all of the possible lower scores
    
    @param x numeric vector, of values in the range of 0-1
    @param lut dict, look up tables where items are cut-offs between
      weights and keys (when converted to numeric) are the weights
    """
    
    #weights
    kys = sorted(lut.keys())
    w = [float(k) for k in kys]
    
    #values
    v = [lut[k] for k in kys]
    
    # indices
    ix = [bisect_left(v, xi) - 1 for xi in x]
    
    # we find the cumulative weights
    cumweights = np.cumsum(np.asarray(w)).tolist()
    
    # and then assign weighted scores using those
    ws = [cumweights[i] for i in ix]
    
    return ws

In [None]:
def map_orfs_to_contigs(tbl, contig_file):
    if "g" is in contig_file.split(".")[-1]:
        print("Contig file looks to be a gff file, so we will map orfs back to contigs using this.")
        gff = True
    else:
        print("Since the contig file doesn't look like a fasta file, we will assume that the contig name is embedded in the orf name.")
        gff = False
    if gff:
        

In [13]:
bac_tsv = '/mnt/scgc/simon/simonsproject/jb_vs_test/AG-891/AG-891-A17/diamond/LineP-all.tsv.gz'
vir_tsv = '/mnt/scgc/simon/simonsproject/jb_vs_test/AG-891/AG-891-A17/diamond/POV.tsv.gz'

def construct_recruit_tbl(vir_tsv, bac_tsv, read_count_dict, contig_file):
    '''
    Args:
        vir_tsv: diamond recruitment converted to tsv for vir metagenome
        bac_tsv: diamond recruitment converted to tsv for bac metagenome
        read_count_dict: dict of mg read counts with two keys -- 'vir_reads' and 'bac_reads'
        contig_file: path to a file with sag contigs in it; either in fasta or gff format
    Returns:
        pandas dataframe with mg fraction calculated
    '''
    cnames = "qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore".split()
    bac_df = import_diamond_tsv(bac_tsv)
    vir_df = import_diamond_tsv(vir_tsv)

    bac_sum = pd.Series(bac_df.groupby('sseqid')['qseqid'].count(), name='hit_mg-bac')
    vir_sum = pd.Series(vir_df.groupby('sseqid')['qseqid'].count(), name='hit_mg-vir')

    orfhits = pd.concat([bac_sum, vir_sum], axis=1).reset_index().rename(columns={'index':'orf'})
    orfhits['contig'] = ["_".join(i.split("_")[:-1]) for i in orfhits['orf']]

    chits = pd.concat([summarize_by_contig(orfhits, 'hit_mg-bac'), summarize_by_contig(orfhits, 'hit_mg-vir')], axis=1)

    chits['reads_mg-vir'] = read_count_dict['vir_reads']
    chits['reads_mg-bac'] = read_count_dict['bac_reads']

    clens = contig_lengths(contig_file)
    
    out_tbl = compute_fr(chits.reset_index(), clens, mult=1e6)
    return out_tbl

In [9]:
def import_diamond_tsv(tsv, pctid=50.0, best_hit=True):
    df = pd.read_csv(tsv, 
                     names="qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore".split(), 
                     sep="\t")
    df = df[df['pident'] >= pctid]
    if best_hit:
        df = df.sort_values(by=['qseqid', 'length','bitscore'], ascending=False).drop_duplicates(subset='qseqid', keep='first')
    return df


def summarize_by_contig(df, hitscol):
    return pd.Series(df.groupby('contig')[hitscol].sum(), name=hitscol)


def contig_lengths(infile):
    ''' create a dict with contig names as keys and lengths as values from gff file'''
    outdict = {}
    if "g" in infile.split(".")[-1]:
        filetype = 'gff'
        print("looks like input contig file is in gff format.")
    elif "f" in infile.split(".")[-1]:
        filetype = 'fasta'
        print("looks like input config fiel is in fasta format.")
    else:
        raise IOError("can't figure out what kind of file contig file is.  Make sure it's either in fasta or gff format.")
    if filetype == 'gff':
        with open(infile) as ih:
            for l in ih:
                if l.startswith("##sequence-region"):
                    vec = l.strip().split()
                    outdict[vec[1]] = vec[-1]
    
    elif filetype == 'fasta':
        for name, seq in read_fasta(open(infile)):
            outdict[name] = len(seq)  
    return outdict


def compute_fr(tbl, clens, mult=1e6):
    '''
    Args:
        tbl: output stats table with mg hit and read counts from diamond recruitment
        clens: dict of contig lengths
        mult: factor to multiply fraction by to make readiable
        
    Outputs:
        pandas DataFrame with mg_fr values calculated
    '''

    tbl['contig_length'] = [float(clens[i]) for i in tbl['contig']]
    
    hits_cols = [i for i in tbl.columns if 'hit' in i]
    count_cols = ["_".join(["reads",i.split("_")[1]]) for i in hits_cols]
    
    for h, c in zip(hits_cols, count_cols):
        fr = tbl[h]/(tbl[c] * tbl['contig_length']) * mult
        tbl[h.replace("hit_","fr_")] = fr
    return tbl

def orf_map(gff):
    gdf = pd.read_csv(gff, comment='#', sep="\t", names = ['contig','app','type','start','stop','dot','strand','val','notes']).dropna()
    gdf['orf'] = [i.split(";")[0].replace("ID=",'') for i in gdf['notes']]
    gdf['len'] = gdf['stop'] - gdf['start']
    return gdf[['contig','orf']]


def map_orfs_to_contigs(df, contig_file):
    if "g" in contig_file.split(".")[-1]:
        gff = True
        print("looks like input contig file is in gff format.  Will map ORFs to contigs using that.")
    else:
        print("doesn't look like input contig file is in gff format.  Will assume that contig name is embedded in the ORF name.")
        gff = False
    
    if gff:
        gdf = orf_map(contig_file)
        return pd.merge(df, gdf, on='orf', how='outer')
    else:
        df['contig'] = ["_".join(i.split("_")[:-1]) for i in orfhits['orf']]
        return df
    

def construct_recruit_tbl(vir_tsv, bac_tsv, read_count_dict, contig_file):
    '''
    Args:
        vir_tsv: diamond recruitment converted to tsv for vir metagenome
        bac_tsv: diamond recruitment converted to tsv for bac metagenome
        read_count_dict: dict of mg read counts with two keys -- 'vir_reads' and 'bac_reads'
        contig_file: path to a file with sag contigs in it; either in fasta or gff format
    Returns:
        pandas dataframe with mg fraction calculated
    '''
    cnames = "qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore".split()
    bac_df = import_diamond_tsv(bac_tsv)
    vir_df = import_diamond_tsv(vir_tsv)

    bac_sum = pd.Series(bac_df.groupby('sseqid')['qseqid'].count(), name='hit_mg-bac')
    vir_sum = pd.Series(vir_df.groupby('sseqid')['qseqid'].count(), name='hit_mg-vir')

    orfhits = pd.concat([bac_sum, vir_sum], axis=1).reset_index().rename(columns={'index':'orf'})
    orfhits = map_orfs_to_contigs(orfhits, contig_file)
    
    chits = pd.concat([summarize_by_contig(orfhits, 'hit_mg-bac'), summarize_by_contig(orfhits, 'hit_mg-vir')], axis=1)
    chits['reads_mg-vir'] = read_count_dict['vir_reads']
    chits['reads_mg-bac'] = read_count_dict['bac_reads']
    
    clens = contig_lengths(contig_file)
    
    out_tbl = compute_fr(chits.reset_index(), clens, mult=1e6)
    
    return out_tbl

In [10]:
test = construct_recruit_tbl(vir_tsv, bac_tsv, {'vir_reads':5922080, 'bac_reads':8279226}, gff)

NameError: name 'gff' is not defined

In [5]:
vir_tsv = '/mnt/scgc/simon/simonsproject/bats248_vs/diamond/pergenome/AG-920-P23_vs_POV.tsv.gz'
bac_tsv = '/mnt/scgc/simon/simonsproject/bats248_vs/diamond/pergenome/AG-920-P23_vs_LineP-all.tsv.gz'
contig_file = '/mnt/scgc/simon/simonsproject/bats248_annotations/gff/AG-920-P23.gff'
read_count_dict = {'vir_reads':5922080, 'bac_reads':8279226}

In [11]:
test = construct_recruit_tbl(vir_tsv, bac_tsv, {'vir_reads':5922080, 'bac_reads':8279226}, contig_file)

looks like input contig file is in gff format.  Will map ORFs to contigs using that.
looks like input contig file is in gff format.


In [12]:
test

Unnamed: 0,contig,hit_mg-bac,hit_mg-vir,reads_mg-vir,reads_mg-bac,contig_length,fr_mg-bac,fr_mg-vir
0,AG-920-P23_NODE_1,19017.0,1034.0,5922080,8279226,40342.0,0.056937,0.004328
1,AG-920-P23_NODE_10,3231.0,2004.0,5922080,8279226,20469.0,0.019066,0.016532
2,AG-920-P23_NODE_11,456.0,152.0,5922080,8279226,19396.0,0.00284,0.001323
3,AG-920-P23_NODE_12,9616.0,495.0,5922080,8279226,19372.0,0.059956,0.004315
4,AG-920-P23_NODE_13,11797.0,564.0,5922080,8279226,17554.0,0.081172,0.005425
5,AG-920-P23_NODE_14,7478.0,262.0,5922080,8279226,16992.0,0.053156,0.002604
6,AG-920-P23_NODE_15,2347.0,81.0,5922080,8279226,16247.0,0.017448,0.000842
7,AG-920-P23_NODE_16,5751.0,535.0,5922080,8279226,15004.0,0.046296,0.006021
8,AG-920-P23_NODE_17,3839.0,213.0,5922080,8279226,14796.0,0.031339,0.002431
9,AG-920-P23_NODE_18,4807.0,216.0,5922080,8279226,13825.0,0.041997,0.002638


In [18]:
cnames = "qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore".split()
bac_df = import_diamond_tsv(bac_tsv)
vir_df = import_diamond_tsv(vir_tsv)

bac_sum = pd.Series(bac_df.groupby('sseqid')['qseqid'].count(), name='hit_mg-bac')
vir_sum = pd.Series(vir_df.groupby('sseqid')['qseqid'].count(), name='hit_mg-vir')

orfhits = pd.concat([bac_sum, vir_sum], axis=1).reset_index().rename(columns={'index':'orf'})
orfhits['contig'] = ["_".join(i.split("_")[:-1]) for i in orfhits['orf']]

In [19]:
chits = pd.concat([summarize_by_contig(orfhits, 'hit_mg-bac'), summarize_by_contig(orfhits, 'hit_mg-vir')], axis=1)

chits['reads_mg-vir'] = read_count_dict['vir_reads']
chits['reads_mg-bac'] = read_count_dict['bac_reads']

clens = contig_lengths(contig_file)

looks like input contig file is in gff format.


In [25]:
clens

{'AG-920-P23_NODE_1': '40342',
 'AG-920-P23_NODE_10': '20469',
 'AG-920-P23_NODE_11': '19396',
 'AG-920-P23_NODE_12': '19372',
 'AG-920-P23_NODE_13': '17554',
 'AG-920-P23_NODE_14': '16992',
 'AG-920-P23_NODE_15': '16247',
 'AG-920-P23_NODE_16': '15004',
 'AG-920-P23_NODE_17': '14796',
 'AG-920-P23_NODE_18': '13825',
 'AG-920-P23_NODE_19': '13588',
 'AG-920-P23_NODE_2': '30853',
 'AG-920-P23_NODE_20': '12652',
 'AG-920-P23_NODE_21': '12282',
 'AG-920-P23_NODE_22': '11823',
 'AG-920-P23_NODE_23': '11602',
 'AG-920-P23_NODE_24': '10453',
 'AG-920-P23_NODE_25': '9147',
 'AG-920-P23_NODE_26': '8945',
 'AG-920-P23_NODE_27': '8678',
 'AG-920-P23_NODE_28': '8367',
 'AG-920-P23_NODE_29': '7443',
 'AG-920-P23_NODE_3': '27364',
 'AG-920-P23_NODE_30': '7218',
 'AG-920-P23_NODE_31': '6995',
 'AG-920-P23_NODE_32': '6821',
 'AG-920-P23_NODE_33': '6339',
 'AG-920-P23_NODE_34': '5714',
 'AG-920-P23_NODE_35': '5445',
 'AG-920-P23_NODE_36': '5402',
 'AG-920-P23_NODE_37': '5006',
 'AG-920-P23_NODE_38': '

In [21]:
out_tbl = compute_fr(chits.reset_index(), clens, mult=1e6)

KeyError: 'AG-920-P23'

In [22]:
tbl = chits.reset_index()

In [23]:
tbl

Unnamed: 0,contig,hit_mg-bac,hit_mg-vir,reads_mg-vir,reads_mg-bac
0,AG-920-P23,199711.0,14606.0,5922080,8279226


In [None]:
tbl['contig_length'] = [float(clens[i]) for i in tbl['contig']]

hits_cols = [i for i in tbl.columns if 'hit' in i]
count_cols = ["_".join(["reads",i.split("_")[1]]) for i in hits_cols]

for h, c in zip(hits_cols, count_cols):
    fr = tbl[h]/(tbl[c] * tbl['contig_length']) * mult
    tbl[h.replace("hit_","fr_")] = fr
return tbl

In [16]:
from nb_tools import readfa