Initial testing to see if I can put together reasonable contig recruitment summaries using recruitment onto the clustered ORFs.

Result: Attempted to use clustered values for recruitment metrics.  The problem is that contig coverage uses each read once per contig, while recruitment is on a per-ORF basis, so if we used recruitment to seed sequences, we'd need to collect all reads recruited per ORF, then combine them with all other reads recruited to all other ORFs on the contig, remove duplicate reads, and then count them.

The size of indexed values for this would be huge for some ORFs due to high levels of read recruitment.  I don't think I can use the pandas dataframe or python dict format, and just building the index takes a long time.  My only thought is to maybe use a sqlite database, but even that would take a while to build, and the amount of time to develop that would be as much as just re-running recruitment on all 6500 BATS SAGs (i.e. 8 days).

In [4]:
from nb_tools import readfa
from collections import defaultdict
import itertools
import pandas as pd

# FASTA of clustered ORFs (CD-HIT output, judging by the file name); the
# record IDs in it are the "seed" ORFs used throughout this notebook.
seed_fa = '/mnt/scgc/simon/simonsproject/bats248_annotations/bats248_all_orfs_cdhit_c90.fasta'
seeds = set()

# Open the file in a context manager so the handle is closed as soon as
# parsing finishes, instead of being left for the garbage collector.
with open(seed_fa) as fasta_handle:
    for name, seq in readfa(fasta_handle):
        # Keep only the record ID: the header text before the first whitespace.
        seeds.add(name.split()[0])

In [8]:
# Freeze the seed IDs into a list so the derived columns below share one order.
seedlist = list(seeds)
# SAG label: the part of each ID before its first underscore.
sags = [seed_id.partition("_")[0] for seed_id in seedlist]
# Everything before the last underscore of each ID.
# NOTE(review): for IDs shaped like "AG-910-A14_00470" this is the same text
# as the SAG label above -- confirm that is the intended "contig" value.
contig = [seed_id.rpartition("_")[0] for seed_id in seedlist]

In [13]:
# Two-column lookup table: each seed ORF ID next to the SAG label derived from it.
seed_columns = {'sag': sags, 'seed': seedlist}
df = pd.DataFrame(data=seed_columns)

In [19]:
from recruitment_for_vs import import_diamond_tsv

In [41]:
# Spot check on a single SAG: print every seed ORF with the distinct reads
# from the POV DIAMOND table that hit it (tab-separated, reads comma-joined).
sag = 'AG-910-A14'
pov = '/mnt/scgc/simon/simonsproject/bats248_vs/diamond/pergenome/{sag}_vs_POV.tsv.gz'.format(sag=sag)
pdf = import_diamond_tsv(pov, best_hit=False)

# Seed ORFs that belong to this SAG.
sag_rows = df['sag'] == sag
coi = df.loc[sag_rows, 'seed'].tolist()

for s in coi:
    hit_rows = pdf[pdf['sseqid'] == s]
    # A set collapses reads that hit the same ORF more than once.
    reads = set(hit_rows['qseqid'])
    print(s, ','.join(reads), sep="\t")

AG-910-A14_00470	CAM_READ_0536693163,CAM_READ_0539067083,CAM_READ_0531501787,CAM_READ_0530852005,CAM_READ_0539618667,CAM_READ_0534079639,CAM_READ_0538240941,CAM_READ_0535195633,CAM_READ_0533103759,CAM_READ_0541409127,CAM_READ_0540721993,CAM_READ_0539932225,CAM_READ_0532029745,CAM_READ_0533141579,CAM_READ_0533218937,CAM_READ_0533278693,CAM_READ_0530245931,CAM_READ_0540954445,CAM_READ_0541210607,CAM_READ_0540062769,CAM_READ_0534649719,CAM_READ_0537537537,CAM_READ_0538478021,CAM_READ_0537359121,CAM_READ_0533094703,CAM_READ_0532559423,CAM_READ_0537041701,CAM_READ_0537498793,CAM_READ_0537161673,CAM_READ_0535061233,CAM_READ_0541058725,CAM_READ_0537707973,CAM_READ_0541014919,CAM_READ_0534803127,CAM_READ_0538222301,CAM_READ_0539525667,CAM_READ_0537261739,CAM_READ_0539787877,CAM_READ_0533440015,CAM_READ_0531530631,CAM_READ_0530387857,CAM_READ_0539181029,CAM_READ_0538721663,CAM_READ_0539576775,CAM_READ_0535802319,CAM_READ_0538533211,CAM_READ_0533026141,CAM_READ_0537256889,CAM_READ_0540163641,CAM

AG-910-A14_00616	CAM_READ_0541566909,CAM_READ_0541536097,CAM_READ_0541626453
AG-910-A14_00402	
AG-910-A14_00408	CAM_READ_0541682035,CAM_READ_0534966841,CAM_READ_0530336913,CAM_READ_0534838861,CAM_READ_0533204903,CAM_READ_0531999513,CAM_READ_0530234683
AG-910-A14_00400	
AG-910-A14_00279	CAM_READ_0541676613,CAM_READ_0541496061,CAM_READ_0533468571
AG-910-A14_00143	
AG-910-A14_00135	
AG-910-A14_00635	
AG-910-A14_00146	CAM_READ_0534962967
AG-910-A14_00493	
AG-910-A14_00132	
AG-910-A14_00617	
AG-910-A14_00644	CAM_READ_0541544515,CAM_READ_0540606515,CAM_READ_0533142417,CAM_READ_0533564243,CAM_READ_0532545781,CAM_READ_0535953073,CAM_READ_0529740519,CAM_READ_0533383333,CAM_READ_0533724705,CAM_READ_0541539899,CAM_READ_0537245891,CAM_READ_0541589283,CAM_READ_0533264463,CAM_READ_0535910165,CAM_READ_0540864403,CAM_READ_0540143509,CAM_READ_0533218595,CAM_READ_0536510181,CAM_READ_0539585337,CAM_READ_0533545761,CAM_READ_0535912127,CAM_READ_0541628131,CAM_READ_0541666399,CAM_READ_0529729043,CAM_READ_05

In [45]:
# Trial run on three SAGs: map each seed ORF to the distinct POV reads that
# hit it.  Seeds with no hits map to an empty list.
outdict = {}
# set() order is arbitrary, so which three SAGs get sampled is not fixed.
for sag in list(set(sags))[:3]:
    pov = '/mnt/scgc/simon/simonsproject/bats248_vs/diamond/pergenome/{sag}_vs_POV.tsv.gz'.format(sag=sag)
    # Not read in this cell; kept so the name stays defined as before.
    linep = '/mnt/scgc/simon/simonsproject/bats248_vs/diamond/pergenome/{sag}_vs_LineP-all.tsv.gz'.format(sag=sag)

    pdf = import_diamond_tsv(pov, best_hit=False)

    # Group the hit table once per SAG instead of re-scanning the whole frame
    # for every seed ORF.  unique() keeps each read once, in order of first
    # appearance.
    reads_by_orf = pdf.groupby('sseqid')['qseqid'].unique()

    coi = list(df[df['sag'] == sag]['seed'])
    for s in coi:
        outdict[s] = list(reads_by_orf[s]) if s in reads_by_orf.index else []

In [46]:
# Display the seed ORF -> read list mapping built in the previous cell.
outdict

{'AG-899-B20_00002': [],
 'AG-899-B20_00015': [],
 'AG-899-B20_00020': ['CAM_READ_0541595359',
  'CAM_READ_0530043255',
  'CAM_READ_0529986261'],
 'AG-899-B20_00064': [],
 'AG-904-O14_00035': [],
 'AG-904-O14_00036': [],
 'AG-904-O14_00037': [],
 'AG-904-O14_00038': ['CAM_READ_0541684099', 'CAM_READ_0541648397'],
 'AG-904-O14_00101': ['CAM_READ_0541546613',
  'CAM_READ_0541574331',
  'CAM_READ_0541600045',
  'CAM_READ_0531971031',
  'CAM_READ_0541607959',
  'CAM_READ_0535959013',
  'CAM_READ_0532882183',
  'CAM_READ_0533588993',
  'CAM_READ_0541683321',
  'CAM_READ_0541533339',
  'CAM_READ_0534570361',
  'CAM_READ_0533605159',
  'CAM_READ_0539989749',
  'CAM_READ_0532824153',
  'CAM_READ_0541561789'],
 'AG-904-O14_00102': [],
 'AG-904-O14_00154': ['CAM_READ_0541532343',
  'CAM_READ_0541621697',
  'CAM_READ_0541508959',
  'CAM_READ_0541683957',
  'CAM_READ_0532293009',
  'CAM_READ_0541620705',
  'CAM_READ_0533428321',
  'CAM_READ_0535921353',
  'CAM_READ_0540222839',
  'CAM_READ_0531898

In [47]:
# Same three-SAG trial, this time against the LineP DIAMOND tables:
# seed ORF -> list of distinct reads that hit it.
outdict = {}
for sag in list(set(sags))[:3]:
    pov = '/mnt/scgc/simon/simonsproject/bats248_vs/diamond/pergenome/{sag}_vs_POV.tsv.gz'.format(sag=sag)
    linep = '/mnt/scgc/simon/simonsproject/bats248_vs/diamond/pergenome/{sag}_vs_LineP-all.tsv.gz'.format(sag=sag)

    pdf = import_diamond_tsv(linep, best_hit=False)

    # Seed ORFs that belong to the current SAG.
    coi = df.loc[df['sag'] == sag, 'seed'].tolist()
    for s in coi:
        orf_hits = pdf.loc[pdf['sseqid'] == s, 'qseqid']
        # Going through a set drops repeated reads for the same ORF.
        outdict[s] = list(set(orf_hits))

In [49]:
import sqlite3

# Open the scratch SQLite database for this experiment; the path is relative
# to the notebook's working directory.
conn = sqlite3.connect("./outputs/sqlite3_practice.db")

In [50]:
# Cursor used for every statement this notebook issues against the practice database.
c = conn.cursor()

In [51]:
# One row per seed ORF: its ID, the comma-joined IDs of the reads that hit it,
# and a score column.  IF NOT EXISTS lets the cell be re-run without failing
# on "table linep already exists".
# NOTE(review): rows from an earlier run are kept, so re-running the insert
# cell afterwards appends duplicates -- drop the table first for a clean start.
c.execute("CREATE TABLE IF NOT EXISTS linep (contig, hit, score)")

<sqlite3.Cursor at 0x2aab16868dc0>

In [54]:
# Peek at the first seed ORF of the most recently processed SAG.
coi[0]

'AG-899-B20_00020'

In [None]:
# Load the three-SAG LineP trial into the `linep` table: one row per seed ORF
# holding the comma-joined IDs of the distinct reads that hit it; score is 0.
# NOTE(review): nothing here commits; unless the connection is in autocommit
# mode the rows are only persisted after conn.commit().
for sag in list(set(sags))[:3]:
    pov = '/mnt/scgc/simon/simonsproject/bats248_vs/diamond/pergenome/{sag}_vs_POV.tsv.gz'.format(sag=sag)
    linep = '/mnt/scgc/simon/simonsproject/bats248_vs/diamond/pergenome/{sag}_vs_LineP-all.tsv.gz'.format(sag=sag)

    pdf = import_diamond_tsv(linep, best_hit=False)

    coi = list(df[df['sag'] == sag]['seed'])
    for s in coi:
        seqlist = ",".join(set(pdf[pdf['sseqid'] == s]['qseqid']))
        # Bind the values with placeholders rather than formatting them into
        # the SQL text: quoting is handled by the driver, the values cannot be
        # misread as identifiers, and the potentially very long read list
        # stays out of the statement itself.
        c.execute('INSERT INTO linep VALUES (?, ?, 0)', (s, seqlist))

Based on initial time it took to set up recruitment to those three SAGs in the database, I think building the database will take a while.

In [58]:
# Look up the stored hit list for one seed ORF.  The ID is passed as a bound
# parameter so it is always treated as a value, never as SQL text.
c.execute('SELECT hit from linep where contig=?', (coi[0],))

<sqlite3.Cursor at 0x2aab16868dc0>

In [59]:
# fetchone() returns the next result row as a tuple, or None if the query matched nothing.
output = c.fetchone()

In [60]:
# The row's only selected column: the comma-joined hit list stored for that seed ORF.
output[0]

'Sequence0000145734,Sequence0000260669,Sequence0000082441,Sequence0000092303,Sequence0000095880,Sequence0000088104,Sequence0000108472,Sequence0000225736,Sequence0000232479,Sequence0000149091,Sequence0000114925,Sequence0000130889,Sequence0000107969,Sequence0000291686,Sequence0000020185,Sequence0000239281,Sequence0000137270,Sequence0000098208,Sequence0000157172,Sequence0000219930,Sequence0000122435,Sequence0000018405,Sequence0000206334,Sequence0000115148,Sequence0000001346,Sequence0000001670,Sequence0000126180,Sequence0000151762,Sequence0000166354,Sequence0000228282,Sequence0000112783,Sequence0000073732,Sequence0000209219,Sequence0000164464,Sequence0000042544,Sequence0000144090,Sequence0000106083,Sequence0000018850,Sequence0000150417,Sequence0000015099,Sequence0000106921,Sequence0000122052,Sequence0000099307,Sequence0000064553,Sequence0000158503,Sequence0000017067,Sequence0000015486,Sequence0000065363,Sequence0000059692,Sequence0000039809,Sequence0000200832,Sequence0000156160,Sequence000