In [2]:
import pandas as pd
import os
import glob

In [3]:
# Show the kernel's current working directory (displayed as the cell's last expression).
os.getcwd()

'/vortexfs1/omics/env-bio/collaboration/dinoflagellates_METZYME/jupyter-notebooks'

In [4]:
# Name of the DIAMOND result table; it is resolved relative to the working
# directory set below.
diamond_taxon = 'dino_metzyme_annotated_coassembly_diamond_out.tsv'

# Switch the process working directory to the folder holding the DIAMOND output,
# so that relative file names used afterwards are read from / written to it.
os.chdir(
    '/vortexfs1/omics/env-bio/collaboration/dinoflagellates_METZYME/data/metaT_trimmed_reads/'
    'fasta_files/paired/mRNA/diamond_output'
)

In [5]:
# Long, human-readable labels for the twelve tabular columns (kept for reference;
# the short names below are the ones actually applied to the frame).
full_col_names = [
    'Query ID',
    'Subject ID',
    'Percentage of identical matches',
    'Alignment length',
    'Number of mismatches',
    'Number of gap openings',
    'Start of alignment in query',
    'End of alignment in query',
    'Start of alignment in subject',
    'End of alignment in subject',
    'Expected value',
    'Bit score',
]

# Short column identifiers, in the same order as the labels above.
short_col_names = [
    'qseqid',
    'sseqid',
    'pident',
    'length',
    'mismatch',
    'gapopen',
    'qstart',
    'qend',
    'sstart',
    'send',
    'evalue',
    'bitscore',
]

# The table has no header row: every line is data and the columns are named here.
df_coassembly = pd.read_csv(
    diamond_taxon,
    sep='\t',
    header=None,
    names=short_col_names,
)

In [9]:
# Preview the first five alignment rows.
df_coassembly.head(n=5)

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,NODE_1_length_22561_cov_36.839973_g0_i0_72_1721_-,FBALC1_15187-NZ_ABHI01000001,95.0,543,27,0,1,543,1,543,0.0,1028.0
1,NODE_3_length_16954_cov_20.440858_g2_i0_1_3114_+,FBALC1_13442-NZ_ABHI01000001,91.1,708,63,0,330,1037,1,708,0.0,1292.0
2,NODE_4_length_16807_cov_24.028322_g1_i1_1_642_+,ORF0203-A168I18DRAFT_NOD,74.2,209,54,0,4,212,21,229,1.05e-106,313.0
3,NODE_5_length_16271_cov_37.097901_g3_i0_3_3659_-,ORF0200-A168I18DRAFT_NOD,61.1,1227,466,6,1,1219,1,1224,0.0,1501.0
4,NODE_6_length_16015_cov_31.066483_g4_i0_23_502_-,Favella_taraikaensis_FeNarragansettBay-2014022...,76.1,155,37,0,1,155,1,155,7.669999999999999e-77,238.0


In [6]:
### Adapted from Logan Whitehouse's lab:
# https://github.com/Lswhiteh/phylodbannotation/blob/master/fastaannotation.py

taxonomy_file = "/vortexfs1/omics/env-bio/collaboration/dinoflagellates_METZYME/databases/PhyloDB/phylodb_1.076.taxonomy.txt"
gene_file = "/vortexfs1/omics/env-bio/collaboration/dinoflagellates_METZYME/databases/PhyloDB/phylodb_1.076.annotations.txt"

# Each lookup maps the first tab-separated field of a line to the list of the
# remaining fields on that line.
tax_dict = {}
gene_dict = {}

# Fill the taxonomy lookup first, then the gene-annotation lookup, using the
# same parsing for both files.
for table_path, lookup in ((taxonomy_file, tax_dict), (gene_file, gene_dict)):
    with open(table_path) as handle:
        for record in handle:
            # Surrounding whitespace (including the newline) is stripped before splitting.
            key, *fields = record.strip().split("\t")
            lookup[key] = fields

In [24]:
def _second_field(fields):
    """Return element 1 of a looked-up field list, or None when it is unavailable.

    `fields` is the result of mapping a key through one of the lookup dicts:
    a list of fields when the key was found, or NaN when it was not. Lists
    with fewer than two elements also yield None.
    """
    if isinstance(fields, list) and len(fields) > 1:
        return fields[1]
    return None


# Subject ID -> list of annotation fields (NaN where the ID is not in gene_dict).
gene_mapping = df_coassembly['sseqid'].map(gene_dict)
# Keep only the second annotation field, which is the key into tax_dict.
# Building a DataFrame from `.values.tolist()` breaks as soon as one lookup
# misses (the list then mixes lists and NaN), so pick the field per row instead.
organism_df = gene_mapping.map(_second_field)
# Organism key -> list of taxonomy fields (NaN where the key is missing).
taxonomy_mapping = organism_df.map(tax_dict)
# The second taxonomy field is the taxonomy string; rows that could not be
# resolved get None rather than aborting the whole annotation.
tax_df = taxonomy_mapping.map(_second_field).rename('taxonomy')

# Every Series above keeps df_coassembly's index, so the new column lines up
# row-for-row with the alignments.
df_coassembly_tax = pd.concat([df_coassembly, tax_df], axis=1)

In [32]:
# Preview the first five rows, now including the taxonomy column.
df_coassembly_tax.head(n=5)

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,taxonomy
0,NODE_1_length_22561_cov_36.839973_g0_i0_72_1721_-,FBALC1_15187-NZ_ABHI01000001,95.0,543,27,0,1,543,1,543,0.0,1028.0,Bacteria;Bacteroidetes/Chlorobi group;Flavobac...
1,NODE_3_length_16954_cov_20.440858_g2_i0_1_3114_+,FBALC1_13442-NZ_ABHI01000001,91.1,708,63,0,330,1037,1,708,0.0,1292.0,Bacteria;Bacteroidetes/Chlorobi group;Flavobac...
2,NODE_4_length_16807_cov_24.028322_g1_i1_1_642_+,ORF0203-A168I18DRAFT_NOD,74.2,209,54,0,4,212,21,229,1.05e-106,313.0,Bacteria;Proteobacteria;Gammaproteobacteria;Mi...
3,NODE_5_length_16271_cov_37.097901_g3_i0_3_3659_-,ORF0200-A168I18DRAFT_NOD,61.1,1227,466,6,1,1219,1,1224,0.0,1501.0,Bacteria;Proteobacteria;Gammaproteobacteria;Mi...
4,NODE_6_length_16015_cov_31.066483_g4_i0_23_502_-,Favella_taraikaensis_FeNarragansettBay-2014022...,76.1,155,37,0,1,155,1,155,7.669999999999999e-77,238.0,Eukaryota;Alveolata;Ciliophora;Spirotrichea;Ch...


In [33]:
# Stem of the DIAMOND file name, used as the prefix for derived outputs.
# splitext removes the extension whatever its length, instead of assuming
# exactly four trailing characters.
dmd_file = os.path.splitext(diamond_taxon)[0]
# Written relative to the current working directory; the frame's index is
# written as the first (unnamed) column.
df_coassembly_tax.to_csv(dmd_file+'_taxonomy.tsv', sep='\t')

In [37]:
# Rows whose taxonomy string contains 'Dinophyta'.
# na=False treats rows with no taxonomy as non-matches; without it the mask
# contains NaN for those rows and boolean indexing raises a ValueError.
df_coassembly_tax[df_coassembly_tax['taxonomy'].str.contains('Dinophyta', na=False)]

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,taxonomy
227,NODE_26_length_11492_cov_18.457578_g19_i0_2_57...,Karenia-brevis-SP1-20130916|289615_1,33.7,1063,639,24,628,1654,3,1035,1.010000e-146,491.0,Eukaryota;Alveolata;Dinophyta;Dinophyceae;Dino...
231,NODE_14_length_13360_cov_26.370005_g9_i0_1698_...,Oxyrrhis-marina-20131105|151477_1,44.0,427,192,3,1,427,1,380,2.230000e-109,341.0,Eukaryota;Alveolata;Dinophyta;Dinophyceae;Dino...
301,NODE_42_length_9922_cov_24.243123_g4_i1_2_1073_-,Karenia-brevis-SP1-20130916|45901_1,65.8,342,113,2,20,357,93,434,1.080000e-159,464.0,Eukaryota;Alveolata;Dinophyta;Dinophyceae;Dino...
442,NODE_55_length_9298_cov_13.604855_g19_i1_2_4708_-,Karenia-brevis-SP1-20130916|289615_1,33.7,1063,639,24,267,1293,3,1035,8.550000e-149,491.0,Eukaryota;Alveolata;Dinophyta;Dinophyceae;Dino...
449,NODE_55_length_9298_cov_13.604855_g19_i1_4759_...,Karenia-brevis-SP1-20130916|289615_1,33.7,1063,639,24,211,1237,3,1035,3.320000e-149,491.0,Eukaryota;Alveolata;Dinophyta;Dinophyceae;Dino...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
720451,NODE_1961869_length_78_cov_2.724832_g1955307_i...,Kryptoperidinium-foliaceum-CCMP1326-20130916|1...,84.0,25,4,0,1,25,45,69,5.310000e-04,41.6,Eukaryota;Alveolata;Dinophyta;Dinophyceae;Dino...
720455,NODE_1962240_length_74_cov_2.055172_g1955678_i...,Ceratium-fusus-PA161109-20140214|182283_1,90.9,22,2,0,1,22,7,28,3.300000e-05,42.7,Eukaryota;Alveolata;Dinophyta;Dinophyceae;Dino...
720458,NODE_1962484_length_71_cov_2.323944_g1955922_i...,Durinskia-baltica-CSIRO_CS-38-20140214|100697_1,69.6,23,7,0,1,23,129,151,8.050000e-05,43.9,Eukaryota;Alveolata;Dinophyta;Dinophyceae;Dino...
720460,NODE_1962392_length_72_cov_2.405594_g1955830_i...,Karlodinium-micrum-CCMP2283-20140214|26356_1,77.3,22,5,0,2,23,81,102,4.360000e-04,40.8,Eukaryota;Alveolata;Dinophyta;Dinophyceae;Dino...


In [39]:
# Keep only the alignments whose taxonomy string contains 'Dinophyta'.
# na=False treats rows with no taxonomy as non-matches; without it the mask
# contains NaN for those rows and boolean indexing raises a ValueError.
is_dinophyta = df_coassembly_tax['taxonomy'].str.contains('Dinophyta', na=False)
df_coassembly_dino = df_coassembly_tax[is_dinophyta]
# Written relative to the current working directory, index included.
df_coassembly_dino.to_csv(dmd_file+'_dinophyta.tsv', sep='\t')

### Looking at different PhyloDB tables

In [23]:
# Take the first path returned for PhyloDB files whose name contains "taxonomy".
# Raises IndexError when nothing matches.
phylodb_tax = glob.glob(
    '/vortexfs1/omics/env-bio/collaboration/dinoflagellates_METZYME/'
    'databases/PhyloDB/*taxonomy*'
)[0]

In [27]:
# The `*taxonomy*` pattern also matches the uncompressed
# phylodb_1.076.taxonomy.txt that is opened as plain text earlier in this
# notebook, so do not assume gzip: check the two-byte gzip magic number and
# only decompress when it is present.
with open(phylodb_tax, 'rb') as handle:
    is_gzip = handle.read(2) == b'\x1f\x8b'
df_phylodb = pd.read_csv(phylodb_tax, sep="\t", compression='gzip' if is_gzip else None)

In [68]:
# Take the first path returned for PhyloDB files whose name contains "pep".
# Raises IndexError when nothing matches.
phylodb_pep = glob.glob(
    '/vortexfs1/omics/env-bio/collaboration/dinoflagellates_METZYME/'
    'databases/PhyloDB/*pep*'
)[0]

In [72]:
# Load the selected PhyloDB "pep" file as a gzip-compressed, tab-delimited table.
df = pd.read_csv(
    phylodb_pep,
    delimiter="\t",
    compression='gzip',
)