In [1]:
import pandas as pd
import os
import glob

In [2]:
# Display the kernel's current working directory before it is changed below.
os.getcwd()

'/vortexfs1/omics/env-bio/collaboration/dinoflagellates_METZYME/jupyter-notebooks'

In [3]:
# Switch the working directory so that the DIAMOND output file named below
# can be opened by its bare file name.
mrna_dir = (
    '/vortexfs1/omics/env-bio/collaboration/dinoflagellates_METZYME/data/metaT_trimmed_reads/'
    'fasta_files/paired/mRNA'
)
os.chdir(mrna_dir)

# File name (relative to the directory above) of the tab-separated DIAMOND hits.
diamond_taxon = 'dino_metzyme_annotated_coassembly_diamond_out.tsv'

In [4]:
# One (short name, descriptive name) pair per column of the hits table, in
# file order. Both name lists below are derived from this single table so
# they cannot drift out of step with each other.
hit_columns = [
    ('qseqid', 'Query ID'),
    ('sseqid', 'Subject ID'),
    ('pident', 'Percentage of identical matches'),
    ('length', 'Alignment length'),
    ('mismatch', 'Number of mismatches'),
    ('gapopen', 'Number of gap openings'),
    ('qstart', 'Start of alignment in query'),
    ('qend', 'End of alignment in query'),
    ('sstart', 'Start of alignment in subject'),
    ('send', 'End of alignment in subject'),
    ('evalue', 'Expected value'),
    ('bitscore', 'Bit score'),
]

full_col_names = [description for _, description in hit_columns]
short_col_names = [short for short, _ in hit_columns]

# The file is read as tab-separated with the short names supplied as the
# column labels (no header row is taken from the file).
df_coassembly = pd.read_csv(
    diamond_taxon,
    names=short_col_names,
    sep='\t',
)

In [5]:
# Preview the first rows of the hits table to check the column assignment.
df_coassembly.head()

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,NODE_1_length_22561_cov_36.839973_g0_i0_72_1721_-,FBALC1_15187-NZ_ABHI01000001,95.0,543,27,0,1,543,1,543,0.0,1028.0
1,NODE_3_length_16954_cov_20.440858_g2_i0_1_3114_+,FBALC1_13442-NZ_ABHI01000001,91.1,708,63,0,330,1037,1,708,0.0,1292.0
2,NODE_4_length_16807_cov_24.028322_g1_i1_1_642_+,ORF0203-A168I18DRAFT_NOD,74.2,209,54,0,4,212,21,229,1.05e-106,313.0
3,NODE_5_length_16271_cov_37.097901_g3_i0_3_3659_-,ORF0200-A168I18DRAFT_NOD,61.1,1227,466,6,1,1219,1,1224,0.0,1501.0
4,NODE_6_length_16015_cov_31.066483_g4_i0_23_502_-,Favella_taraikaensis_FeNarragansettBay-2014022...,76.1,155,37,0,1,155,1,155,7.669999999999999e-77,238.0


In [6]:
# Collect the distinct subject IDs that appear in the hits table.
# NOTE(review): this displays every unique ID; consider
# df_coassembly['sseqid'].nunique() or a small sample to keep the output short.
set(df_coassembly['sseqid'])

{'MMETSP1338-20131121|6648_1',
 'MMETSP0191-20121206|17181_1',
 'MMETSP1058-20130122|36109_1',
 'BZARG_2586-GCA_000224335.2_contig00094',
 'MMETSP1451-20131203|17404_1',
 'Turpa_3587-Contig189',
 'MMETSP1439-20131203|70800_1',
 'CC_3127-NC_002696',
 'Karlodinium-micrum-CCMP2283-20140214|8097_1',
 'MMETSP0208-20121228|4623_1',
 'Karenia-brevis-SP3-20130916|15909_1',
 'MMETSP0784-20121206|61767_1',
 'MMETSP1176-20130426|8522_1',
 'MMETSP1338-20131121|9767_1',
 'O71_10419-GCA_000277005.1_contig70',
 'AciPR4_1797-NC_014963',
 'Kryptoperidinium-foliaceum-CCMP1326-20130916|247120_1',
 'Symbiodinium-sp-C1-20140214|81997_1',
 'CCA19390-FR824115ENA1',
 'Prorocentrum-minimum-CCMP1329-20131001|262934_1',
 'MMETSP0448-20130528|125034_1',
 'MMETSP0190_2-20130828|24720_1',
 'MMETSP0467-20121206|9038_1',
 'Karlodinium-micrum-CCMP2283-20140214|112361_1',
 'ORF05897-TG_ggo_6089',
 'Oxyrrhis-marina-LB1974-20131105|4282_1',
 'Gura_3468-NC_009483',
 'jgi_16357_43.m000162-Auran1',
 'Karenia-brevis-CCMP2229

In [11]:
### Adapted from Logan Whitehouse's lab:
# https://github.com/Lswhiteh/phylodbannotation/blob/master/fastaannotation.py

taxonomy_file = "/vortexfs1/omics/env-bio/collaboration/dinoflagellates_METZYME/databases/PhyloDB/phylodb_1.076.taxonomy.txt"
gene_file = "/vortexfs1/omics/env-bio/collaboration/dinoflagellates_METZYME/databases/PhyloDB/phylodb_1.076.annotations.txt"

tax_dict = {}
gene_dict = {}

# Both files are parsed the same way: every line is stripped, split on tabs,
# and stored as {first field: [remaining fields]}. A later line with the same
# first field replaces the earlier entry.
for lookup_path, lookup in ((taxonomy_file, tax_dict), (gene_file, gene_dict)):
    with open(lookup_path) as handle:
        lookup.update(
            (fields[0], fields[1:])
            for fields in (record.strip().split("\t") for record in handle)
        )

In [13]:
# Look up each hit's subject ID in the annotation lookup. Subject IDs that
# have no entry in gene_dict become NaN instead of a list.
gene_mapping = df_coassembly['sseqid'].map(gene_dict)

## Take element 1 of each annotation record (organism classification).
# .str.get() is used instead of round-tripping through
# pd.DataFrame(values.tolist()): that construction raises as soon as one
# record is NaN (unmapped ID), and it throws away the original row index.
organism_df = gene_mapping.str.get(1)

# Map the organism to its taxonomy record and take element 1 of that record,
# which is the ';'-delimited string that is split into columns below.
taxonomy_mapping = organism_df.map(tax_dict)
tax_df = taxonomy_mapping.str.get(1)

# Split the taxonomy string into one column per grouping. expand=True keeps
# rows without a taxonomy as all-missing rather than failing, and keeps the
# index aligned with df_coassembly for the concat.
tax_df = tax_df.str.split(';', expand=True)
# Assigning eight labels still raises ValueError if the split does not yield
# exactly eight columns, which flags an unexpected taxonomy format early.
tax_df.columns = ['Domain', 'Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Strain_name']

# Append the taxonomy columns to the hits table, row for row.
df_coassembly_tax = pd.concat([df_coassembly, tax_df], axis=1)

In [267]:
# Build the output name from the DIAMOND file name without its extension.
# os.path.splitext is used rather than a fixed [:-4] slice so the stem stays
# correct if the input name ever has a different-length (or no) extension.
dmd_file = os.path.splitext(diamond_taxon)[0]

# Write the taxonomy-annotated hits as a tab-separated file next to the input
# (the DataFrame index is written as the first, unnamed column).
df_coassembly_tax.to_csv(dmd_file+'_taxonomy.tsv', sep='\t')

In [14]:
# Show only the hits whose 'Phylum' column is exactly 'Dinophyta'.
df_coassembly_tax[df_coassembly_tax['Phylum'] == 'Dinophyta']

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,Domain,Kingdom,Phylum,Class,Order,Family,Genus,Strain_name


In [23]:
# Find the paths in the PhyloDB directory whose name contains 'taxonomy' and
# keep the first one glob returns (IndexError if nothing matches).
taxonomy_matches = glob.glob(
    '/vortexfs1/omics/env-bio/collaboration/dinoflagellates_METZYME/'
    'databases/PhyloDB/*taxonomy*'
)
phylodb_tax = taxonomy_matches[0]

In [27]:
# Read the file selected in phylodb_tax as a gzip-compressed, tab-separated table.
# NOTE(review): compression is forced to gzip, so this presumably expects the
# glob above to have picked a gzipped file — confirm.
df_phylodb = pd.read_csv(phylodb_tax, sep="\t", compression='gzip')

In [68]:
# Find the paths in the PhyloDB directory whose name contains 'pep' and keep
# the first one glob returns (IndexError if nothing matches).
pep_matches = glob.glob(
    '/vortexfs1/omics/env-bio/collaboration/dinoflagellates_METZYME/'
    'databases/PhyloDB/*pep*'
)
phylodb_pep = pep_matches[0]

In [72]:
# Read the file selected in phylodb_pep as a gzip-compressed, tab-separated table.
# NOTE(review): `df` is a generic name for this table; a descriptive name
# would make later cells easier to follow.
df = pd.read_csv(phylodb_pep, sep="\t", compression='gzip')