In [1]:
import pandas as pd
import os

In [2]:
# Root folder holding the per-dataset count tables; the loading cell below
# walks it recursively with os.walk.
counts_folder = './data/counts/eilat/'

In [3]:
# Load the two lookup tables that are later merged onto the counts.
# (No columns are renamed in this cell.)

# Per-dataset metadata for the Eilat samples.
# NOTE(review): the file has a .tsv extension but is read with the default
# comma separator. The merged preview further down shows the metadata columns
# parsed separately, so the file appears to be comma-delimited -- confirm.
metadata_file = "./data/eilat_metadata.20170109.tsv"
metadata_df = pd.read_csv(metadata_file)

# Info of the genomes used for mapping (tab-separated; the first column is used as the index)
genome_info_tsv = './data/genomes_names.20170109.mgii_virus_size_corrected.tsv'
genome_info_df = pd.read_csv(genome_info_tsv, sep = '\t', index_col=0)

In [4]:
# Concatenates the different tsv files with the counts into a tidy dataframe.
# Every file found under counts_folder is read as a header-less, tab-separated
# two-column table (genome_hash, count) and tagged with its dataset name, i.e.
# the part of the file name before the first '.'.
dataset_frames = []
for path, dirs, files in os.walk(counts_folder):
    # Sort in place so the traversal (and therefore the row order of the
    # result) does not depend on the file system's listing order.
    dirs.sort()
    for filename in sorted(files):
        dataset_name = filename.split('.')[0]
        file_path = os.path.join(path, filename)
        dataset_df = pd.read_csv(file_path, sep='\t', names=['genome_hash', 'count'])
        dataset_df['dataset'] = dataset_name
        dataset_frames.append(dataset_df)

# os.walk yields nothing for a missing or empty folder; fail here with a clear
# message instead of a KeyError in the merge cell.
if not dataset_frames:
    raise ValueError('No count files found under %r' % counts_folder)

# One concat at the end instead of one per file: growing a DataFrame inside the
# loop re-copies all previously loaded rows on every iteration.
counts_df = pd.concat(dataset_frames, ignore_index=True)

In [5]:
# Attach the per-dataset metadata (keyed on 'dataset') and then the genome
# information (keyed on 'genome_hash'). Both are left joins, so every count
# row is kept even when no match is found.
counts_df = (
    counts_df
    .merge(metadata_df, how='left', on='dataset')
    .merge(genome_info_df, how='left', on='genome_hash')
)

In [6]:
# Preview the first rows of the merged table (counts plus the metadata and
# genome-info columns).
counts_df.head()

Unnamed: 0,genome_hash,count,dataset,toal_read_count,PE_read_count,total_base_count,Longitude,Latitude,Fraction,Depth,index,genome_group,genome_name,genome_size
0,c84cc511d5c752080560170a686fbd8d,11799,EG06,49660188.0,24830094.0,2507839000.0,29.53,34.97,gDNA,5.0,gDNA_6_final,Cyanophage,gi|326783659|ref|NC_015287.1| Synechococcus ph...,232878
1,790dffecf0995b43d78fa92e5cd1e364,0,EG06,49660188.0,24830094.0,2507839000.0,29.53,34.97,gDNA,5.0,gDNA_6_final,Euryarchaeota,gi|932822996|dbj|BBBE01000013.1| Thermococcus ...,71728
2,e3085b25fedc1064bd16b67cc16c723c,0,EG06,49660188.0,24830094.0,2507839000.0,29.53,34.97,gDNA,5.0,gDNA_6_final,Euryarchaeota,gi|932822980|dbj|BBBE01000029.1| Thermococcus ...,2100
3,374d112a90d61db8c233e986d45061dd,4322,EG06,49660188.0,24830094.0,2507839000.0,29.53,34.97,gDNA,5.0,gDNA_6_final,MGII,contig__157656,67929
4,ec6bc2d0a8055d648d0cde3779f38377,85,EG06,49660188.0,24830094.0,2507839000.0,29.53,34.97,gDNA,5.0,gDNA_6_final,Cyanophage,gi|472340512|ref|NC_020847.1| Cyanophage MED4-...,38327


In [7]:
# Count normalization: raw count divided by the genome size in thousands and
# by the number of paired-end reads in millions.
# (genome_size is presumably in base pairs -- TODO confirm.)
genome_size_thousands = counts_df['genome_size'] / 1000
pe_reads_millions = counts_df['PE_read_count'] / 1000000
counts_df['n_count'] = counts_df['count'] / (genome_size_thousands * pe_reads_millions)

In [8]:
# Tags viral datasets: a dataset is flagged as viral when the second character
# of its name is 'V'.
# The vectorised .str accessor is used instead of a per-element lambda: names
# shorter than two characters (or missing) give False rather than raising.
viral_criterion_fraction = counts_df['dataset'].str[1] == 'V'

In [9]:
# Overwrite 'Fraction' with the fraction of origin: rows of viral datasets get
# 'VIRUS', all remaining rows get 'BACT'. The two masks are complementary, so
# the order of the assignments does not matter.
non_viral_rows = ~viral_criterion_fraction
counts_df.loc[non_viral_rows, 'Fraction'] = 'BACT'
counts_df.loc[viral_criterion_fraction, 'Fraction'] = 'VIRUS'

In [10]:
# Every row gets the same station label.
counts_df['Station'] = "Eilat"

In [11]:
# Columns to keep in the exported table, in their final left-to-right order.
column_order = [
    'genome_group',
    'genome_hash',
    'count',
    'n_count',
    'Fraction',
    'dataset',
    'Station',
    'Latitude',
    'Longitude',
    'Depth',
    'PE_read_count',
    'genome_name',
    'genome_size',
]

In [12]:
# Keep only the columns named in column_order, in that order; any other
# column is dropped.
counts_df = counts_df[column_order]

In [13]:
# Summary statistics of the numeric columns.
# NOTE(review): the summary shows Latitude = 34.97 and Longitude = 29.53 on
# every row; for Eilat these two values look swapped -- confirm against the
# metadata file.
counts_df.describe()

Unnamed: 0,count,n_count,Latitude,Longitude,Depth,PE_read_count,genome_size
count,3084.0,3084.0,3084.0,3084.0,3084.0,3084.0,3084.0
mean,93086.36,1.897679,34.97,29.53,5.0,33507620.0,320031.4
std,814772.1,11.271321,7.10658e-15,1.065987e-14,0.0,9620542.0,664065.0
min,0.0,0.0,34.97,29.53,5.0,21070710.0,503.0
25%,0.0,0.0,34.97,29.53,5.0,24172640.0,25611.0
50%,17.0,0.008332,34.97,29.53,5.0,34139630.0,70285.0
75%,1082.0,0.23321,34.97,29.53,5.0,41789020.0,161440.0
max,22999250.0,233.673538,34.97,29.53,5.0,46772100.0,3148033.0


In [14]:
# Export the normalized table as a tab-separated file, path relative to the
# current working directory. The DataFrame index is written as well (to_csv
# default), so the file starts with an extra index column.
counts_df.to_csv('normalized_reads.eilat.20170115.tsv', sep='\t')