In [1]:
import pandas as pd
import os

In [2]:
# Root folder of the per-dataset TARA count tables; it is walked recursively
# when the counts are loaded.
counts_folder = "./data/counts/TARA/"

In [3]:
# Load the metadata tables and rename the columns used downstream.

# TARA metadata: the ENA run accession becomes the 'dataset' key and the ENA
# spot count becomes 'dataset_size'.
metadata_file = "./data/TARA_metadata.csv"
metadata_renames = {'ENA-RUN': 'dataset', "ENA-SPOT-COUNT": "dataset_size"}
metadata_df = pd.read_csv(metadata_file).rename(columns=metadata_renames)

# Keep only the columns that are merged onto the counts later on.
metadata_slim_columns = [
    "dataset",
    "Station",
    "Longitude Start",
    "Latitude Start",
    "Depth",
    "Protocol Label",
    "dataset_size",
]
metadata_df_slim = metadata_df[metadata_slim_columns]

# Info of the genomes used for mapping (tab-separated, first column as index).
genome_info_tsv = './data/genomes_names.20170109.mgii_virus_size_corrected.tsv'
genome_info_df = pd.read_csv(genome_info_tsv, sep='\t', index_col=0)

In [4]:
# Concatenates the different tsv files with the counts into a tidy dataframe.
# Every file found under `counts_folder` (recursively) is read as a headerless,
# tab-separated table with two columns (genome_hash, count). The dataset name
# is the file name up to its first '.' and is stored in a 'dataset' column.
dataset_frames = []
for path, dirs, files in os.walk(counts_folder):
    for filename in files:
        dataset_name = filename.split('.')[0]
        file_path = os.path.join(path, filename)
        # Builtin `int` replaces `pd.np.int`: `pandas.np` and `numpy.int` have
        # both been removed, and `numpy.int` was only an alias of `int`.
        dataset_df = pd.read_csv(
            file_path,
            sep='\t',
            names=['genome_hash', 'count'],
            dtype={'genome_hash': str, 'count': int},
        )
        dataset_df['dataset'] = dataset_name
        dataset_frames.append(dataset_df)

# Concatenate once instead of growing the frame inside the loop, which copies
# all accumulated rows on every iteration. `pd.concat` rejects an empty list,
# so fall back to an empty frame when no file was found (as before).
counts_df = pd.concat(dataset_frames) if dataset_frames else pd.DataFrame()

In [5]:
# Add metadata columns: left-join the slim TARA metadata on 'dataset', then
# the genome info on 'genome_hash'. With left joins every row of the counts
# table is kept, even when no match is found.
counts_df = (
    counts_df
    .merge(metadata_df_slim, on='dataset', how='left')
    .merge(genome_info_df, on='genome_hash', how='left')
)

In [6]:
# Count normalization: divide the raw count by the genome size in thousands
# and by half the dataset size in millions.
# NOTE(review): the halving presumably converts reads to read pairs - confirm.
genome_size_thousands = counts_df['genome_size'] / 1000
half_dataset_size_millions = counts_df['dataset_size'] / 2 / 1000000
counts_df['n_count'] = counts_df['count'] / (genome_size_thousands * half_dataset_size_millions)

In [7]:
# Tags viral datasets: boolean mask that is True where the row's
# 'Protocol Label' is one of the labels listed below.
viral_protocol_labels = [
    'GIRUS_NUC-dry_W0.1-0.22',
    'VIRUS_NUC-DNA-Fe(20L)_W<-0.22',
    'VIRUS_NUC-Fe_W<-0.22',
]
viral_criterion_fraction = counts_df['Protocol Label'].isin(viral_protocol_labels)

In [8]:
# Name the Fraction of origin: every row starts as 'BACT', and the rows
# flagged by the viral mask are then overwritten with 'VIRUS'.
counts_df['Fraction'] = 'BACT'
counts_df.loc[viral_criterion_fraction, 'Fraction'] = 'VIRUS'

In [9]:
# Column order for the final dataframe.
column_order = [
    'genome_group',
    'genome_hash',
    'count',
    'n_count',
    'Fraction',
    'dataset',
    'Station',
    'Latitude Start',
    'Longitude Start',
    'Depth',
    'Protocol Label',
    'dataset_size',
    'genome_name',
    'genome_size',
]

In [10]:
# Keep exactly the columns named in `column_order`, in that order.
counts_df = counts_df.loc[:, column_order]

In [11]:
# Summary statistics of the numeric columns, shown as the cell output.
counts_df.describe()

Unnamed: 0,count,n_count,Latitude Start,Longitude Start,Depth,dataset_size,genome_size
count,102543.0,102543.0,102543.0,102543.0,102543.0,102543.0,102543.0
mean,30044.76,0.884748,-3.038715,-31.783807,143.609023,93056160.0,320031.4
std,426750.5,6.843985,25.55676,73.07063,243.561214,65742570.0,663960.6
min,0.0,0.0,-62.2231,-159.046,5.0,2527066.0,503.0
25%,0.0,0.0,-21.0292,-96.0232,5.0,38042260.0,25611.0
50%,36.0,0.016568,-8.9109,-17.9099,30.0,52556400.0,70285.0
75%,695.0,0.181444,18.7341,32.898,120.0,164620000.0,161440.0
max,37795790.0,617.416016,43.7684,73.9067,1000.0,269556900.0,3148033.0


In [12]:
# Export the normalized counts as a tab-separated file in the working
# directory. The row index is written too (the pandas default, spelled out
# here), so the file starts with an unnamed index column.
counts_df.to_csv('normalized_reads.tara.20170115.tsv', sep='\t', index=True)