# Import and process metatranscriptomic occurrences
## Using Dask to expedite the process

### If you don't have it already, download the metatranscriptomic_occurrences.tsv.gz file from Genoscope.


In [None]:
%%bash

# Fetch the gzip-compressed occurrence table into the current directory:
#   --location     follow HTTP redirects
#   --remote-name  save under the file name used in the URL
curl --location --remote-name https://www.genoscope.cns.fr/tara/localdata/data/Geneset-v1/metatranscriptomic_occurrences.tsv.gz

### Setup dask and dask jobqueue to process the data and facilitate reading the table into memory
Note: here we are running the computations on a Slurm scheduler-- but you can easily adapt the code to work on any variety of job scheduler. See: http://jobqueue.dask.org/en/latest/index.html

In [None]:
# Start Dask jobqueue on Slurm 
import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client

from dask_jobqueue import SLURMCluster

# Describe the Slurm job that hosts the Dask workers: 6 worker processes,
# 6 cores and 40GB of memory, submitted to the 'compute' queue with an
# 8 hour wall-time limit.
cluster = SLURMCluster(
    processes=6,
    cores=6,
    memory="40GB",
    queue='compute',
    name="TaraEuk",
    walltime='08:00:00',
)

# Ask the Slurm queue for workers. `cluster.start_workers(...)` is a deprecated
# dask-jobqueue API that newer releases no longer provide; `cluster.scale(...)`
# is the supported replacement.
# NOTE(review): whether the count means workers or jobs depends on the installed
# dask-jobqueue version -- recent releases also accept `cluster.scale(jobs=N)`
# to size by Slurm job count. Confirm the intended cluster size.
cluster.scale(40)

# Connect this notebook session to the cluster's scheduler.
# `Client` is already imported with the other imports at the top of this cell.
client = Client(cluster)

### Read in the metatranscriptomic_occurrences.tsv file from Carradec et al. 2018.

In [None]:
# Location of the metatranscriptomic occurrences table.
# Change this to wherever your copy of the file lives.
metaT_file = '/vortexfs1/omics/alexander/data/TARA/processed_data/Geneset-v1/metatranscriptomic_occurrences.tsv'

# Load the tab-separated table as a dask dataframe.
mdf = dd.read_table(metaT_file)

# Preview the first five rows to check that the columns parsed as expected.
mdf.head(n=5)

### 1) Sum occurrence data by sample location to get total FPKM per sample

In [None]:
# Group the occurrence table by sample code and sum the columns within each
# sample. This only builds the dask computation; nothing is evaluated yet.
smdf = mdf.groupby('sampleCode').sum()

# Evaluate the *grouped* sums into an in-memory pandas dataframe.
# (Previously this called `mdf.compute()`, which pulled the entire raw table
# into memory and wrote un-aggregated rows to the output file.)
pd_sample_sum = smdf.compute()

# A summed unigeneID is meaningless, so drop that column before writing.
# NOTE(review): `to_csv` writes comma-separated values by default even though
# the file is named .tsv; pass sep='\t' if downstream readers expect tabs.
pd_sample_sum.drop('unigeneID', axis=1).to_csv('sample_abundance.tsv')

### 2) Get all Micromonas genes and their associated abundances 

In [None]:
# Re-index the occurrence table by unigeneID so that rows can be selected
# by gene ID with `.loc` in the following cell.
imdf = mdf.set_index('unigeneID')  # slow step (setting an index on a dask dataframe is typically expensive)

In [None]:
# Load the Micromonas-related genes. The taxonomy subset file is expected to
# come from a grep search for "micromonas" in the taxonomy file.
microGenes = pd.read_table("micromonas.taxonomy.tsv")

# Collect the gene IDs to look up in the unigeneID-indexed occurrence table.
microGeneId = list(microGenes['geneID'])

# Select the rows for those genes (still a lazy dask dataframe) ...
micro_df = imdf.loc[microGeneId]

# ... evaluate the selection into pandas ...
micro_pd = micro_df.compute()

# ... and write the Micromonas occurrences to a new file.
micro_pd.to_csv('metaT-micromonas-ALL.csv')
