In [42]:
import os 
import pandas as pd 
import numpy as np 
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/cmi-pb-preds/')
outdir = 'results/main/cmi_pb_datasets/processed/'
os.makedirs(outdir, exist_ok=True)

In [43]:
assays = ('cytof', 'olink', 'rnaseq')
longnames = ('live_cell_percentages', 'olink_prot_exp', 'rnaseq')

# Load the 2021 data

In [44]:
twentyone_data = {}

# get meta master table
for metatable in ('subject', 'specimen'): 
    fn = 'results/main/cmi_pb_datasets/raw/2021BD_{}.csv'.format(metatable)
    df = pd.read_table(fn, sep=',')
    twentyone_data[metatable] = df
    
subjects = twentyone_data['subject']
specimen = twentyone_data['specimen']
master_meta = subjects.merge(specimen, on='subject_id')
master_meta = master_meta[['subject_id',
                           'specimen_id',
                           'infancy_vac',
                           'biological_sex',
                           'year_of_birth',
                           'date_of_boost',
                           'actual_day_relative_to_boost',
                           'planned_day_relative_to_boost',
                           'ethnicity',
                           'race',
                           'dataset',
                           'specimen_type',
                           'visit']]
twentyone_data['master_meta'] = master_meta

In [45]:
for i, longname in enumerate(longnames): 
    fn = 'results/main/cmi_pb_datasets/raw/2021BD_{}.csv'.format(longname)
    df = pd.read_table(fn, sep=',')
    
    if assays[i] == 'rnaseq':
        df.loc[:, 'ensembl_gene_id'] = df.loc[:, 'versioned_ensembl_gene_id'].str.replace('\.[0-9]+', '')
        df = df.pivot(index='specimen_id', columns='ensembl_gene_id', values='tpm')
        df = master_meta.merge(df, on='specimen_id')
        
    elif assays[i] == 'cytof':
        df = df.pivot(index='specimen_id', columns='cell_type_name', values='percent_live_cell')
        df = master_meta.merge(df, on='specimen_id')
        
    else:
        df = df.pivot(index='specimen_id', columns='uniprot_id', values='protein_expression')
        df = master_meta.merge(df, on='specimen_id')        
    twentyone_data[assays[i]] = df
    

In [46]:
for assay, df in twentyone_data.items():
    print(assay, df.subject_id.nunique())

subject 36
specimen 36
master_meta 36
cytof 33
olink 36
rnaseq 28


In [68]:
# save cytof 
cytof_fn = os.path.join(outdir, 'cytof.2021.day0.pivoted.tsv.gz')
twentyone_data['cytof'].to_csv(cytof_fn, sep='\t')

# save olink
olink_fn = os.path.join(outdir, 'olink.2021.day0.pivoted.tsv.gz')
twentyone_data['olink'].to_csv(olink_fn, sep='\t')

# save rnaseq
rnaseq_fn = os.path.join(outdir, 'rnaseq.2021.day0.pivoted.tsv.gz')
twentyone_data['rnaseq'].to_csv(rnaseq_fn, sep='\t')

# save meta
meta_fn = os.path.join(outdir, 'meta.2021.day0.pivoted.tsv.gz')
twentyone_data['master_meta'].to_csv(meta_fn, sep='\t')

## Load the 2020 data

In [50]:
twenty_data = {}

# get meta master table
for metatable in ('subject', 'specimen'): 
    fn = 'results/main/cmi_pb_datasets/raw/2020LD_{}.csv'.format(metatable)
    df = pd.read_table(fn, sep=',')
    twenty_data[metatable] = df
    
subjects = twenty_data['subject']
specimen = twenty_data['specimen']
master_meta = subjects.merge(specimen, on='subject_id')
master_meta = master_meta[['subject_id',
                           'specimen_id',
                           'infancy_vac',
                           'biological_sex',
                           'year_of_birth',
                           'date_of_boost',
                           'actual_day_relative_to_boost',
                           'planned_day_relative_to_boost',
                           'ethnicity',
                           'race',
                           'dataset',
                           'specimen_type',
                           'visit']]
twenty_data['master_meta'] = master_meta

In [51]:
for i, longname in enumerate(longnames): 
    fn = 'results/main/cmi_pb_datasets/raw/2020LD_{}.csv'.format(longname)
    df = pd.read_table(fn, sep=',')
    
    if assays[i] == 'rnaseq':
        df.loc[:, 'ensembl_gene_id'] = df.loc[:, 'versioned_ensembl_gene_id'].str.replace('\.[0-9]+', '')
        df = df.pivot(index='specimen_id', columns='ensembl_gene_id', values='tpm')
        df = master_meta.merge(df, on='specimen_id')
        
    elif assays[i] == 'cytof':
        df = df.pivot(index='specimen_id', columns='cell_type_name', values='percent_live_cell')
        df = master_meta.merge(df, on='specimen_id')
        
    else:
        # Still getting duplicate specimens? 
        df = df[df['unit'] == "Normalized Protein eXpression"]
        df = df[df['quality_control'] == 'Pass']
        df = df[df['protein_expression'] > df['lower_limit_of_quantitation']]
        df = df.pivot(index='specimen_id', columns='uniprot_id', values='protein_expression')
        df = master_meta.merge(df, on='specimen_id')        
    twenty_data[assays[i]] = df
    

In [52]:
for assay, df in twenty_data.items():
    print(assay, df.subject_id.nunique())

subject 60
specimen 60
master_meta 60
cytof 22
olink 18
rnaseq 38


In [67]:
for assay, df in twenty_data.items():
    if assay in ['cytof', 'olink', 'rnaseq']:
        print(assay)
        for day, day_df in df.groupby('planned_day_relative_to_boost'):
            print(day)
            outfn = os.path.join(outdir, '{}.2020.day{}.pivoted.tsv.gz'.format(assay, day))
            day_df.to_csv(outfn, sep='\t')

cytof
0
1
3
7
14
olink
0
1
3
7
14
rnaseq
0
1
3
7
14


In [69]:
# summary21 = []
# for assay, df in twentyone_data.items():
#     summary21.append([assay, df.subject_id.nunique()])

# summary20 = []
# for assay, df in twenty_data.items():
#     summary20.append([df.subject_id.nunique()])

# summary = pd.concat([pd.DataFrame(summary21), pd.DataFrame(summary20)], axis=1)
# summary.columns = ('table', 'nsamples21', 'nsamples20')
# summary = summary[['table', 'nsamples20', 'nsamples21']]

In [74]:
# fn = 'results/main/cmi_pb_datasets/2020LD_{}.csv'.format('olink_prot_exp')
# df = pd.read_table(fn, sep=',')

## Standardize RNA-seq

In [None]:
# load gene data to find mitochondrial genes
gencode = pd.read_table('../../projects/dchallenge/results/refs/ensembl/gencode.v19.annotation.bed', header=None)
gencode.columns = ['chr', 'start', 'end', 'strand', 'kind', 'gene_id', 'gene_name']
mito_genes = gencode.loc[gencode.chr == 'chrM']
mito_genes = set(mito_genes.gene_id.unique().tolist())

In [214]:
# find zero variance genes 
tmp = twenty_data['rnaseq']
tmp = tmp.loc[:, tmp.columns.str.match('ENSG')]
tmpvars = tmp.var()
genes_zv = tmpvars[(tmpvars == 0)]
genes_zv = set(genes_zv.index.tolist())
#num_zerovars = len(zeros) / twenty_data['rnaseq'].shape[1]

# create a set of bad genes
bad_genes = genes_zv.union(mito_genes)

# get a dataframe of just the rnaseq metadata
new_rnaseq1 = twenty_data['rnaseq'].loc[:, ~twenty_data['rnaseq'].columns.str.match('ENSG')]

# get a dataframe of just the rnaseq with non-zero genes and non-mito genes 
new_genes = twenty_data['rnaseq'].columns.str.match('ENSG')
new_genes = twenty_data['rnaseq'].columns[new_genes] 
new_genes = [x for x in new_genes if (x not in bad_genes) and (x in coding_genes)]
new_rnaseq2 = twenty_data['rnaseq'].loc[:, new_genes]

# make the final new_rnaseq table
new_rnaseq = pd.concat([new_rnaseq1, new_rnaseq2], axis=1)

In [229]:
tmp = new_rnaseq2.applymap(lambda x: x > 1)

In [230]:
tmp.sum(axis=1)

0      17896
1      17037
2      17884
3      17265
4      17783
       ...  
185    17070
186    16848
187    17253
188    16318
189    16979
Length: 190, dtype: int64

In [222]:
def check_lt_thresh(l, thresh):
    num_lt = 0
    for x in l:
        if x <= thresh:
            num_lt += 1
    return(num_lt)

In [225]:
new_rnaseq2.apply(lambda x: check_lt_thresh(x), axis=1, kwds={'thresh': 1})

TypeError: <lambda>() got an unexpected keyword argument 'kwds'

In [154]:
# stitch them together
twenty_data['rnaseq_std'] = 

In [155]:
twenty_data['rnaseq_std'] 

Unnamed: 0,subject_id,specimen_id,infancy_vac,biological_sex,year_of_birth,date_of_boost,actual_day_relative_to_boost,planned_day_relative_to_boost,ethnicity,race,...,ENSG00000284734,ENSG00000284735,ENSG00000284736,ENSG00000284737,ENSG00000284738,ENSG00000284740,ENSG00000284742,ENSG00000284744,ENSG00000284747,ENSG00000284748
0,1,1,wP,Female,1986-01-01,2016-09-12,-3,0,Not Hispanic or Latino,White,...,1.256,1.521,0.000,0.355,3.652,1.219,0.0,0.00,1.173,0.000
1,1,3,wP,Female,1986-01-01,2016-09-12,1,1,Not Hispanic or Latino,White,...,1.912,0.454,0.000,0.318,4.266,0.273,0.0,0.00,0.657,0.000
2,1,4,wP,Female,1986-01-01,2016-09-12,3,3,Not Hispanic or Latino,White,...,0.704,1.026,0.000,0.000,2.670,0.000,0.0,0.00,0.913,0.000
3,1,5,wP,Female,1986-01-01,2016-09-12,7,7,Not Hispanic or Latino,White,...,0.779,0.982,0.000,0.000,2.338,0.473,0.0,0.00,0.947,0.000
4,1,6,wP,Female,1986-01-01,2016-09-12,11,14,Not Hispanic or Latino,White,...,1.230,1.128,0.000,0.000,2.544,0.000,0.0,0.00,0.783,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,53,405,aP,Female,1998-01-01,2017-01-03,-28,0,Hispanic or Latino,Unknown or Not Reported,...,0.924,0.684,0.000,0.000,1.987,0.449,0.0,0.15,0.755,0.000
186,53,406,aP,Female,1998-01-01,2017-01-03,1,1,Hispanic or Latino,Unknown or Not Reported,...,1.584,0.898,0.000,0.000,1.898,0.405,0.0,0.00,1.071,0.101
187,53,407,aP,Female,1998-01-01,2017-01-03,3,3,Hispanic or Latino,Unknown or Not Reported,...,1.075,0.835,1.238,0.000,1.778,0.602,0.0,0.00,0.901,0.000
188,53,408,aP,Female,1998-01-01,2017-01-03,8,7,Hispanic or Latino,Unknown or Not Reported,...,0.583,0.336,0.000,0.000,1.579,0.404,0.0,0.00,0.486,0.000
