In [2]:
import os 
import pandas as pd 
import numpy as np 
# Every relative path used in this notebook is resolved against the project root.
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/cmi-pb-preds/')

# Root directory for all processed outputs.
outdir = 'results/main/cmi_pb_datasets/processed/'
os.makedirs(outdir, exist_ok=True)

# Directory holding the full pivoted tables. It is defined (and created) here
# because the 2020 cells write into it before the later cell that originally
# created it, which raised a NameError on a fresh kernel (Restart & Run All).
interm_dir = os.path.join(outdir, 'full/')
os.makedirs(interm_dir, exist_ok=True)

In [3]:
# Short assay labels paired with the file-name stems of the raw CSVs.
# The two tuples are position-aligned: assays[i] is read from the file named
# after longnames[i].
assay_file_stems = (
    ('cytof', 'live_cell_percentages'),
    ('olink', 'olink_prot_exp'),
    ('rnaseq', 'rnaseq'),
)
assays, longnames = zip(*assay_file_stems)

In [4]:
# Load the GENCODE annotation table (a headerless, tab-separated file) and
# name its six columns.
gencode = pd.read_table('results/refs/gencode/gencode.v38lift37.annotation.protein_coding.bed', header=None)
gencode.columns = ['chr', 'start', 'end', 'strand', 'gene_id', 'gene_name']

# Drop everything from the first '.' onwards to get the unversioned gene id.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the ids unchanged and silently break the
# later joins against unversioned Ensembl ids.
gencode['gene_nonversioned_id'] = gencode['gene_id'].str.replace(r'\..*', '', regex=True)

## Load the 2020 data

In [32]:
# Container for every 2020 table, keyed by table/assay name.
twenty_data = {}

# Read the 2020 subject and specimen tables.
for metatable in ('subject', 'specimen'):
    fn = 'results/main/cmi_pb_datasets/raw/2020LD_{}.csv'.format(metatable)
    twenty_data[metatable] = pd.read_table(fn, sep=',')

# Columns retained in the master metadata table, in output order.
meta_columns = [
    'subject_id',
    'specimen_id',
    'infancy_vac',
    'biological_sex',
    'year_of_birth',
    'date_of_boost',
    'actual_day_relative_to_boost',
    'planned_day_relative_to_boost',
    'ethnicity',
    'race',
    'dataset',
    'specimen_type',
    'visit',
]

# Join subjects to their specimens on subject_id and keep the columns above.
subjects = twenty_data['subject']
specimen = twenty_data['specimen']
master_meta = subjects.merge(specimen, on='subject_id')[meta_columns]
twenty_data['master_meta'] = master_meta

# Write the 2020 master metadata table as a gzipped TSV.
meta_fn = os.path.join(interm_dir, 'meta.2020.pivoted.tsv.gz')
master_meta.to_csv(meta_fn, sep='\t')

In [None]:
# Load each 2020 assay, pivot it to one row per specimen, and attach the
# 2020 metadata. The metadata is taken from twenty_data rather than the global
# `master_meta`, because the 2021 section rebinds that global; re-running this
# cell afterwards would otherwise merge against the 2021 metadata.
meta_2020 = twenty_data['master_meta']

for assay, longname in zip(assays, longnames):
    fn = 'results/main/cmi_pb_datasets/raw/2020LD_{}.csv'.format(longname)
    df = pd.read_table(fn, sep=',')

    if assay == 'rnaseq':
        # Strip the '.<digits>' version suffix from the Ensembl gene ids.
        # regex=True is required: pandas >= 2.0 defaults to a literal match,
        # which would leave the versioned ids in place.
        df['ensembl_gene_id'] = df['versioned_ensembl_gene_id'].str.replace(r'\.[0-9]+', '', regex=True)
        wide = df.pivot(index='specimen_id', columns='ensembl_gene_id', values='tpm')

    elif assay == 'cytof':
        wide = df.pivot(index='specimen_id', columns='cell_type_name', values='percent_live_cell')

    else:
        # olink: keep NPX measurements that passed QC and lie above the lower
        # limit of quantitation.
        # NOTE(review): the original comment asks "Still getting duplicate
        # specimens?" -- pivot raises ValueError on duplicate
        # (specimen_id, uniprot_id) pairs, so confirm the filters remove them.
        df = df[df['unit'] == "Normalized Protein eXpression"]
        df = df[df['quality_control'] == 'Pass']
        df = df[df['protein_expression'] > df['lower_limit_of_quantitation']]
        wide = df.pivot(index='specimen_id', columns='uniprot_id', values='protein_expression')

    # Inner merge: only specimens present in both tables are kept.
    twenty_data[assay] = meta_2020.merge(wide, on='specimen_id')
    

In [None]:
# Report the number of distinct subjects in every 2020 table.
for table_name in twenty_data:
    n_subjects = twenty_data[table_name]['subject_id'].nunique()
    print(table_name, n_subjects)

subject 60
specimen 60
master_meta 60
cytof 22
olink 18
rnaseq 38


In [None]:
# Split each 2020 assay table by planned day relative to boost and write one
# gzipped TSV per (assay, day).
for assay, assay_df in twenty_data.items():
    if assay not in ('cytof', 'olink', 'rnaseq'):
        continue

    print(assay)
    by_day = assay_df.groupby('planned_day_relative_to_boost')
    for day, day_df in by_day:
        print(day)
        basename = '{}.2020.day{}.pivoted.tsv.gz'.format(assay, day)
        outfn = os.path.join(interm_dir, basename)
        day_df.to_csv(outfn, sep='\t')

cytof
0
1
3
7
14
olink
0
1
3
7
14
rnaseq
0
1
3
7
14


## Load the 2021 data

In [None]:
# Container for every 2021 table, keyed by table/assay name.
twentyone_data = {}

# Read the 2021 subject and specimen tables.
for metatable in ('subject', 'specimen'):
    fn = 'results/main/cmi_pb_datasets/raw/2021BD_{}.csv'.format(metatable)
    twentyone_data[metatable] = pd.read_table(fn, sep=',')

# Columns retained in the master metadata table, in output order.
meta_columns = [
    'subject_id',
    'specimen_id',
    'infancy_vac',
    'biological_sex',
    'year_of_birth',
    'date_of_boost',
    'actual_day_relative_to_boost',
    'planned_day_relative_to_boost',
    'ethnicity',
    'race',
    'dataset',
    'specimen_type',
    'visit',
]

# Join subjects to their specimens on subject_id and keep the columns above.
subjects = twentyone_data['subject']
specimen = twentyone_data['specimen']
master_meta = subjects.merge(specimen, on='subject_id')[meta_columns]
twentyone_data['master_meta'] = master_meta

In [None]:
# Load each 2021 assay, pivot it to one row per specimen, and attach the
# 2021 metadata. The metadata is read from twentyone_data rather than the
# global `master_meta`, so the cell does not depend on which metadata cell
# ran last.
meta_2021 = twentyone_data['master_meta']

for assay, longname in zip(assays, longnames):
    fn = 'results/main/cmi_pb_datasets/raw/2021BD_{}.csv'.format(longname)
    df = pd.read_table(fn, sep=',')

    if assay == 'rnaseq':
        # Strip the '.<digits>' version suffix from the Ensembl gene ids.
        # regex=True is required: pandas >= 2.0 defaults to a literal match,
        # which would leave the versioned ids in place.
        df['ensembl_gene_id'] = df['versioned_ensembl_gene_id'].str.replace(r'\.[0-9]+', '', regex=True)
        wide = df.pivot(index='specimen_id', columns='ensembl_gene_id', values='tpm')

    elif assay == 'cytof':
        wide = df.pivot(index='specimen_id', columns='cell_type_name', values='percent_live_cell')

    else:
        # olink: unlike the 2020 loop, no unit / QC / quantitation-limit
        # filtering is applied before pivoting.
        wide = df.pivot(index='specimen_id', columns='uniprot_id', values='protein_expression')

    # Inner merge: only specimens present in both tables are kept.
    twentyone_data[assay] = meta_2021.merge(wide, on='specimen_id')
    

In [None]:
# Report the number of distinct subjects in every 2021 table.
for table_name in twentyone_data:
    n_subjects = twentyone_data[table_name]['subject_id'].nunique()
    print(table_name, n_subjects)

In [None]:
# Directory that receives the full pivoted tables.
interm_dir = os.path.join(outdir, 'full/')
os.makedirs(interm_dir, exist_ok=True)

# Output paths for the 2021 tables.
cytof_fn = os.path.join(interm_dir, 'cytof.2021.day0.pivoted.tsv.gz')
olink_fn = os.path.join(interm_dir, 'olink.2021.day0.pivoted.tsv.gz')
rnaseq_fn = os.path.join(interm_dir, 'rnaseq.2021.day0.pivoted.tsv.gz')
meta_fn = os.path.join(interm_dir, 'meta.2021.pivoted.tsv.gz')

# Write each table as a gzipped TSV, in the order cytof, olink, rnaseq, meta.
outputs_2021 = (
    ('cytof', cytof_fn),
    ('olink', olink_fn),
    ('rnaseq', rnaseq_fn),
    ('master_meta', meta_fn),
)
for table_key, out_fn in outputs_2021:
    twentyone_data[table_key].to_csv(out_fn, sep='\t')

In [None]:
# summary21 = []
# for assay, df in twentyone_data.items():
#     summary21.append([assay, df.subject_id.nunique()])

# summary20 = []
# for assay, df in twenty_data.items():
#     summary20.append([df.subject_id.nunique()])

# summary = pd.concat([pd.DataFrame(summary21), pd.DataFrame(summary20)], axis=1)
# summary.columns = ('table', 'nsamples21', 'nsamples20')
# summary = summary[['table', 'nsamples20', 'nsamples21']]

In [None]:
# fn = 'results/main/cmi_pb_datasets/2020LD_{}.csv'.format('olink_prot_exp')
# df = pd.read_table(fn, sep=',')

## Standardize RNA-seq

The RNA-seq dataset has many more variables than the other assays and needs more processing, so standardization involves several steps:

    1) remove zero variance genes,
    2) remove mitochondrial genes, 
    3) keep expressed genes, defined as those with TPM > 1 in more than a set proportion of samples (`cut_filter`),
    4) intersect genes across 2020 and 2021 and finally make harmonized 2020 and 2021 tables 

In [None]:
# Collect the unversioned ids of all genes annotated on chrM (mitochondrial).
is_mito = gencode['chr'] == 'chrM'
mito_ids = gencode.loc[is_mito, 'gene_nonversioned_id']
mito_genes = set(mito_ids.unique().tolist())

In [None]:
# Expression cutoff shared by the 2020 and 2021 RNA-seq filters: a gene is kept
# as "expressed" only if it has TPM > 1 in more than this fraction of samples.
cut_filter = 0.3

### RNA-seq for 2020

In [None]:
# Restrict the 2020 RNA-seq table to its gene columns (names starting with
# 'ENSG'), then collect the genes whose variance across rows is exactly zero.
rnaseq_2020 = twenty_data['rnaseq']
is_gene_col = rnaseq_2020.columns.str.match('ENSG')
tmp = rnaseq_2020.loc[:, is_gene_col]
tmpvars = tmp.var()
zero_var_genes = set(tmpvars[tmpvars.eq(0)].index.tolist())

In [None]:
# A gene counts as expressed when it has TPM > 1 in more than `cut_filter`
# (a fraction) of the rows of `tmp`.
n_samples = tmp.shape[0]
n_above_one = tmp.gt(1).sum()
is_expressed = n_above_one > cut_filter * n_samples
expressed_genes = set(is_expressed[is_expressed].index)

In [None]:
# Final gene list: expressed, not mitochondrial, not zero-variance, and
# present in the GENCODE annotation table.
annotated_genes = set(gencode.gene_nonversioned_id.values.tolist())
keep_genes = (expressed_genes - mito_genes - zero_var_genes) & annotated_genes

In [20]:
# Build the standardized 2020 RNA-seq table: the non-gene (metadata) columns
# followed by the retained gene columns.
rnaseq_2020 = twenty_data['rnaseq']
new_rnaseq1 = rnaseq_2020.loc[:, ~rnaseq_2020.columns.str.match('ENSG')]

# .loc does not accept a set as an indexer (TypeError in pandas >= 2.0);
# a sorted list also makes the gene column order deterministic across runs.
new_rnaseq2 = rnaseq_2020.loc[:, sorted(keep_genes)]

# Concatenate once and reuse the result instead of building it twice.
new_rnaseq = pd.concat([new_rnaseq1, new_rnaseq2], axis=1)
twenty_data['rnaseq_std'] = new_rnaseq

### RNA-seq for 2021

In [21]:
# Restrict the 2021 RNA-seq table to its gene columns (names starting with
# 'ENSG'), then collect the genes whose variance across rows is exactly zero.
rnaseq_2021 = twentyone_data['rnaseq']
is_gene_col = rnaseq_2021.columns.str.match('ENSG')
tmp = rnaseq_2021.loc[:, is_gene_col]
tmpvars = tmp.var()
zero_var_genes = set(tmpvars[tmpvars.eq(0)].index.tolist())

In [22]:
# A gene counts as expressed when it has TPM > 1 in more than `cut_filter`
# (a fraction) of the rows of `tmp`.
n_samples = tmp.shape[0]
n_above_one = tmp.gt(1).sum()
is_expressed = n_above_one > cut_filter * n_samples
expressed_genes = set(is_expressed[is_expressed].index)

In [23]:
# Final gene list: expressed, not mitochondrial, not zero-variance, and
# present in the GENCODE annotation table.
annotated_genes = set(gencode.gene_nonversioned_id.values.tolist())
keep_genes = (expressed_genes - mito_genes - zero_var_genes) & annotated_genes

In [24]:
# Build the standardized 2021 RNA-seq table: the non-gene (metadata) columns
# followed by the retained gene columns.
rnaseq_2021 = twentyone_data['rnaseq']

# The column mask must come from the 2021 table itself. The original built it
# from the 2020 table, whose column count differs, so the mask was misaligned
# with the frame it was applied to.
new_rnaseq1 = rnaseq_2021.loc[:, ~rnaseq_2021.columns.str.match('ENSG')]

# .loc does not accept a set as an indexer (TypeError in pandas >= 2.0);
# a sorted list also makes the gene column order deterministic across runs.
new_rnaseq2 = rnaseq_2021.loc[:, sorted(keep_genes)]
twentyone_data['rnaseq_std'] = pd.concat([new_rnaseq1, new_rnaseq2], axis=1)

### Harmonize the RNA-seq datasets 

In [25]:
# Gene ids (ENSG + digits) found among the column names of each standardized
# table; columns without such an id yield NaN and are dropped.
gene_id_pattern = '(ENSG[0-9]+)'

cols_2020 = twenty_data['rnaseq_std'].columns
cols_2021 = twentyone_data['rnaseq_std'].columns
twenty_genes = set(cols_2020.str.extract(gene_id_pattern)[0].dropna())
twentyone_genes = set(cols_2021.str.extract(gene_id_pattern)[0].dropna())

# Genes retained in both years.
shared_genes = twenty_genes & twentyone_genes

In [26]:
# Build the harmonized tables: for each year, the non-gene (metadata) columns
# followed by the genes shared between the 2020 and 2021 standardized tables.
# .loc does not accept a set as an indexer (TypeError in pandas >= 2.0), so
# the shared genes are converted to a sorted list once; this also gives both
# years the same, deterministic gene column order.
shared_gene_cols = sorted(shared_genes)

# getting harmonized for 2020
rnaseq_2020 = twenty_data['rnaseq']
new_rnaseq1 = rnaseq_2020.loc[:, ~rnaseq_2020.columns.str.match('ENSG')]
new_rnaseq2 = rnaseq_2020.loc[:, shared_gene_cols]
twenty_data['rnaseq_harmonized'] = pd.concat([new_rnaseq1, new_rnaseq2], axis=1)

# getting harmonized for 2021
rnaseq_2021 = twentyone_data['rnaseq']
new_rnaseq1 = rnaseq_2021.loc[:, ~rnaseq_2021.columns.str.match('ENSG')]
new_rnaseq2 = rnaseq_2021.loc[:, shared_gene_cols]
twentyone_data['rnaseq_harmonized'] = pd.concat([new_rnaseq1, new_rnaseq2], axis=1)

In [27]:
# Save the harmonized 2020 RNA-seq table into the harmonized directory.
harmony_dir = os.path.join(outdir, 'harmonized/')
os.makedirs(harmony_dir, exist_ok=True)

# Write the harmonized table to the 2020 path. The original wrote
# twenty_data['rnaseq_std'] to `rnaseq_fn`, which is the path of the full 2021
# RNA-seq file, so it overwrote that file with 2020 data and never created the
# harmonized 2020 file.
# NOTE(review): the file name says "day0", but no day filter is applied here --
# confirm whether the table should be restricted to planned day 0 first.
rnaseq_2020_fn = os.path.join(harmony_dir, 'rnaseq.2020.day0.pivoted.tsv.gz')
twenty_data['rnaseq_harmonized'].to_csv(rnaseq_2020_fn, sep='\t')

In [28]:
# Save the harmonized 2021 RNA-seq table. The original wrote the 2020
# standardized table (twenty_data['rnaseq_std']) to `rnaseq_fn`, the path of
# the full 2021 file, instead of writing the 2021 harmonized table here.
rnaseq_2021_fn = os.path.join(harmony_dir, 'rnaseq.2021.day0.pivoted.tsv.gz')
twentyone_data['rnaseq_harmonized'].to_csv(rnaseq_2021_fn, sep='\t')

## Standardize all other baseline datasets

For now, the other assays are used as pivoted, without further processing. The intermediate files in `interm_dir` are symlinked into the harmonized directory.

In [38]:
# SKIPPING AB TITERS FOR NOW
# # save abtiters 2020
# old_abtiters_fn = os.path.abspath(os.path.join(interm_dir, 'abtiters.2020.day0.pivoted.tsv.gz'))
# new_abtiters_fn = os.path.join(harmony_dir, 'abtiters.2020.day0.pivoted.tsv.gz')
# if not os.path.islink(new_abtiters_fn):
#     os.symlink(old_abtiters_fn, new_abtiters_fn)

# # save abtiters 2021
# old_abtiters_fn = os.path.abspath(os.path.join(interm_dir, 'abtiters.2021.day0.pivoted.tsv.gz'))
# new_abtiters_fn = os.path.join(harmony_dir, 'abtiters.2021.day0.pivoted.tsv.gz')
# if not os.path.islink(new_abtiters_fn):
#     os.symlink(old_abtiters_fn, new_abtiters_fn)

In [36]:
# Link the day-0 cytof tables (2020, then 2021) from the intermediate
# directory into the harmonized directory, skipping links that already exist.
for cytof_basename in ('cytof.2020.day0.pivoted.tsv.gz',
                       'cytof.2021.day0.pivoted.tsv.gz'):
    # The link target is made absolute before the symlink is created.
    old_cytof_fn = os.path.abspath(os.path.join(interm_dir, cytof_basename))
    new_cytof_fn = os.path.join(harmony_dir, cytof_basename)
    if os.path.islink(new_cytof_fn):
        continue
    os.symlink(old_cytof_fn, new_cytof_fn)

In [37]:
# Link the day-0 olink tables (2020, then 2021) from the intermediate
# directory into the harmonized directory, skipping links that already exist.
for olink_basename in ('olink.2020.day0.pivoted.tsv.gz',
                       'olink.2021.day0.pivoted.tsv.gz'):
    # The link target is made absolute before the symlink is created.
    old_olink_fn = os.path.abspath(os.path.join(interm_dir, olink_basename))
    new_olink_fn = os.path.join(harmony_dir, olink_basename)
    if os.path.islink(new_olink_fn):
        continue
    os.symlink(old_olink_fn, new_olink_fn)