In [1]:
import os 
import pandas as pd 
import numpy as np 
# Project root: defaults to the original cluster location, but can be redirected
# by exporting CMI_PB_PROJECT_DIR so the notebook is runnable on other machines.
project_dir = os.environ.get('CMI_PB_PROJECT_DIR',
                             '/mnt/BioHome/jreyna/jreyna/projects/cmi-pb-preds/')
os.chdir(project_dir)

# Every path used below is relative to the project root set above.
outdir = 'results/main/cmi_pb_datasets/processed/'
os.makedirs(outdir, exist_ok=True)

IgG1 and IgG4 day 14 and day 0 values for PT, FHA, and Pertactin

In [2]:
# Each pair is (short assay key, file-name stem of that assay's raw CSV).
# `assays` and `longnames` are derived as parallel tuples in this same order,
# because later cells look up `assays[i]` while enumerating `longnames`.
_assay_file_stems = (
    ('abtiters', 'ab_titer'),
    ('cytof', 'live_cell_percentages'),
    ('olink', 'olink_prot_exp'),
    ('rnaseq', 'rnaseq'),
)
assays = tuple(key for key, _stem in _assay_file_stems)
longnames = tuple(stem for _key, stem in _assay_file_stems)

In [3]:
# Load the GENCODE annotation table (the file has no header row).
gencode = pd.read_table('results/refs/gencode/gencode.v38lift37.annotation.protein_coding.bed', header=None)
gencode.columns = ['chr', 'start', 'end', 'strand', 'gene_id', 'gene_name']

# Strip everything from the first '.' onward to obtain an unversioned gene ID.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the IDs unchanged. The raw
# string avoids the invalid-escape warning for '\.'.
gencode['gene_nonversioned_id'] = gencode['gene_id'].str.replace(r'\..*', '', regex=True)

In [4]:
# Per-assay tables that have not been harmonized across years are written here.
# Create the directory now: later cells write into it before any other
# makedirs call for it is reached.
interm_dir = os.path.join(outdir, 'full')
os.makedirs(interm_dir, exist_ok=True)

# Destination for the harmonized tables, task matrices and clinical metadata.
# NOTE(review): `harmony_dir` is used by later cells but was never assigned in
# this notebook (NameError on a fresh kernel). 'harmonized' is an assumed
# folder name -- confirm the intended location.
harmony_dir = os.path.join(outdir, 'harmonized')
os.makedirs(harmony_dir, exist_ok=True)

## Load the 2020 data

In [None]:
twenty_data = {}

# Load the 2020 subject-level and specimen-level metadata tables.
for metatable in ('subject', 'specimen'):
    fn = 'results/main/cmi_pb_datasets/raw/2020LD_{}.csv'.format(metatable)
    df = pd.read_table(fn, sep=',')
    twenty_data[metatable] = df

subjects = twenty_data['subject']
specimen = twenty_data['specimen']

# Attach the subject attributes to every specimen row, then keep a fixed
# column selection in a fixed order.
master_meta = subjects.merge(specimen, on='subject_id')
master_meta = master_meta[['subject_id',
                           'specimen_id',
                           'infancy_vac',
                           'biological_sex',
                           'year_of_birth',
                           'date_of_boost',
                           'actual_day_relative_to_boost',
                           'planned_day_relative_to_boost',
                           'ethnicity',
                           'race',
                           'dataset',
                           'specimen_type',
                           'visit']]

# Metadata columns that are dropped from each assay table later on.
# 'subject_id' and 'planned_day_relative_to_boost' are not listed here, so they
# stay in the assay tables (later cells count subjects and group by day).
meta_cols = ['specimen_id', 'infancy_vac', 'biological_sex',
             'year_of_birth', 'date_of_boost', 'actual_day_relative_to_boost',
             'ethnicity', 'race', 'dataset',
             'specimen_type', 'visit']

twenty_data['master_meta'] = master_meta

# Save the metadata. The output directory is created first so this cell does
# not fail on a fresh checkout where it does not exist yet.
os.makedirs(interm_dir, exist_ok=True)
meta_fn = os.path.join(interm_dir, 'meta.2020.pivoted.tsv')
# NOTE(review): this file is written with the row index, whereas the 2021
# metadata is written with index=False -- confirm which layout readers expect.
twenty_data['master_meta'].to_csv(meta_fn, sep='\t')

In [None]:
# Reshape every 2020 assay from long to wide format (indexed by specimen_id),
# attach the specimen metadata and store the result under the short assay key.
for i, longname in enumerate(longnames):
    fn = 'results/main/cmi_pb_datasets/raw/2020LD_{}.csv'.format(longname)
    df = pd.read_table(fn, sep=',')

    print(fn)

    if assays[i] == 'rnaseq':
        # Remove the ".<digits>" version suffix from the Ensembl gene IDs.
        # regex=True is explicit because pandas >= 2.0 defaults to a literal
        # match, which would leave the versioned IDs untouched.
        df['ensembl_gene_id'] = df['versioned_ensembl_gene_id'].str.replace(
            r'\.[0-9]+', '', regex=True)
        df = df.pivot(index='specimen_id', columns='ensembl_gene_id', values='tpm')

    elif assays[i] == 'cytof':
        df = df.pivot(index='specimen_id', columns='cell_type_name', values='percent_live_cell')

    elif assays[i] == 'olink':
        # Original author's open question: still getting duplicate specimens?
        # Keep NPX measurements that passed QC and lie above the lower limit
        # of quantitation.
        df = df[df['unit'] == "Normalized Protein eXpression"]
        df = df[df['quality_control'] == 'Pass']
        df = df[df['protein_expression'] > df['lower_limit_of_quantitation']]
        df = df.pivot(index='specimen_id', columns='uniprot_id', values='protein_expression')

    elif assays[i] == 'abtiters':
        aglist = ['1% PFA PT', 'PT', 'PRN', 'FHA', 'FIM2/3']
        # Work on an explicit copy: the original replaced values in place on a
        # column of a .loc slice (chained assignment), which warns and has no
        # effect under pandas copy-on-write.
        df = df.loc[df['antigen'].isin(aglist), :].copy()
        df['antigen'] = df['antigen'].replace(to_replace='1% PFA PT', value='PT')
        df['isotype_antigen'] = df['isotype'] + '-' + df['antigen']
        df = df.pivot(index='specimen_id', columns='isotype_antigen', values='MFI_normalised')

    # Every branch ends the same way: join the wide table onto the metadata,
    # then drop the metadata columns that are not needed downstream.
    df = master_meta.merge(df, on='specimen_id')
    twenty_data[assays[i]] = df.drop(meta_cols, axis=1)

results/main/cmi_pb_datasets/raw/2020LD_ab_titer.csv
results/main/cmi_pb_datasets/raw/2020LD_live_cell_percentages.csv
results/main/cmi_pb_datasets/raw/2020LD_olink_prot_exp.csv
results/main/cmi_pb_datasets/raw/2020LD_rnaseq.csv


In [None]:
# Report the number of distinct subjects present in each 2020 table.
for assay, df in twenty_data.items():
    n_subjects = df['subject_id'].nunique()
    print(assay, n_subjects)

subject 60
specimen 60
master_meta 60
abtiters 58
cytof 20
olink 18
rnaseq 36


In [None]:
# Write one TSV per assay and per planned day relative to boost.
per_day_assays = ['cytof', 'olink', 'rnaseq', 'abtiters']
for assay, df in twenty_data.items():
    # Metadata tables are stored in the same dict; skip them here.
    if assay not in per_day_assays:
        continue

    print(assay)
    for day, day_df in df.groupby('planned_day_relative_to_boost'):
        outfn = os.path.join(interm_dir, '{}.2020.day{}.pivoted.tsv'.format(assay, day))
        # The day is encoded in the file name, so the column itself is dropped.
        day_table = day_df.drop(columns='planned_day_relative_to_boost')
        day_table.to_csv(outfn, index=False, sep='\t')
        print(day, day_df.shape)

abtiters
0 (58, 25)
1 (57, 25)
3 (57, 25)
7 (57, 25)
14 (57, 25)
30 (54, 25)
90 (51, 25)
386 (1, 25)
402 (1, 25)
428 (1, 25)
cytof
0 (20, 27)
1 (20, 27)
3 (20, 27)
7 (20, 27)
14 (20, 27)
olink
0 (18, 258)
1 (18, 258)
3 (18, 258)
7 (18, 258)
14 (18, 258)
rnaseq
0 (36, 58349)
1 (36, 58349)
3 (36, 58349)


## Load the 2021 data

In [None]:
twentyone_data = {}

# Read the 2021 subject and specimen metadata tables.
for metatable in ('subject', 'specimen'):
    fn = 'results/main/cmi_pb_datasets/raw/2021BD_{}.csv'.format(metatable)
    df = pd.read_csv(fn)
    twentyone_data[metatable] = df

subjects, specimen = twentyone_data['subject'], twentyone_data['specimen']

# Columns kept in the merged metadata table, in output order.
master_meta_columns = [
    'subject_id',
    'specimen_id',
    'infancy_vac',
    'biological_sex',
    'year_of_birth',
    'date_of_boost',
    'actual_day_relative_to_boost',
    'planned_day_relative_to_boost',
    'ethnicity',
    'race',
    'dataset',
    'specimen_type',
    'visit',
]

# Attach subject attributes to each specimen row; from here on `master_meta`
# refers to the 2021 metadata.
master_meta = subjects.merge(specimen, on='subject_id').loc[:, master_meta_columns]
twentyone_data['master_meta'] = master_meta

In [None]:
# Reshape every 2021 assay from long to wide format (indexed by specimen_id),
# attach the specimen metadata and store the result under the short assay key.
for i, longname in enumerate(longnames):

    fn = 'results/main/cmi_pb_datasets/raw/2021BD_{}.csv'.format(longname)
    df = pd.read_table(fn, sep=',')

    if assays[i] == 'rnaseq':
        # Remove the ".<digits>" version suffix from the Ensembl gene IDs.
        # regex=True is explicit because pandas >= 2.0 defaults to a literal
        # match, which would leave the versioned IDs untouched.
        df['ensembl_gene_id'] = df['versioned_ensembl_gene_id'].str.replace(
            r'\.[0-9]+', '', regex=True)
        df = df.pivot(index='specimen_id', columns='ensembl_gene_id', values='tpm')

    elif assays[i] == 'cytof':
        df = df.pivot(index='specimen_id', columns='cell_type_name', values='percent_live_cell')

    elif assays[i] == 'olink':
        # NOTE(review): unlike the 2020 loop, no unit / quality-control /
        # lower-limit filtering is applied here -- confirm this is intended.
        df = df.pivot(index='specimen_id', columns='uniprot_id', values='protein_expression')

    elif assays[i] == 'abtiters':
        aglist = ['PT', 'PRN', 'FHA', 'FIM2/3']
        # Copy the filtered slice so that adding a column below does not
        # assign into a view of the raw table (SettingWithCopyWarning).
        df = df.loc[df['antigen'].isin(aglist), :].copy()
        df['isotype_antigen'] = df['isotype'] + '-' + df['antigen']
        df = df.pivot(index='specimen_id', columns='isotype_antigen', values='MFI_normalised')

    # Every branch ends the same way: join the wide table onto the metadata,
    # then drop the metadata columns that are not needed downstream.
    df = master_meta.merge(df, on='specimen_id')
    twentyone_data[assays[i]] = df.drop(meta_cols, axis=1)
    

In [None]:
# Stand-alone rebuild of the wide 2021 live-cell-percentage table with the
# metadata attached. The result stays in `df`; it is not stored in
# `twentyone_data`.
longname = 'live_cell_percentages'
cytof_raw_fn = 'results/main/cmi_pb_datasets/raw/2021BD_{}.csv'.format(longname)

df = (
    pd.read_csv(cytof_raw_fn)
    .pivot(index='specimen_id', columns='cell_type_name', values='percent_live_cell')
)
df = master_meta.merge(df, on='specimen_id')

In [None]:
# Report the number of distinct subjects present in each 2021 table.
for assay, df in twentyone_data.items():
    n_subjects = df['subject_id'].nunique()
    print(assay, n_subjects)

In [None]:
os.makedirs(interm_dir, exist_ok=True)

# Save each 2021 assay table without the planned-day column.
# NOTE(review): rows are not filtered by planned day before being written under
# a "day0" file name -- confirm the 2021 tables only hold day-0 specimens.
for assay_key in ('abtiters', 'cytof', 'olink', 'rnaseq'):
    assay_fn = os.path.join(interm_dir, '{}.2021.day0.pivoted.tsv'.format(assay_key))
    assay_table = twentyone_data[assay_key].drop(columns='planned_day_relative_to_boost')
    assay_table.to_csv(assay_fn, index=False, sep='\t')

# Save the merged subject/specimen metadata alongside the assay tables.
meta_fn = os.path.join(interm_dir, 'meta.2021.pivoted.tsv')
twentyone_data['master_meta'].to_csv(meta_fn, index=False, sep='\t')

In [None]:
# summary21 = []
# for assay, df in twentyone_data.items():
#     summary21.append([assay, df.subject_id.nunique()])

# summary20 = []
# for assay, df in twenty_data.items():
#     summary20.append([df.subject_id.nunique()])

# summary = pd.concat([pd.DataFrame(summary21), pd.DataFrame(summary20)], axis=1)
# summary.columns = ('table', 'nsamples21', 'nsamples20')
# summary = summary[['table', 'nsamples20', 'nsamples21']]

In [None]:
# fn = 'results/main/cmi_pb_datasets/2020LD_{}.csv'.format('olink_prot_exp')
# df = pd.read_table(fn, sep=',')

## Standardize Ab Titers

In [None]:
# Antibody-titer feature columns present in both years, in sorted order.
# The two bookkeeping columns are excluded so that only measurements remain.
shared_columns = set(twenty_data['abtiters'].columns) & set(twentyone_data['abtiters'].columns)
non_feature_columns = {'planned_day_relative_to_boost', 'subject_id'}
abtiters_overlap = sorted(shared_columns - non_feature_columns)

In [None]:
# 2020 antibody titers at planned day 0, limited to subject_id plus the
# feature columns shared with 2021.
abtiters_2020 = twenty_data['abtiters']
is_day0 = abtiters_2020['planned_day_relative_to_boost'] == 0
twenty_data['abtiters_harmonized'] = abtiters_2020.loc[is_day0, ['subject_id'] + abtiters_overlap]

new_abtiters_fn = os.path.join(harmony_dir, 'abtiters.2020.day0.pivoted.tsv')
twenty_data['abtiters_harmonized'].to_csv(new_abtiters_fn, sep='\t', index=False)

In [None]:
# 2021 antibody titers at planned day 0, limited to subject_id plus the
# feature columns shared with 2020.
abtiters_2021 = twentyone_data['abtiters']
is_day0 = abtiters_2021['planned_day_relative_to_boost'] == 0
twentyone_data['abtiters_harmonized'] = abtiters_2021.loc[is_day0, ['subject_id'] + abtiters_overlap]

new_abtiters_fn = os.path.join(harmony_dir, 'abtiters.2021.day0.pivoted.tsv')
twentyone_data['abtiters_harmonized'].to_csv(new_abtiters_fn, sep='\t', index=False)

## Providing all task vectors for training

In [None]:
from IPython.display import display

In [None]:
def _build_task_matrix(tasks, assay_tables, use_common_names=False):
    """Build a table with one row per subject and one column per task.

    For every row of `tasks`, the column `task.fetchname` is taken from
    `assay_tables[task.assay]`, restricted to the rows whose
    `planned_day_relative_to_boost` equals `task.day`. The per-task columns
    are outer-joined on `subject_id` in task order.

    Parameters
    ----------
    tasks : pandas.DataFrame
        Task table; the columns `assay`, `fetchname`, `fullname` and `day`
        are read.
    assay_tables : dict
        Maps an assay key to its wide table.
    use_common_names : bool, default False
        When True, columns of 'rnaseq' tasks are labelled with
        `task.fullname` instead of `task.fetchname`.

    Returns
    -------
    pandas.DataFrame
        `subject_id` followed by one '<label>_day<day>' column per task.

    Raises
    ------
    ValueError
        If `tasks` has no rows.
    """
    matrix = None
    for _, task in tasks.iterrows():
        # extracting the required columns for the task's own day
        table = assay_tables[task.assay]
        on_task_day = table.planned_day_relative_to_boost == task.day
        task_data = table.loc[on_task_day, ['subject_id', task.fetchname]]

        if use_common_names and task.assay == 'rnaseq':
            label = task.fullname
        else:
            label = task.fetchname
        task_data.columns = ['subject_id', label + '_day' + str(task.day)]

        # merging the data together; the outer join keeps subjects that are
        # missing a measurement for some task
        if matrix is None:
            matrix = task_data
        else:
            matrix = matrix.merge(task_data, on='subject_id', how='outer')

    if matrix is None:
        raise ValueError('the tasks table is empty; no task matrix can be built')
    return matrix


# Loading the table describing each task
tasks = pd.read_table('results/refs/tasks.tsv')

# Task matrix using GENE IDS for RNA based tasks
master_tasks = _build_task_matrix(tasks, twenty_data)
task_fn = os.path.join(harmony_dir, 'task_matrix.feature_names.tsv')
master_tasks.to_csv(task_fn, sep='\t', index=False)

# Task matrix using GENE NAMES for RNA based tasks
master_tasks = _build_task_matrix(tasks, twenty_data, use_common_names=True)
task_fn = os.path.join(harmony_dir, 'task_matrix.common_names.tsv')
master_tasks.to_csv(task_fn, sep='\t', index=False)

In [None]:
# Inspection only: list the tables currently held for 2020.
twenty_data.keys()

In [None]:
# Inspection only: preview the most recently built task matrix.
master_tasks.head()

## Providing day 0 data for all task vectors (for fold change transformations)

In [None]:
# Day-0 counterpart of the task matrix, using GENE IDS for RNA based tasks.
# First collect one two-column frame per task ...
day0_frames = []
for idd, task in tasks.iterrows():
    source = twenty_data[task.assay]
    at_day0 = source.planned_day_relative_to_boost == 0
    task_data = source.loc[at_day0, ['subject_id', task.fetchname]]
    task_data.columns = ['subject_id', task.fetchname + '_day' + str(0)]
    day0_frames.append(task_data)

# ... then outer-join them on subject_id, in task order.
zero_day_tasks = day0_frames[0]
for task_data in day0_frames[1:]:
    zero_day_tasks = zero_day_tasks.merge(task_data, on='subject_id', how='outer')

# saving the day-0 matrix
task_fn = os.path.join(harmony_dir, 'zero_day_matrix.feature_names.tsv')
zero_day_tasks.to_csv(task_fn, sep='\t', index=False)

In [None]:
# Day-0 counterpart of the task matrix, using GENE NAMES for RNA based tasks.
# First collect one two-column frame per task ...
day0_frames = []
for idd, task in tasks.iterrows():
    source = twenty_data[task.assay]
    at_day0 = source.planned_day_relative_to_boost == 0
    task_data = source.loc[at_day0, ['subject_id', task.fetchname]]

    # RNA-seq columns are labelled with the task's full name, all others with
    # the name used to fetch the column.
    label = task.fullname if task.assay == 'rnaseq' else task.fetchname
    task_data.columns = ['subject_id', label + '_day' + str(0)]
    day0_frames.append(task_data)

# ... then outer-join them on subject_id, in task order.
zero_day_tasks = day0_frames[0]
for task_data in day0_frames[1:]:
    zero_day_tasks = zero_day_tasks.merge(task_data, on='subject_id', how='outer')

# saving the day-0 matrix
task_fn = os.path.join(harmony_dir, 'zero_day_matrix.common_names.tsv')
zero_day_tasks.to_csv(task_fn, sep='\t', index=False)

In [None]:
# Preview only the first rows instead of rendering the whole frame, matching
# how the task matrix is previewed earlier in the notebook.
zero_day_tasks.head()

## Providing metadata in a basic format

In [None]:
# Specimen/visit-level columns (plus 'dataset') that are removed from the
# basic per-subject metadata. Also reused for the 2021 table in the next cell.
drop_cols = ['specimen_id', 'actual_day_relative_to_boost', 'planned_day_relative_to_boost', 'visit', 'dataset']

# Keep the first row seen for each subject, then drop the columns above.
# Chained, non-inplace calls avoid mutating a frame derived from another one
# (an in-place drop there can emit SettingWithCopyWarning) and keep the cell
# safe to re-run.
twenty_basic_meta = (
    twenty_data['master_meta']
    .drop_duplicates('subject_id')
    .drop(columns=drop_cols)
)

twenty_basic_fn = os.path.join(harmony_dir, 'clinical_metadata.2020.tsv')
twenty_basic_meta.to_csv(twenty_basic_fn, sep='\t', index=False)

In [None]:
# Keep the first row seen for each 2021 subject and drop the columns listed in
# `drop_cols` (defined in the preceding 2020 cell). Chained, non-inplace calls
# avoid mutating a frame derived from another one and keep the cell safe to
# re-run.
twentyone_basic_meta = (
    twentyone_data['master_meta']
    .drop_duplicates('subject_id')
    .drop(columns=drop_cols)
)

twentyone_basic_fn = os.path.join(harmony_dir, 'clinical_metadata.2021.tsv')
twentyone_basic_meta.to_csv(twentyone_basic_fn, sep='\t', index=False)

# Summarize the harmonization process

In [None]:
# Column counts per assay: 2020 table, 2021 table, and harmonized 2021 table.
summary = []
for assay in assays:

    harmonized_assay = '{}_harmonized'.format(assay)
    # Only some assays have a harmonized table in this notebook; a direct
    # lookup raised KeyError for the others. Missing entries are reported as
    # NaN instead.
    harmonized = twentyone_data.get(harmonized_assay)
    n_harmonized = harmonized.shape[1] if harmonized is not None else np.nan

    summary.append([assay,
                    twenty_data[assay].shape[1],
                    twentyone_data[assay].shape[1],
                    n_harmonized])

summary = pd.DataFrame(summary, columns=['Assay', '2020', '2021', 'Harmonized'])

In [None]:
# Display the summary ordered alphabetically by assay name.
summary.sort_values(by='Assay')