In [None]:
import os 
import pandas as pd 
import numpy as np 
# Run from the project root so that the relative paths used below resolve.
# The location can be overridden with the CMI_PB_PROJECT_DIR environment
# variable; without it the original hard-coded path is used.
project_dir = os.environ.get('CMI_PB_PROJECT_DIR',
                             '/mnt/BioHome/jreyna/jreyna/projects/cmi-pb-preds/')
os.chdir(project_dir)

# Root directory for the processed tables written by this notebook.
outdir = 'results/main/cmi_pb_datasets/processed/'
os.makedirs(outdir, exist_ok=True)

IgG1 and IgG4 day 14 and day 0 values for PT, FHA, and Pertactin

In [None]:
# Short assay key paired with the stem of its raw file name; the two tuples
# built here stay index-aligned.
assays, longnames = zip(
    ('abtiters', 'ab_titer'),
    ('cytof', 'live_cell_percentages'),
    ('olink', 'olink_prot_exp'),
    ('rnaseq', 'rnaseq'),
)

In [None]:
# loading gencode data (headerless BED-like table of protein coding genes)
gencode = pd.read_table('results/refs/gencode/gencode.v38lift37.annotation.protein_coding.bed', header=None)
gencode.columns = ['chr', 'start', 'end', 'strand', 'gene_id', 'gene_name']

# Drop everything from the first '.' onwards to get the non-versioned gene id.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default; the raw string avoids an invalid-escape warning.
gencode['gene_nonversioned_id'] = gencode['gene_id'].str.replace(r'\..*', '', regex=True)

In [None]:
# Directory for the full (non-harmonized) tables. It is created right away
# because the cells below write into it.
interm_dir = os.path.join(outdir, 'full')
os.makedirs(interm_dir, exist_ok=True)

## Load the 2020 data

In [None]:
# Read the 2020 subject and specimen tables, keyed by table name.
twenty_data = {
    metatable: pd.read_csv('results/main/cmi_pb_datasets/raw/2020LD_{}.csv'.format(metatable))
    for metatable in ('subject', 'specimen')
}

# One row per specimen, annotated with the subject-level fields.
master_cols = ['subject_id', 'specimen_id', 'infancy_vac', 'biological_sex',
               'year_of_birth', 'date_of_boost', 'actual_day_relative_to_boost',
               'planned_day_relative_to_boost', 'ethnicity', 'race', 'dataset',
               'specimen_type', 'visit']
subjects, specimen = twenty_data['subject'], twenty_data['specimen']
master_meta = subjects.merge(specimen, on='subject_id')[master_cols]

# Metadata columns removed from the per-assay tables later on; subject_id and
# planned_day_relative_to_boost are not in this list, so they stay in the
# assay tables.
meta_cols = [col for col in master_cols
             if col not in ('subject_id', 'planned_day_relative_to_boost')]

twenty_data['master_meta'] = master_meta

# save meta
meta_fn = os.path.join(interm_dir, 'meta.2020.pivoted.tsv')
master_meta.to_csv(meta_fn, sep='\t')

In [None]:
# Selects which MFI column of the antibody-titer table is used (curr_mfi) and
# the tag that goes into the output file names (curr_mfi_name).
# Raw alternative: curr_mfi = 'MFI', curr_mfi_name = 'mfi_raw'
curr_mfi = 'MFI_normalised'
curr_mfi_name = 'mfi_normalised'

In [None]:
# Load every 2020 assay table, pivot it to one row per specimen, attach the
# metadata and drop the metadata columns listed in `meta_cols`.
# `assays` and `longnames` are index-aligned, so they are walked together.
for assay, longname in zip(assays, longnames):
    fn = 'results/main/cmi_pb_datasets/raw/2020LD_{}.csv'.format(longname)
    df = pd.read_table(fn, sep=',')

    print(fn)

    if assay == 'rnaseq':
        # Strip the version suffix from the gene ids. regex=True is explicit
        # because pandas >= 2.0 treats the pattern as a literal by default.
        df['ensembl_gene_id'] = df['versioned_ensembl_gene_id'].str.replace(r'\.[0-9]+', '', regex=True)
        df = df.pivot(index='specimen_id', columns='ensembl_gene_id', values='tpm')

    elif assay == 'cytof':
        df = df.pivot(index='specimen_id', columns='cell_type_name', values='percent_live_cell')

    elif assay == 'olink':
        # Still getting duplicate specimens?
        df = df[df['unit'] == "Normalized Protein eXpression"]
        df = df[df['quality_control'] == 'Pass']
        df = df[df['protein_expression'] > df['lower_limit_of_quantitation']]
        df = df.pivot(index='specimen_id', columns='uniprot_id', values='protein_expression')

    elif assay == 'abtiters':
        aglist = ['1% PFA PT', 'PT', 'PRN', 'FHA', 'FIM2/3']
        # Copy the selection so the column assignments below act on an
        # independent frame instead of a slice of the raw table.
        df = df.loc[df['antigen'].isin(aglist), :].copy()
        # Assign the result back: an in-place replace on the selected column
        # is a chained assignment and is not guaranteed to update `df`.
        df['antigen'] = df['antigen'].replace(to_replace='1% PFA PT', value='PT')
        df['isotype_antigen'] = df['isotype'] + '-' + df['antigen']
        df = df.pivot(index='specimen_id', columns='isotype_antigen', values=curr_mfi)

    # every assay gets the same metadata join on the specimen id
    df = master_meta.merge(df, on='specimen_id')
    twenty_data[assay] = df.drop(meta_cols, axis=1)

In [None]:
# Number of distinct subjects in every 2020 table loaded so far.
for assay in twenty_data:
    df = twenty_data[assay]
    print(assay, df['subject_id'].nunique())

In [None]:
# Write one table per assay and planned day. Antibody-titer files also carry
# the MFI tag in their name.
saved_assays = ['cytof', 'olink', 'rnaseq', 'abtiters']
for assay, df in twenty_data.items():
    if assay not in saved_assays:
        continue

    print(assay)
    for day, day_df in df.groupby('planned_day_relative_to_boost'):
        if assay == 'abtiters':
            basename = '{}.{}.2020.day{}.pivoted.tsv'.format(assay, curr_mfi_name, day)
        else:
            basename = '{}.2020.day{}.pivoted.tsv'.format(assay, day)
        outfn = os.path.join(interm_dir, basename)

        # the day is encoded in the file name, so the column itself is dropped
        day_table = day_df.drop(columns='planned_day_relative_to_boost')
        day_table.to_csv(outfn, sep='\t', index=False)
        print(day, day_df.shape)

## Load the 2021 data

In [None]:
# Read the 2021 subject and specimen tables, keyed by table name.
twentyone_data = {
    metatable: pd.read_csv('results/main/cmi_pb_datasets/raw/2021BD_{}.csv'.format(metatable))
    for metatable in ('subject', 'specimen')
}

# One row per specimen, annotated with the subject-level fields.
master_cols = ['subject_id', 'specimen_id', 'infancy_vac', 'biological_sex',
               'year_of_birth', 'date_of_boost', 'actual_day_relative_to_boost',
               'planned_day_relative_to_boost', 'ethnicity', 'race', 'dataset',
               'specimen_type', 'visit']
subjects, specimen = twentyone_data['subject'], twentyone_data['specimen']
master_meta = subjects.merge(specimen, on='subject_id')[master_cols]
twentyone_data['master_meta'] = master_meta

In [None]:
# Load every 2021 assay table, pivot it to one row per specimen, attach the
# metadata and drop the metadata columns listed in `meta_cols`.
# `assays` and `longnames` are index-aligned, so they are walked together.
for assay, longname in zip(assays, longnames):

    fn = 'results/main/cmi_pb_datasets/raw/2021BD_{}.csv'.format(longname)
    df = pd.read_table(fn, sep=',')

    if assay == 'rnaseq':
        # Strip the version suffix from the gene ids. regex=True is explicit
        # because pandas >= 2.0 treats the pattern as a literal by default.
        df['ensembl_gene_id'] = df['versioned_ensembl_gene_id'].str.replace(r'\.[0-9]+', '', regex=True)
        df = df.pivot(index='specimen_id', columns='ensembl_gene_id', values='tpm')

    elif assay == 'cytof':
        df = df.pivot(index='specimen_id', columns='cell_type_name', values='percent_live_cell')

    elif assay == 'olink':
        # NOTE(review): unlike the 2020 loader, no unit / quality-control /
        # limit-of-quantitation filter is applied here - confirm this is intended.
        df = df.pivot(index='specimen_id', columns='uniprot_id', values='protein_expression')

    elif assay == 'abtiters':
        aglist = ['PT', 'PRN', 'FHA', 'FIM2/3']
        # Copy the selection so the new column is added to an independent
        # frame instead of a slice of the raw table.
        df = df.loc[df['antigen'].isin(aglist), :].copy()
        df['isotype_antigen'] = df['isotype'] + '-' + df['antigen']
        df = df.pivot(index='specimen_id', columns='isotype_antigen', values=curr_mfi)

    # every assay gets the same metadata join on the specimen id
    df = master_meta.merge(df, on='specimen_id')
    twentyone_data[assay] = df.drop(meta_cols, axis=1)
    

In [None]:
# Stand-alone reload of the 2021 CyTOF table: pivoted to one row per specimen
# and joined with the metadata. The result is only kept in `df`.
longname = 'live_cell_percentages'
cytof_path = 'results/main/cmi_pb_datasets/raw/2021BD_{}.csv'.format(longname)

cytof_wide = pd.read_csv(cytof_path).pivot(index='specimen_id',
                                           columns='cell_type_name',
                                           values='percent_live_cell')
df = master_meta.merge(cytof_wide, on='specimen_id')

In [None]:
# Number of distinct subjects in every 2021 table loaded so far.
for assay in twentyone_data:
    df = twentyone_data[assay]
    print(assay, df['subject_id'].nunique())

In [None]:
os.makedirs(interm_dir, exist_ok=True)

# Output file name per assay; only the antibody titers carry the MFI tag.
# NOTE(review): the tables are written without any day filter although the
# names say day0 - confirm the 2021 tables only hold day-0 specimens.
day0_basenames = {
    'abtiters': 'abtiters.{}.2021.day0.pivoted.tsv'.format(curr_mfi_name),
    'cytof': 'cytof.2021.day0.pivoted.tsv',
    'olink': 'olink.2021.day0.pivoted.tsv',
    'rnaseq': 'rnaseq.2021.day0.pivoted.tsv',
}
for assay_name, basename in day0_basenames.items():
    assay_table = twentyone_data[assay_name].drop(columns='planned_day_relative_to_boost')
    assay_table.to_csv(os.path.join(interm_dir, basename), sep='\t', index=False)

# save meta
meta_fn = os.path.join(interm_dir, 'meta.2021.pivoted.tsv')
twentyone_data['master_meta'].to_csv(meta_fn, sep='\t', index=False)

In [None]:
# summary21 = []
# for assay, df in twentyone_data.items():
#     summary21.append([assay, df.subject_id.nunique()])

# summary20 = []
# for assay, df in twenty_data.items():
#     summary20.append([df.subject_id.nunique()])

# summary = pd.concat([pd.DataFrame(summary21), pd.DataFrame(summary20)], axis=1)
# summary.columns = ('table', 'nsamples21', 'nsamples20')
# summary = summary[['table', 'nsamples20', 'nsamples21']]

In [None]:
# fn = 'results/main/cmi_pb_datasets/2020LD_{}.csv'.format('olink_prot_exp')
# df = pd.read_table(fn, sep=',')

## Standardize RNA-seq

The RNA-seq dataset includes many more variables and requires more processing than the other assays, so standardization involves several steps:

    1) remove zero-variance genes,
    2) remove mitochondrial genes,
    3) keep expressed genes, defined as those with TPM > 1 in more than a given proportion of samples (`cut_filter`),
    4) intersect the genes across 2020 and 2021 to build the harmonized 2020 and 2021 tables.

In [None]:
# Non-versioned ids of every gencode gene annotated on chrM.
on_chrM = gencode['chr'] == 'chrM'
mito_genes = set(gencode.loc[on_chrM, 'gene_nonversioned_id'].unique().tolist())

In [None]:
# value used across both RNA-seq datasets:
# a gene is kept as "expressed" when more than this fraction of the day-0
# samples has TPM > 1
cut_filter = 0.3

### RNA-seq for 2020

In [None]:
# Day-0 2020 expression matrix (ENSG columns only) and the genes whose
# variance across those samples is exactly zero.
rnaseq_2020 = twenty_data['rnaseq']
tmp = rnaseq_2020.loc[rnaseq_2020['planned_day_relative_to_boost'] == 0]
tmp = tmp.loc[:, tmp.columns.str.match('ENSG')]
tmpvars = tmp.var()
zero_var_genes = set(tmpvars.index[(tmpvars == 0).values])

In [None]:
# A gene counts as expressed when strictly more than `cut_filter` of the
# samples in `tmp` have TPM > 1.
n_samples_above = tmp.gt(1).sum()
is_expressed = n_samples_above > cut_filter * len(tmp)
expressed_genes = set(is_expressed.index[is_expressed.values])

In [None]:
# Final gene list: expressed, not mitochondrial, not zero-variance, and
# present in the gencode annotation.
annotated_genes = set(gencode['gene_nonversioned_id'].values.tolist())
keep_genes = (expressed_genes - mito_genes - zero_var_genes) & annotated_genes

In [None]:
# make the final new_rnaseq table
# `tmp` only holds ENSG columns at this point, so the non-gene columns
# (e.g. subject_id) are taken from the full table, restricted to tmp's rows.
rnaseq_2020 = twenty_data['rnaseq']
new_rnaseq1 = rnaseq_2020.loc[tmp.index, ~rnaseq_2020.columns.str.match('ENSG')]
# .loc does not accept a set as an indexer in current pandas; a sorted list
# also makes the column order deterministic
new_rnaseq2 = tmp.loc[:, sorted(keep_genes)]
twenty_data['rnaseq_std'] = pd.concat([new_rnaseq1, new_rnaseq2], axis=1)

### RNA-seq for 2021

In [None]:
# Day-0 2021 expression matrix (ENSG columns only) and the genes whose
# variance across those samples is exactly zero.
rnaseq_2021 = twentyone_data['rnaseq']
tmp = rnaseq_2021.loc[rnaseq_2021['planned_day_relative_to_boost'] == 0]
tmp = tmp.loc[:, tmp.columns.str.match('ENSG')]
tmpvars = tmp.var()
zero_var_genes = set(tmpvars.index[(tmpvars == 0).values])

In [None]:
# A gene counts as expressed when strictly more than `cut_filter` of the
# samples in `tmp` have TPM > 1.
n_samples_above = tmp.gt(1).sum()
is_expressed = n_samples_above > cut_filter * len(tmp)
expressed_genes = set(is_expressed.index[is_expressed.values])

In [None]:
# Final gene list: expressed, not mitochondrial, not zero-variance, and
# present in the gencode annotation.
annotated_genes = set(gencode['gene_nonversioned_id'].values.tolist())
keep_genes = (expressed_genes - mito_genes - zero_var_genes) & annotated_genes

In [None]:
# make the final new_rnaseq table
# `tmp` only holds ENSG columns at this point, so the non-gene columns
# (e.g. subject_id) are taken from the full table, restricted to tmp's rows.
rnaseq_2021 = twentyone_data['rnaseq']
new_rnaseq1 = rnaseq_2021.loc[tmp.index, ~rnaseq_2021.columns.str.match('ENSG')]
# .loc does not accept a set as an indexer in current pandas; a sorted list
# also makes the column order deterministic
new_rnaseq2 = tmp.loc[:, sorted(keep_genes)]
twentyone_data['rnaseq_std'] = pd.concat([new_rnaseq1, new_rnaseq2], axis=1)

### Harmonize the RNA-seq datasets 

In [None]:
# Genes present in both standardized tables. Column names that do not
# contain an ENSG id come back as missing and are dropped.
gene_pattern = '(ENSG[0-9]+)'
twenty_genes = set(twenty_data['rnaseq_std'].columns.str.extract(gene_pattern, expand=False).dropna())
twentyone_genes = set(twentyone_data['rnaseq_std'].columns.str.extract(gene_pattern, expand=False).dropna())
shared_genes = sorted(twenty_genes & twentyone_genes)

In [None]:
# Harmonized 2020 RNA-seq: day-0 rows, the non-gene columns followed by the
# shared genes, without the day column.
rnaseq_2020 = twenty_data['rnaseq']
non_gene_cols = rnaseq_2020.columns[~rnaseq_2020.columns.str.match('ENSG')].tolist()
is_day0 = rnaseq_2020['planned_day_relative_to_boost'] == 0
harmonized_df = (rnaseq_2020
                 .loc[is_day0, non_gene_cols + shared_genes]
                 .drop(columns='planned_day_relative_to_boost'))
twenty_data['rnaseq_harmonized'] = harmonized_df

In [None]:
# Harmonized 2021 RNA-seq: day-0 rows, the non-gene columns followed by the
# shared genes, without the day column.
rnaseq_2021 = twentyone_data['rnaseq']
non_gene_cols = rnaseq_2021.columns[~rnaseq_2021.columns.str.match('ENSG')].tolist()
is_day0 = rnaseq_2021['planned_day_relative_to_boost'] == 0
harmonized_df = (rnaseq_2021
                 .loc[is_day0, non_gene_cols + shared_genes]
                 .drop(columns='planned_day_relative_to_boost'))
twentyone_data['rnaseq_harmonized'] = harmonized_df

In [None]:
# Directory that receives all harmonized tables.
harmony_dir = os.path.join(outdir, 'harmonized/')
os.makedirs(harmony_dir, exist_ok=True)

# Both years are exported with the same columns: subject id + shared genes.
export_cols = ['subject_id'] + shared_genes

rnaseq_2020_fn = os.path.join(harmony_dir, 'rnaseq.2020.day0.pivoted.tsv')
twenty_data['rnaseq_harmonized'].loc[:, export_cols].to_csv(rnaseq_2020_fn, sep='\t', index=False)

rnaseq_2021_fn = os.path.join(harmony_dir, 'rnaseq.2021.day0.pivoted.tsv')
twentyone_data['rnaseq_harmonized'].loc[:, export_cols].to_csv(rnaseq_2021_fn, sep='\t', index=False)

## Standardize CyTOF

In [None]:
# Columns found in both CyTOF tables, sorted, minus the two non-feature columns.
common_cytof_cols = set(twenty_data['cytof'].columns) & set(twentyone_data['cytof'].columns)
shared_cells = sorted(common_cytof_cols)
for non_feature in ('subject_id', 'planned_day_relative_to_boost'):
    shared_cells.remove(non_feature)

In [None]:
# Harmonized CyTOF for both years: day-0 rows, subject id + shared cell types.
for year_data in (twenty_data, twentyone_data):
    cytof = year_data['cytof']
    day0_rows = cytof['planned_day_relative_to_boost'] == 0
    year_data['cytof_harmonized'] = cytof.loc[day0_rows, ['subject_id'] + shared_cells]

In [None]:
# Export the harmonized 2020 CyTOF table (tab-separated, no index column).
cytof_2020_fn = os.path.join(harmony_dir, 'cytof.2020.day0.pivoted.tsv')
cytof_2020 = twenty_data['cytof_harmonized']
cytof_2020.to_csv(cytof_2020_fn, sep='\t', index=False)

In [None]:
# Export the harmonized 2021 CyTOF table (tab-separated, no index column).
cytof_2021_fn = os.path.join(harmony_dir, 'cytof.2021.day0.pivoted.tsv')
cytof_2021 = twentyone_data['cytof_harmonized']
cytof_2021.to_csv(cytof_2021_fn, sep='\t', index=False)

In [None]:
# (rows, columns) of the harmonized 2021 CyTOF table
twentyone_data['cytof_harmonized'].shape

## Standardize Ab Titers

In [None]:
# Antibody-titer columns present in both years, sorted, without the two
# non-feature columns.
abtiters_overlap = sorted(
    set(twenty_data['abtiters'].columns)
    .intersection(twentyone_data['abtiters'].columns)
    .difference({'planned_day_relative_to_boost', 'subject_id'})
)

In [None]:
# Harmonized 2020 antibody titers: day-0 rows, subject id + shared columns.
abtiters_2020 = twenty_data['abtiters']
is_day0 = abtiters_2020['planned_day_relative_to_boost'] == 0
twenty_data['abtiters_harmonized'] = abtiters_2020.loc[is_day0, ['subject_id'] + abtiters_overlap]

new_abtiters_fn = os.path.join(harmony_dir, 'abtiters.{}.2020.day0.pivoted.tsv'.format(curr_mfi_name))
twenty_data['abtiters_harmonized'].to_csv(new_abtiters_fn, index=False, sep='\t')

In [None]:
# Harmonized 2021 antibody titers: day-0 rows, subject id + shared columns.
abtiters_2021 = twentyone_data['abtiters']
is_day0 = abtiters_2021['planned_day_relative_to_boost'] == 0
twentyone_data['abtiters_harmonized'] = abtiters_2021.loc[is_day0, ['subject_id'] + abtiters_overlap]

new_abtiters_fn = os.path.join(harmony_dir, 'abtiters.{}.2021.day0.pivoted.tsv'.format(curr_mfi_name))
twentyone_data['abtiters_harmonized'].to_csv(new_abtiters_fn, index=False, sep='\t')

## Standardize Olink

### Checking Olink features 

In [None]:
def load_api_data(url, verify=True):
    """
    Download a JSON document and parse it into a pandas object.

    Parameters
    ----------
    url : str
        Address of the JSON resource.
    verify : bool, default True
        Verify the server's TLS certificate and host name. Pass False only
        for hosts whose certificate cannot be validated; this restores the
        unverified behaviour this helper had before.

    Returns
    -------
    The object returned by `pandas.read_json` for the downloaded payload.
    """

    import io
    import ssl
    import urllib.request

    # A default context checks certificates; a bare ssl.SSLContext() does not.
    gcontext = ssl.create_default_context()
    if not verify:
        gcontext.check_hostname = False
        gcontext.verify_mode = ssl.CERT_NONE

    # The response gets its own name so the `url` argument is not shadowed.
    with urllib.request.urlopen(url, context=gcontext) as response:
        payload = response.read()

    # read_json is given a file-like object rather than the raw bytes
    return pd.read_json(io.BytesIO(payload))

In [None]:
# column names shared by the 2020 and 2021 Olink tables before any renaming
set(twenty_data['olink'].columns.tolist()).intersection(twentyone_data['olink'].columns.tolist())

In [None]:
# load olink mapper, indexed by olink_id
url = 'https://www.cmi-pb.org:443/api/v2/olink_prot_info'
olink_meta = load_api_data(url).set_index('olink_id')

# olink ids that occur more than once in the table
is_dup_id = olink_meta.index.duplicated(keep=False)

# keep non-duplicated values only, as a {olink_id: uniprot_id} dict.
# The column is selected by name instead of via squeeze(), whose result
# changes shape with the number of columns and rows the API returns.
olink_uniq = olink_meta.loc[~is_dup_id, 'uniprot_id'].to_dict()

# keep only duplicated values
olink_dups = olink_meta.loc[is_dup_id, ]

### Checking Olink feature overlap between 2020 and 2021

In [None]:
# Renamed copy of the 2020 Olink table: columns whose name is a key of
# `olink_uniq` are replaced by the mapped value.
check_olink_2020 = twenty_data['olink'].rename(columns=olink_uniq)

In [None]:
# Renamed copy of the 2021 Olink table: columns whose name is a key of
# `olink_uniq` are replaced by the mapped value.
check_olink_2021 = twentyone_data['olink'].rename(columns=olink_uniq)

In [None]:
# Feature columns shared by the renamed 2020 and 2021 Olink tables, sorted.
shared_olink_cols = set(check_olink_2020.columns) & set(check_olink_2021.columns)
olink_overlaps = sorted(shared_olink_cols - {'planned_day_relative_to_boost', 'subject_id'})

In [None]:
# report how many feature columns the two renamed Olink tables share
print('There are {} overlaps with the unique Uniprot proteins.'.format(len(olink_overlaps)))

In [None]:
# Shared features whose uniprot id belongs to an olink id listed more than once.
dup_uniprot_ids = olink_dups['uniprot_id'].tolist()
dup_overlaps = set(olink_overlaps) & set(dup_uniprot_ids)
print('There are {} overlaps with the duplicates.'.format(len(dup_overlaps)))

In [None]:
# Two columns of each table are not measurements, hence the "- 2".
n_olink_2020 = twenty_data['olink'].shape[1] - 2
n_olink_2021 = twentyone_data['olink'].shape[1] - 2
s = ('Olink data from 2020 has {} protein measurements'.format(n_olink_2020)
     + ' whereas 2021 has {}.'.format(n_olink_2021))
print(s)

It seems the Olink-to-protein-name database has not been updated: I can't find the protein names for the 2021 data.

### Harmonizing features

In [None]:
# Harmonized 2020 Olink: day-0 rows, subject id + shared features.
is_day0 = check_olink_2020['planned_day_relative_to_boost'] == 0
twenty_olink_harmonized = check_olink_2020.loc[is_day0, ['subject_id'] + olink_overlaps].copy()

twenty_harmonized_olink_fn = os.path.join(harmony_dir, 'olink.2020.day0.pivoted.tsv')
twenty_olink_harmonized.to_csv(twenty_harmonized_olink_fn, index=False, sep='\t')
twenty_data['olink_harmonized'] = twenty_olink_harmonized

In [None]:
# Harmonized 2021 Olink: day-0 rows, subject id + shared features.
is_day0 = check_olink_2021['planned_day_relative_to_boost'] == 0
twentyone_olink_harmonized = check_olink_2021.loc[is_day0, ['subject_id'] + olink_overlaps].copy()

twentyone_harmonized_olink_fn = os.path.join(harmony_dir, 'olink.2021.day0.pivoted.tsv')
twentyone_olink_harmonized.to_csv(twentyone_harmonized_olink_fn, index=False, sep='\t')
twentyone_data['olink_harmonized'] = twentyone_olink_harmonized

## Providing all task vectors for training

In [None]:
from IPython.display import display

In [None]:
# Loading the table describing each task
tasks = pd.read_table('results/refs/tasks.tsv')


def _build_task_matrix(use_common_names):
    """
    Outer-join one column per task into a single subject-level table.

    Each row of `tasks` names an assay, a feature column (`fetchname`) and a
    day; the matching values are read from `twenty_data`. The column is
    labelled `<name>_day<day>`, where the name is `fullname` for rnaseq tasks
    when `use_common_names` is true and `fetchname` otherwise.
    """
    matrix = None
    for _, task in tasks.iterrows():
        assay_df = twenty_data[task.assay]
        on_task_day = assay_df.planned_day_relative_to_boost == task.day
        task_data = assay_df.loc[on_task_day, ['subject_id', task.fetchname]]

        if use_common_names and task.assay == 'rnaseq':
            label = task.fullname
        else:
            label = task.fetchname
        task_data.columns = ['subject_id', label + '_day' + str(task.day)]

        # the first task starts the matrix, every later one is joined onto it
        if matrix is None:
            matrix = task_data
        else:
            matrix = matrix.merge(task_data, on='subject_id', how='outer')
    return matrix


# Task matrix using GENE IDS for RNA based tasks
master_tasks = _build_task_matrix(use_common_names=False)
task_fn = os.path.join(harmony_dir, 'task_matrix.feature_names.tsv')
master_tasks.to_csv(task_fn, sep='\t', index=False)

# Task matrix using GENE NAMES for RNA based tasks
master_tasks = _build_task_matrix(use_common_names=True)
task_fn = os.path.join(harmony_dir, 'task_matrix.common_names.tsv')
master_tasks.to_csv(task_fn, sep='\t', index=False)

In [None]:
# list the tables currently stored for 2020
twenty_data.keys()

In [None]:
# preview of the last task matrix built above
master_tasks.head()

## Providing day 0 data for all task vectors (for fold change transformations)

In [None]:
# Day-0 values for every task feature, labelled with GENE IDS for RNA based tasks
zero_day_tasks = None
for _, task in tasks.iterrows():

    # day-0 rows of the task's assay, restricted to the subject and the feature
    assay_df = twenty_data[task.assay]
    on_day0 = assay_df.planned_day_relative_to_boost == 0
    task_data = assay_df.loc[on_day0, ['subject_id', task.fetchname]]
    task_data.columns = ['subject_id', task.fetchname + '_day' + str(0)]

    # the first task starts the matrix, every later one is joined onto it
    if zero_day_tasks is None:
        zero_day_tasks = task_data
    else:
        zero_day_tasks = zero_day_tasks.merge(task_data, on='subject_id', how='outer')

# saving the task matrix
task_fn = os.path.join(harmony_dir, 'zero_day_matrix.feature_names.tsv')
zero_day_tasks.to_csv(task_fn, index=False, sep='\t')

In [None]:
# Day-0 values for every task feature, labelled with GENE NAMES for RNA based tasks
zero_day_tasks = None
for _, task in tasks.iterrows():

    # day-0 rows of the task's assay, restricted to the subject and the feature
    assay_df = twenty_data[task.assay]
    on_day0 = assay_df.planned_day_relative_to_boost == 0
    task_data = assay_df.loc[on_day0, ['subject_id', task.fetchname]]

    # rnaseq features are labelled with their full name, all others keep the fetch name
    label = task.fullname if task.assay == 'rnaseq' else task.fetchname
    task_data.columns = ['subject_id', label + '_day' + str(0)]

    # the first task starts the matrix, every later one is joined onto it
    if zero_day_tasks is None:
        zero_day_tasks = task_data
    else:
        zero_day_tasks = zero_day_tasks.merge(task_data, on='subject_id', how='outer')

# saving the task matrix
task_fn = os.path.join(harmony_dir, 'zero_day_matrix.common_names.tsv')
zero_day_tasks.to_csv(task_fn, index=False, sep='\t')

In [None]:
# preview only the first rows, like the master_tasks preview, instead of
# rendering the whole matrix
zero_day_tasks.head()

## Providing metadata in a basic format

In [None]:
# Specimen-level columns that are removed to get one row per subject.
drop_cols = ['specimen_id', 'actual_day_relative_to_boost', 'planned_day_relative_to_boost', 'visit', 'dataset']

# first metadata row of every subject, without the specimen-level columns
twenty_basic_meta = (twenty_data['master_meta']
                     .drop_duplicates('subject_id')
                     .drop(columns=drop_cols))

twenty_basic_fn = os.path.join(harmony_dir, 'clinical_metadata.2020.tsv')
twenty_basic_meta.to_csv(twenty_basic_fn, index=False, sep='\t')

In [None]:
# first metadata row of every 2021 subject, without the columns in `drop_cols`
twentyone_basic_meta = (twentyone_data['master_meta']
                        .drop_duplicates('subject_id')
                        .drop(columns=drop_cols))

twentyone_basic_fn = os.path.join(harmony_dir, 'clinical_metadata.2021.tsv')
twentyone_basic_meta.to_csv(twentyone_basic_fn, index=False, sep='\t')

# Summarize the harmonization process

In [None]:
# Column counts per assay: full 2020 table, full 2021 table and the
# harmonized 2021 table.
summary = pd.DataFrame(
    [
        [
            assay,
            twenty_data[assay].shape[1],
            twentyone_data[assay].shape[1],
            twentyone_data['{}_harmonized'.format(assay)].shape[1],
        ]
        for assay in assays
    ],
    columns=['Assay', '2020', '2021', 'Harmonized'],
)

In [None]:
# show the summary ordered by assay name
summary.sort_values('Assay')

In [123]:
# NOTE(review): scratch arithmetic, unrelated to the pipeline - candidate for removal
7 + 7 

14

In [62]:
# spot check: values of a single gene column in the harmonized 2021 RNA-seq table
twentyone_data['rnaseq_harmonized'].loc[:, 'ENSG00000229807']

0      2.863
1      9.752
2      5.806
3      0.017
4      0.053
5      3.846
6      5.879
7      0.064
8      6.097
9      4.294
10     4.834
11     4.547
12     7.453
13    10.226
14     5.942
15     9.641
16     0.019
17     5.255
18     0.019
19     6.176
20     0.069
21     5.796
22    13.175
23     2.752
24    10.943
25     3.035
26     0.000
27     0.007
28     4.559
29     4.734
30     0.008
31     9.138
32     5.906
33     0.000
34     8.604
35     0.030
Name: ENSG00000229807, dtype: float64