In [1]:
import os 
import pandas as pd 
import numpy as np 
# Project root: defaults to the original cluster location, but can be redirected
# with the CMI_PB_PROJECT_DIR environment variable so the notebook can run on
# another machine without editing this cell. All paths below are relative to it.
os.chdir(os.environ.get('CMI_PB_PROJECT_DIR', '/mnt/BioHome/jreyna/jreyna/projects/cmi-pb-preds/'))

# Root directory for every processed table written by this notebook.
outdir = 'results/main/cmi_pb_datasets/processed/'
os.makedirs(outdir, exist_ok=True)

In [2]:
# (short assay key, basename of the raw CSV) pairs; the two tuples below are
# kept index-aligned because later cells look up assays[i] for longnames[i].
_assay_files = (
    ('abtiters', 'ab_titer'),
    ('cytof', 'live_cell_percentages'),
    ('olink', 'olink_prot_exp'),
    ('rnaseq', 'rnaseq'),
)
assays, longnames = zip(*_assay_files)

In [3]:
# loading gencode data (headerless table, so the column names are assigned here)
gencode = pd.read_table('results/refs/gencode/gencode.v38lift37.annotation.protein_coding.bed', header=None)
gencode.columns = ['chr', 'start', 'end', 'strand', 'gene_id', 'gene_name']

# Strip everything from the first '.' onwards to get the non-versioned gene ID.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default; the raw string avoids the invalid '\.' escape sequence.
gencode['gene_nonversioned_id'] = gencode['gene_id'].str.replace(r'\..*', '', regex=True)

In [4]:
# Directory for the full (non-harmonized) per-assay tables. It is created here
# because the cells that follow write into it.
interm_dir = os.path.join(outdir, 'full')
os.makedirs(interm_dir, exist_ok=True)

## Load the 2020 data

In [5]:
twenty_data = {}

# get meta master table: read the raw 2020 subject and specimen tables
for metatable in ('subject', 'specimen'):
    fn = 'results/main/cmi_pb_datasets/raw/2020LD_{}.csv'.format(metatable)
    twenty_data[metatable] = pd.read_csv(fn)

subjects = twenty_data['subject']
specimen = twenty_data['specimen']

# one row per specimen, annotated with the subject-level fields
master_meta = subjects.merge(specimen, on='subject_id')
master_meta = master_meta[['subject_id',
                           'specimen_id',
                           'infancy_vac',
                           'biological_sex',
                           'year_of_birth',
                           'date_of_boost',
                           'actual_day_relative_to_boost',
                           'planned_day_relative_to_boost',
                           'ethnicity',
                           'race',
                           'dataset',
                           'specimen_type',
                           'visit']]

# defining the meta columns (used to remove columns later on); everything in
# master_meta except subject_id and planned_day_relative_to_boost, which the
# assay tables keep as keys
meta_cols = ['specimen_id', 'infancy_vac', 'biological_sex',
             'year_of_birth', 'date_of_boost', 'actual_day_relative_to_boost',
             'ethnicity', 'race', 'dataset',
             'specimen_type', 'visit']

twenty_data['master_meta'] = master_meta

# save meta; index=False matches how the 2021 metadata and all other tables in
# this notebook are written (no unnamed row-number column in the file)
meta_fn = os.path.join(interm_dir, 'meta.2020.pivoted.tsv.gz')
twenty_data['master_meta'].to_csv(meta_fn, index=False, sep='\t')

In [6]:
# Pivot each 2020 assay from long format to one row per specimen, attach the
# metadata, and keep only subject_id, planned day and the measurements.
for i, longname in enumerate(longnames):
    fn = 'results/main/cmi_pb_datasets/raw/2020LD_{}.csv'.format(longname)
    df = pd.read_table(fn, sep=',')

    print(fn)

    if assays[i] == 'rnaseq':
        # Drop the '.<version>' suffix from the gene IDs. regex=True is explicit
        # because pandas >= 2.0 treats the pattern literally by default.
        df['ensembl_gene_id'] = df['versioned_ensembl_gene_id'].str.replace(r'\.[0-9]+', '', regex=True)
        df = df.pivot(index='specimen_id', columns='ensembl_gene_id', values='tpm')
        df = master_meta.merge(df, on='specimen_id')

    elif assays[i] == 'cytof':
        df = df.pivot(index='specimen_id', columns='cell_type_name', values='percent_live_cell')
        df = master_meta.merge(df, on='specimen_id')

    elif assays[i] == 'olink':
        # Still getting duplicate specimens?
        # keep NPX measurements that passed QC and lie above the quantitation limit
        df = df[df['unit'] == "Normalized Protein eXpression"]
        df = df[df['quality_control'] == 'Pass']
        df = df[df['protein_expression'] > df['lower_limit_of_quantitation']]
        df = df.pivot(index='specimen_id', columns='uniprot_id', values='protein_expression')
        df = master_meta.merge(df, on='specimen_id')

    elif assays[i] == 'abtiters':
        aglist = ['1% PFA PT', 'PT', 'PRN', 'FHA', 'FIM2/3']
        # Work on an explicit copy and assign the results back: the original
        # inplace replace on a .loc slice is chained assignment, which warns in
        # current pandas and silently does nothing under copy-on-write.
        df = df.loc[df.antigen.isin(aglist), :].copy()
        df['antigen'] = df['antigen'].replace('1% PFA PT', 'PT')
        df['isotype_antigen'] = df['isotype'] + '-' + df['antigen']
        df = df.pivot(index='specimen_id', columns='isotype_antigen', values='MFI_normalised')
        df = master_meta.merge(df, on='specimen_id')

    twenty_data[assays[i]] = df.drop(meta_cols, axis=1)

results/main/cmi_pb_datasets/raw/2020LD_ab_titer.csv
results/main/cmi_pb_datasets/raw/2020LD_live_cell_percentages.csv
results/main/cmi_pb_datasets/raw/2020LD_olink_prot_exp.csv
results/main/cmi_pb_datasets/raw/2020LD_rnaseq.csv


In [7]:
# Report the number of distinct subjects found in every 2020 table.
for assay, df in twenty_data.items():
    n_subjects = df['subject_id'].nunique()
    print(assay, n_subjects)

subject 60
specimen 60
master_meta 60
abtiters 58
cytof 20
olink 18
rnaseq 36


In [8]:
# Write one file per assay and planned day; the day column itself is dropped
# from the written table because it is encoded in the file name.
for assay, df in twenty_data.items():
    if assay not in ('cytof', 'olink', 'rnaseq', 'abtiters'):
        continue  # only the four assay tables are split by day
    print(assay)
    by_day = df.groupby('planned_day_relative_to_boost')
    for day, day_df in by_day:
        outfn = os.path.join(interm_dir, '{}.2020.day{}.pivoted.tsv.gz'.format(assay, day))
        without_day = day_df.drop(columns='planned_day_relative_to_boost')
        without_day.to_csv(outfn, sep='\t', index=False)
        print(day, day_df.shape)

abtiters
0 (58, 25)
1 (57, 25)
3 (57, 25)
7 (57, 25)
14 (57, 25)
30 (54, 25)
90 (51, 25)
386 (1, 25)
402 (1, 25)
428 (1, 25)
cytof
0 (20, 27)
1 (20, 27)
3 (20, 27)
7 (20, 27)
14 (20, 27)
olink
0 (18, 258)
1 (18, 258)
3 (18, 258)
7 (18, 258)
14 (18, 258)
rnaseq
0 (36, 58349)
1 (36, 58349)
3 (36, 58349)
7 (36, 58349)
14 (36, 58349)


## Load the 2021 data

In [9]:
twentyone_data = {}

# read the raw 2021 subject and specimen tables
for metatable in ('subject', 'specimen'):
    fn = 'results/main/cmi_pb_datasets/raw/2021BD_{}.csv'.format(metatable)
    df = pd.read_csv(fn)
    twentyone_data[metatable] = df

subjects, specimen = twentyone_data['subject'], twentyone_data['specimen']

# join specimens to their subject and keep the same metadata columns as for 2020
master_meta = subjects.merge(specimen, on='subject_id').loc[:, [
    'subject_id',
    'specimen_id',
    'infancy_vac',
    'biological_sex',
    'year_of_birth',
    'date_of_boost',
    'actual_day_relative_to_boost',
    'planned_day_relative_to_boost',
    'ethnicity',
    'race',
    'dataset',
    'specimen_type',
    'visit',
]]
twentyone_data['master_meta'] = master_meta

In [10]:
# Pivot each 2021 assay from long format to one row per specimen, attach the
# metadata, and keep only subject_id, planned day and the measurements.
for i, longname in enumerate(longnames):

    fn = 'results/main/cmi_pb_datasets/raw/2021BD_{}.csv'.format(longname)
    df = pd.read_table(fn, sep=',')

    if assays[i] == 'rnaseq':
        # Drop the '.<version>' suffix from the gene IDs. regex=True is explicit
        # because pandas >= 2.0 treats the pattern literally by default.
        df['ensembl_gene_id'] = df['versioned_ensembl_gene_id'].str.replace(r'\.[0-9]+', '', regex=True)
        df = df.pivot(index='specimen_id', columns='ensembl_gene_id', values='tpm')
        df = master_meta.merge(df, on='specimen_id')

    elif assays[i] == 'cytof':
        df = df.pivot(index='specimen_id', columns='cell_type_name', values='percent_live_cell')
        df = master_meta.merge(df, on='specimen_id')

    elif assays[i] == 'olink':
        # NOTE(review): unlike the 2020 cell, no unit / QC / quantitation-limit
        # filter is applied here - confirm this difference is intended.
        df = df.pivot(index='specimen_id', columns='uniprot_id', values='protein_expression')
        df = master_meta.merge(df, on='specimen_id')

    elif assays[i] == 'abtiters':
        aglist = ['PT', 'PRN', 'FHA', 'FIM2/3']
        # explicit copy so the new column is not written onto a slice of df
        df = df.loc[df.antigen.isin(aglist), :].copy()
        df['isotype_antigen'] = df['isotype'] + '-' + df['antigen']
        df = df.pivot(index='specimen_id', columns='isotype_antigen', values='MFI_normalised')
        df = master_meta.merge(df, on='specimen_id')

    twentyone_data[assays[i]] = df.drop(meta_cols, axis=1)
    

In [11]:
# Scratch check: rebuild the 2021 CyTOF table with all metadata columns kept.
# The result stays in `df` only and is not stored in twentyone_data.
longname = 'live_cell_percentages'
df = (
    pd.read_csv('results/main/cmi_pb_datasets/raw/2021BD_{}.csv'.format(longname))
    .pivot(index='specimen_id', columns='cell_type_name', values='percent_live_cell')
)
df = master_meta.merge(df, on='specimen_id')

In [12]:
# Report the number of distinct subjects found in every 2021 table.
for assay, df in twentyone_data.items():
    n_subjects = df['subject_id'].nunique()
    print(assay, n_subjects)

subject 36
specimen 36
master_meta 36
abtiters 33
cytof 33
olink 36
rnaseq 36


In [13]:
os.makedirs(interm_dir, exist_ok=True)


def _save_2021_assay(assay):
    """Write twentyone_data[assay] without its planned-day column; return the path."""
    path = os.path.join(interm_dir, '{}.2021.day0.pivoted.tsv.gz'.format(assay))
    table = twentyone_data[assay].drop(columns='planned_day_relative_to_boost')
    table.to_csv(path, sep='\t', index=False)
    return path


# NOTE(review): the files are labelled "day0" but no day filter is applied
# here - confirm the 2021 tables only contain day-0 specimens.
abtiters_fn = _save_2021_assay('abtiters')
cytof_fn = _save_2021_assay('cytof')
olink_fn = _save_2021_assay('olink')
rnaseq_fn = _save_2021_assay('rnaseq')

# save meta
meta_fn = os.path.join(interm_dir, 'meta.2021.pivoted.tsv.gz')
twentyone_data['master_meta'].to_csv(meta_fn, sep='\t', index=False)

In [14]:
# summary21 = []
# for assay, df in twentyone_data.items():
#     summary21.append([assay, df.subject_id.nunique()])

# summary20 = []
# for assay, df in twenty_data.items():
#     summary20.append([df.subject_id.nunique()])

# summary = pd.concat([pd.DataFrame(summary21), pd.DataFrame(summary20)], axis=1)
# summary.columns = ('table', 'nsamples21', 'nsamples20')
# summary = summary[['table', 'nsamples20', 'nsamples21']]

In [15]:
# fn = 'results/main/cmi_pb_datasets/2020LD_{}.csv'.format('olink_prot_exp')
# df = pd.read_table(fn, sep=',')

## Standardize RNA-seq

The RNA-seq dataset includes a lot more variables and processing so there are many steps we are including for standardization:

    1) remove zero variance genes,
    2) remove mitochondrial genes, 
    3) keep expressed genes, defined as those with TPM > 1 in more than a given proportion of samples (`cut_filter`),
    4) intersect genes across 2020 and 2021 and finally make harmonized 2020 and 2021 tables 

In [16]:
# Collect the non-versioned IDs of all annotated genes located on chrM.
on_chrM = gencode['chr'] == 'chrM'
mito_genes = set(gencode.loc[on_chrM, 'gene_nonversioned_id'].unique())

In [17]:
# value used across both RNA-seq datasets: a gene counts as expressed only if
# its TPM exceeds 1 in more than this fraction of the day-0 samples
# (see the `expressed_genes` computations below)
cut_filter = 0.3

### RNA-seq for 2020

In [18]:
# Restrict the 2020 RNA-seq table to day-0 rows and gene (ENSG*) columns, then
# collect the genes whose variance across those rows is exactly zero.
rnaseq_2020 = twenty_data['rnaseq']
tmp = rnaseq_2020.loc[rnaseq_2020['planned_day_relative_to_boost'] == 0]
tmp = tmp.loc[:, tmp.columns.str.startswith('ENSG')]
tmpvars = tmp.var()
zero_var_genes = set(tmpvars.index[(tmpvars == 0).to_numpy()])

In [19]:
# A gene is "expressed" when its TPM is above 1 in more than cut_filter
# (a fraction) of the day-0 samples held in tmp.
min_samples = cut_filter * len(tmp)
n_expressing = tmp.gt(1).sum()
expressed_genes = set(n_expressing[n_expressing > min_samples].index)

In [20]:
# Final gene list: expressed, not mitochondrial, not zero-variance, and present
# in the gencode annotation table loaded above.
annotated_genes = set(gencode['gene_nonversioned_id'])
keep_genes = (expressed_genes - mito_genes - zero_var_genes) & annotated_genes

In [21]:
# make the final new_rnaseq table
# `tmp` only holds ENSG columns, so the non-gene columns (subject_id, planned
# day) must be taken from the full table, restricted to the same day-0 rows.
day0_rnaseq = twenty_data['rnaseq'].loc[tmp.index]
new_rnaseq1 = day0_rnaseq.loc[:, ~day0_rnaseq.columns.str.match('ENSG')]
# sorted list instead of a set: pandas does not accept a set as an indexer, and
# sorting makes the column order reproducible between runs
new_rnaseq2 = tmp.loc[:, sorted(keep_genes)]
twenty_data['rnaseq_std'] = pd.concat([new_rnaseq1, new_rnaseq2], axis=1)

### RNA-seq for 2021

In [22]:
# Restrict the 2021 RNA-seq table to day-0 rows and gene (ENSG*) columns, then
# collect the genes whose variance across those rows is exactly zero.
rnaseq_2021 = twentyone_data['rnaseq']
tmp = rnaseq_2021.loc[rnaseq_2021['planned_day_relative_to_boost'] == 0]
tmp = tmp.loc[:, tmp.columns.str.startswith('ENSG')]
tmpvars = tmp.var()
zero_var_genes = set(tmpvars.index[(tmpvars == 0).to_numpy()])

In [23]:
# A gene is "expressed" when its TPM is above 1 in more than cut_filter
# (a fraction) of the day-0 samples held in tmp.
min_samples = cut_filter * len(tmp)
n_expressing = tmp.gt(1).sum()
expressed_genes = set(n_expressing[n_expressing > min_samples].index)

In [24]:
# Final gene list: expressed, not mitochondrial, not zero-variance, and present
# in the gencode annotation table loaded above.
annotated_genes = set(gencode['gene_nonversioned_id'])
keep_genes = (expressed_genes - mito_genes - zero_var_genes) & annotated_genes

In [25]:
# make the final new_rnaseq table
# `tmp` only holds ENSG columns, so the non-gene columns (subject_id, planned
# day) must be taken from the full table, restricted to the same day-0 rows.
day0_rnaseq = twentyone_data['rnaseq'].loc[tmp.index]
new_rnaseq1 = day0_rnaseq.loc[:, ~day0_rnaseq.columns.str.match('ENSG')]
# sorted list instead of a set: pandas does not accept a set as an indexer, and
# sorting makes the column order reproducible between runs
new_rnaseq2 = tmp.loc[:, sorted(keep_genes)]
twentyone_data['rnaseq_std'] = pd.concat([new_rnaseq1, new_rnaseq2], axis=1)

### Harmonize the RNA-seq datasets 

In [26]:
def _ensg_columns(table):
    """Return the set of ENSG identifiers found among the column names of `table`."""
    matches = table.columns.str.extract('(ENSG[0-9]+)', expand=False)
    return set(matches.dropna())


# genes that survived filtering in both years, in sorted order
twenty_genes = _ensg_columns(twenty_data['rnaseq_std'])
twentyone_genes = _ensg_columns(twentyone_data['rnaseq_std'])
shared_genes = sorted(twenty_genes & twentyone_genes)

In [27]:
# Harmonized 2020 RNA-seq: non-gene columns plus the shared genes, day-0 rows
# only, with the planned-day column removed afterwards.
rnaseq_2020 = twenty_data['rnaseq']
is_gene_col = rnaseq_2020.columns.str.match('ENSG')
new_rnaseq1 = rnaseq_2020.loc[:, ~is_gene_col]
new_rnaseq2 = rnaseq_2020[shared_genes]
is_day0 = new_rnaseq1['planned_day_relative_to_boost'] == 0
harmonized_df = pd.concat([new_rnaseq1, new_rnaseq2], axis=1).loc[is_day0]
harmonized_df = harmonized_df.drop(columns='planned_day_relative_to_boost')
twenty_data['rnaseq_harmonized'] = harmonized_df

In [28]:
# Harmonized 2021 RNA-seq: non-gene columns plus the shared genes, day-0 rows
# only, with the planned-day column removed afterwards.
rnaseq_2021 = twentyone_data['rnaseq']
is_gene_col = rnaseq_2021.columns.str.match('ENSG')
new_rnaseq1 = rnaseq_2021.loc[:, ~is_gene_col]
new_rnaseq2 = rnaseq_2021[shared_genes]
is_day0 = new_rnaseq1['planned_day_relative_to_boost'] == 0
harmonized_df = pd.concat([new_rnaseq1, new_rnaseq2], axis=1).loc[is_day0]
harmonized_df = harmonized_df.drop(columns='planned_day_relative_to_boost')
twentyone_data['rnaseq_harmonized'] = harmonized_df

In [29]:
# Directory that receives every harmonized table written from here on.
harmony_dir = os.path.join(outdir, 'harmonized/')
os.makedirs(harmony_dir, exist_ok=True)

# both years are written with the same column order: subject_id, then genes
harmonized_cols = ['subject_id'] + shared_genes

rnaseq_2020_fn = os.path.join(harmony_dir, 'rnaseq.2020.day0.pivoted.tsv.gz')
twenty_data['rnaseq_harmonized'][harmonized_cols].to_csv(rnaseq_2020_fn, sep='\t', index=False)

rnaseq_2021_fn = os.path.join(harmony_dir, 'rnaseq.2021.day0.pivoted.tsv.gz')
twentyone_data['rnaseq_harmonized'][harmonized_cols].to_csv(rnaseq_2021_fn, sep='\t', index=False)

In [30]:
# Display the (rows, columns) shape of the harmonized 2021 RNA-seq table.
twentyone_data['rnaseq_harmonized'].shape

(36, 11589)

## Standardize CyTOF

In [31]:
# Columns present in both CyTOF tables, sorted, without the two key columns.
cytof_cols_2020 = set(twenty_data['cytof'].columns)
cytof_cols_2021 = set(twentyone_data['cytof'].columns)
shared_cells = sorted(cytof_cols_2020 & cytof_cols_2021)
for key_col in ('subject_id', 'planned_day_relative_to_boost'):
    shared_cells.remove(key_col)

In [32]:
# Harmonized CyTOF tables: day-0 rows, subject_id plus the shared columns.
cytof_cols = ['subject_id'] + shared_cells

day0_rows = twenty_data['cytof']['planned_day_relative_to_boost'] == 0
twenty_data['cytof_harmonized'] = twenty_data['cytof'].loc[day0_rows, cytof_cols]

day0_rows = twentyone_data['cytof']['planned_day_relative_to_boost'] == 0
twentyone_data['cytof_harmonized'] = twentyone_data['cytof'].loc[day0_rows, cytof_cols]

In [33]:
# Write the harmonized 2020 CyTOF table (tab-separated, no index column).
cytof_2020_fn = os.path.join(harmony_dir, 'cytof.2020.day0.pivoted.tsv.gz')
twenty_data['cytof_harmonized'].to_csv(path_or_buf=cytof_2020_fn, sep='\t', index=False)

In [34]:
# Write the harmonized 2021 CyTOF table (tab-separated, no index column).
cytof_2021_fn = os.path.join(harmony_dir, 'cytof.2021.day0.pivoted.tsv.gz')
twentyone_data['cytof_harmonized'].to_csv(path_or_buf=cytof_2021_fn, sep='\t', index=False)

In [35]:
# Display the (rows, columns) shape of the harmonized 2021 CyTOF table.
twentyone_data['cytof_harmonized'].shape

(33, 23)

## Standardize Ab Titers

In [36]:
# Measurement columns shared by the 2020 and 2021 antibody-titer tables
# (key columns excluded), in sorted order.
abtiters_overlap = sorted(
    set(twenty_data['abtiters'].columns)
    .intersection(twentyone_data['abtiters'].columns)
    .difference({'planned_day_relative_to_boost', 'subject_id'})
)

In [37]:
# Harmonized 2020 antibody titers: day-0 rows, subject_id plus shared columns.
is_day0 = twenty_data['abtiters']['planned_day_relative_to_boost'] == 0
twenty_data['abtiters_harmonized'] = twenty_data['abtiters'].loc[is_day0, ['subject_id'] + abtiters_overlap]

new_abtiters_fn = os.path.join(harmony_dir, 'abtiters.2020.day0.pivoted.tsv.gz')
twenty_data['abtiters_harmonized'].to_csv(new_abtiters_fn, index=False, sep='\t')

In [38]:
# Harmonized 2021 antibody titers: day-0 rows, subject_id plus shared columns.
is_day0 = twentyone_data['abtiters']['planned_day_relative_to_boost'] == 0
twentyone_data['abtiters_harmonized'] = twentyone_data['abtiters'].loc[is_day0, ['subject_id'] + abtiters_overlap]

new_abtiters_fn = os.path.join(harmony_dir, 'abtiters.2021.day0.pivoted.tsv.gz')
twentyone_data['abtiters_harmonized'].to_csv(new_abtiters_fn, index=False, sep='\t')

## Standardize Olink

### Checking Olink features 

In [39]:
def load_api_data(url, verify=True):
    """
    Loading data using the API.

    Fetches `url`, reads the whole response body and parses it as JSON with
    pandas.

    Parameters
    ----------
    url : str
        Address to fetch.
    verify : bool, default True
        When True the TLS certificate and host name of an https endpoint are
        checked. Pass False to skip those checks, which is what the previous
        bare ``ssl.SSLContext()`` did; only do so for an endpoint you trust.

    Returns
    -------
    The object produced by ``pd.read_json`` for the response body.

    Raises
    ------
    urllib.error.URLError
        If the request fails (including a failed certificate check).
    """

    import io
    import ssl
    import urllib.request

    gcontext = ssl.create_default_context()
    if not verify:
        # check_hostname must be switched off before verify_mode can be CERT_NONE
        gcontext.check_hostname = False
        gcontext.verify_mode = ssl.CERT_NONE

    # a separate name for the response keeps the `url` argument intact
    with urllib.request.urlopen(url, context=gcontext) as response:
        payload = response.read()

    # read_json expects a path or file-like object; passing the raw JSON text
    # directly is deprecated, so wrap the decoded body in a buffer
    return pd.read_json(io.StringIO(payload.decode('utf-8')))

In [40]:
# Display the column names shared by the 2020 and 2021 Olink tables (before any renaming).
set(twenty_data['olink'].columns.tolist()).intersection(twentyone_data['olink'].columns.tolist())

{'O14625',
 'O43508',
 'P01133',
 'P01579',
 'P02778',
 'P05112',
 'P05231',
 'P09603',
 'P10145',
 'P10147',
 'P13232',
 'P13236',
 'P13500',
 'P14210',
 'P15692',
 'P22301',
 'P35225',
 'P39900',
 'P48061',
 'P50591',
 'P51671',
 'P80075',
 'P80098',
 'Q07325',
 'Q14116',
 'Q99616',
 'Q99731',
 'planned_day_relative_to_boost',
 'subject_id'}

In [41]:
# load olink mapper (indexed by olink_id)
url = 'https://www.cmi-pb.org:443/api/v2/olink_prot_info'
olink_meta = load_api_data(url).set_index('olink_id')

# rows whose olink_id appears more than once in the mapper
is_dup_id = olink_meta.index.duplicated(keep=False)

# keep non-duplicated values only, as an {olink_id: uniprot_id} dict.
# The column is selected by name rather than with squeeze(), which only gives
# this mapping when the frame happens to have exactly one column.
olink_uniq = olink_meta.loc[~is_dup_id, 'uniprot_id'].to_dict()

# keep only duplicated values
olink_dups = olink_meta.loc[is_dup_id]

### Checking Olink feature overlap

In [42]:
# Copy of the 2020 Olink table with columns renamed through the olink mapper.
check_olink_2020 = twenty_data['olink'].rename(columns=olink_uniq)

In [43]:
# Copy of the 2021 Olink table with columns renamed through the olink mapper.
check_olink_2021 = twentyone_data['olink'].rename(columns=olink_uniq)

In [44]:
# Measurement columns shared by the renamed 2020 and 2021 Olink tables
# (key columns excluded), in sorted order.
olink_cols_2020 = set(check_olink_2020.columns)
olink_cols_2021 = set(check_olink_2021.columns)
olink_overlaps = sorted(
    (olink_cols_2020 & olink_cols_2021) - {'planned_day_relative_to_boost', 'subject_id'}
)

In [45]:
# Report how many proteins the two years share.
n_overlaps = len(olink_overlaps)
print('There are {} overlaps with the unique Uniprot proteins.'.format(n_overlaps))

There are 27 overlaps with the unique Uniprot proteins.


In [46]:
# Shared proteins whose uniprot_id belongs to a duplicated olink_id entry.
dup_overlaps = set(olink_dups['uniprot_id']).intersection(olink_overlaps)
n_dup_overlaps = len(dup_overlaps)
print('There are {} overlaps with the duplicates.'.format(n_dup_overlaps))

There are 0 overlaps with the duplicates.


In [47]:
# Number of measurement columns per year (two key columns are subtracted).
n_proteins_2020 = twenty_data['olink'].shape[1] - 2
n_proteins_2021 = twentyone_data['olink'].shape[1] - 2
s = (
    'Olink data from 2020 has {} protein measurements'.format(n_proteins_2020)
    + ' whereas 2021 has {}.'.format(n_proteins_2021)
)
print(s)

Olink data from 2020 has 256 protein measurements whereas 2021 has 45.


It seems the Olink-to-protein-name mapping served by the API has not been updated: I can't find the protein names for the 2021 data.

### Harmonizing features

In [48]:
# Harmonized 2020 Olink: day-0 rows, subject_id plus the shared proteins.
is_day0 = check_olink_2020['planned_day_relative_to_boost'] == 0
twenty_olink_harmonized = check_olink_2020.loc[is_day0, ['subject_id'] + olink_overlaps]

twenty_harmonized_olink_fn = os.path.join(harmony_dir, 'olink.2020.day0.pivoted.tsv.gz')
twenty_olink_harmonized.to_csv(twenty_harmonized_olink_fn, index=False, sep='\t')
twenty_data['olink_harmonized'] = twenty_olink_harmonized

In [49]:
# Harmonized 2021 Olink: day-0 rows, subject_id plus the shared proteins.
is_day0 = check_olink_2021['planned_day_relative_to_boost'] == 0
twentyone_olink_harmonized = check_olink_2021.loc[is_day0, ['subject_id'] + olink_overlaps]

twentyone_harmonized_olink_fn = os.path.join(harmony_dir, 'olink.2021.day0.pivoted.tsv.gz')
twentyone_olink_harmonized.to_csv(twentyone_harmonized_olink_fn, index=False, sep='\t')
twentyone_data['olink_harmonized'] = twentyone_olink_harmonized

## Providing all task vectors for training

In [50]:
from IPython.display import display

In [51]:
# Loading the table describing each task
tasks = pd.read_table('results/refs/tasks.tsv')


def _build_task_matrix(task_table, assay_tables, use_common_names=False):
    """
    Build a subject-by-task matrix.

    For every row of `task_table`, the column `fetchname` is taken from
    `assay_tables[assay]` for the rows whose planned day equals the task's
    `day`, and all tasks are outer-merged on subject_id.

    Parameters
    ----------
    task_table : DataFrame with the columns assay, fetchname, fullname, day.
    assay_tables : dict mapping assay name to its pivoted table.
    use_common_names : bool
        If True, RNA-seq task columns are labelled with `fullname` instead of
        `fetchname`; other assays always use `fetchname`.

    Returns
    -------
    DataFrame with subject_id and one '<label>_day<day>' column per task.

    Raises
    ------
    ValueError
        If `task_table` has no rows.
    """
    matrix = None
    for _, task in task_table.iterrows():
        # extracting the required columns
        table = assay_tables[task.assay]
        on_task_day = table.planned_day_relative_to_boost == task.day
        task_data = table.loc[on_task_day, ['subject_id', task.fetchname]].copy()

        if use_common_names and task.assay == 'rnaseq':
            label = task.fullname
        else:
            label = task.fetchname
        task_data.columns = ['subject_id', label + '_day' + str(task.day)]

        # merging the data together
        if matrix is None:
            matrix = task_data
        else:
            matrix = matrix.merge(task_data, on='subject_id', how='outer')

    if matrix is None:
        raise ValueError('the task table is empty; no task matrix can be built')
    return matrix


# Making a task vector using GENE IDS for RNA based tasks
master_tasks = _build_task_matrix(tasks, twenty_data)

# saving the task matrix
task_fn = os.path.join(harmony_dir, 'task_matrix.feature_names.tsv.gz')
master_tasks.to_csv(task_fn, sep='\t', index=False)

# Making a task vector using GENE NAMES for RNA based tasks
master_tasks = _build_task_matrix(tasks, twenty_data, use_common_names=True)

# saving the task matrix
task_fn = os.path.join(harmony_dir, 'task_matrix.common_names.tsv.gz')
master_tasks.to_csv(task_fn, sep='\t', index=False)

In [52]:
# Preview the first rows of the task matrix built last (common-name columns).
master_tasks.head()

Unnamed: 0,subject_id,IgG-PT_day14,IgG-FHA_day14,IgG-PRN_day14,Monocytes_day1,ASCs (Plasmablasts)_day7,CD4Tcells_day3,CCL3_day3,IL-6_day3,NFKBIA_day7,XIST_day14
0,1,12.51386,0.084437,8.147366,,,,46.41,3.814,651.685,146.247
1,3,7.041547,11.564209,8.076863,,,,26.204,2.646,575.733,121.084
2,4,5.745959,2.93768,6.853723,7.211965,1.77,8.700555,13.353,3.77,550.87,0.132
3,5,5.327203,0.084437,5.062398,,,,20.618,2.07,1098.404,0.066
4,6,8.856575,2.487012,0.457834,41.380502,3.5,41.706336,19.606,2.163,686.3,127.023


## Providing day 0 data for all task vectors (for fold change transformations)

In [53]:
# Day-0 values of every task feature, with RNA-seq columns labelled by GENE ID.
for position, (idd, task) in enumerate(tasks.iterrows()):

    # pull subject_id and the task feature from the day-0 rows of its assay
    assay_table = twenty_data[task.assay]
    on_day0 = assay_table['planned_day_relative_to_boost'] == 0
    task_data = assay_table.loc[on_day0, ['subject_id', task.fetchname]]
    task_data.columns = ['subject_id', task.fetchname + '_day' + str(0)]

    # the first task starts the matrix, later ones are outer-merged onto it
    if position == 0:
        zero_day_tasks = task_data
    else:
        zero_day_tasks = zero_day_tasks.merge(task_data, how='outer', on='subject_id')

# saving the task matrix
task_fn = os.path.join(harmony_dir, 'zero_day_matrix.feature_names.tsv.gz')
zero_day_tasks.to_csv(task_fn, index=False, sep='\t')

In [54]:
# Day-0 values of every task feature, with RNA-seq columns labelled by GENE NAME.
for position, (idd, task) in enumerate(tasks.iterrows()):

    # pull subject_id and the task feature from the day-0 rows of its assay
    assay_table = twenty_data[task.assay]
    on_day0 = assay_table['planned_day_relative_to_boost'] == 0
    task_data = assay_table.loc[on_day0, ['subject_id', task.fetchname]]

    # RNA-seq tasks are labelled with the task's fullname, all others with fetchname
    label = task.fullname if task.assay == 'rnaseq' else task.fetchname
    task_data.columns = ['subject_id', label + '_day' + str(0)]

    # the first task starts the matrix, later ones are outer-merged onto it
    if position == 0:
        zero_day_tasks = task_data
    else:
        zero_day_tasks = zero_day_tasks.merge(task_data, how='outer', on='subject_id')

# saving the task matrix
task_fn = os.path.join(harmony_dir, 'zero_day_matrix.common_names.tsv.gz')
zero_day_tasks.to_csv(task_fn, index=False, sep='\t')

## Providing meta data into a basic format

In [55]:
# Subject-level metadata for 2020: one row per subject (first specimen kept),
# without the specimen/visit-specific columns.
drop_cols = ['specimen_id', 'actual_day_relative_to_boost', 'planned_day_relative_to_boost', 'visit', 'dataset']
# drop() returns a new frame here; the earlier inplace drop on the result of
# drop_duplicates raised a SettingWithCopyWarning
twenty_basic_meta = twenty_data['master_meta'].drop_duplicates('subject_id').drop(columns=drop_cols)

twenty_basic_fn = os.path.join(harmony_dir, 'clinical_metadata.2020.tsv.gz')
twenty_basic_meta.to_csv(twenty_basic_fn, sep='\t', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [56]:
# Subject-level metadata for 2021: one row per subject (first specimen kept),
# without the columns listed in drop_cols (defined in the 2020 cell above).
# drop() returns a new frame instead of mutating the result of drop_duplicates
# in place, which avoids the SettingWithCopyWarning.
twentyone_basic_meta = twentyone_data['master_meta'].drop_duplicates('subject_id').drop(columns=drop_cols)

twentyone_basic_fn = os.path.join(harmony_dir, 'clinical_metadata.2021.tsv.gz')
twentyone_basic_meta.to_csv(twentyone_basic_fn, sep='\t', index=False)

## Summarize the harmonization process

In [57]:
# Column counts per assay: full 2020 table, full 2021 table, and the
# harmonized 2021 table (counts include the non-measurement key columns).
summary = pd.DataFrame(
    [
        [
            assay,
            twenty_data[assay].shape[1],
            twentyone_data[assay].shape[1],
            twentyone_data['{}_harmonized'.format(assay)].shape[1],
        ]
        for assay in assays
    ],
    columns=['Assay', '2020', '2021', 'Harmonized'],
)

In [58]:
# Display the summary table ordered by assay name.
summary.sort_values('Assay')

Unnamed: 0,Assay,2020,2021,Harmonized
0,abtiters,25,22,20
1,cytof,27,44,23
2,olink,258,47,28
3,rnaseq,58349,58349,11589
