In [1]:
import os
import numpy as np
import pandas as pd 
import glob
from sklearn.linear_model import LinearRegression
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/cmi-pb-preds/')

# setting the data directory
data_dir = 'results/main/cmi_pb_datasets/processed/harmonized/'

# setting the output dir
output_dir = 'results/main/submissions/lr_model/'

## Loading the input data

In [2]:
# setting up dictionaries to load data and results
train_features = {}
train_outcomes = {}

test_features = {}
test_preds = {}

#### Training Features

In [3]:
# getting training files
train_files = glob.glob(os.path.join(data_dir, '*2020.day0*'))
train_files = [x for x in train_files if '2022-01-07' not in x]

In [4]:
# loading testing files
for fn in train_files:

    basename = os.path.basename(fn)
    basesplit = basename.split('.')
    assay = basesplit[0]
    
    train_features[assay] = pd.read_table(fn).dropna()

#### Training Outcomes

In [5]:
fn = os.path.join(data_dir, 'task_matrix.common_names.tsv.gz')
train_outcomes = pd.read_table(fn)

#### Testing Features

In [7]:
# getting testing files
test_files = glob.glob(os.path.join(data_dir, '*2021.day0*'))
test_files = [x for x in test_files if '2022-01-07' not in x]

In [8]:
# loading testing files
for fn in test_files:

    basename = os.path.basename(fn)
    basesplit = basename.split('.')
    assay = basesplit[0]
    
    test_features[assay] = pd.read_table(fn).dropna()

## Building Lists of Tasks Cased on Assay Type

In [9]:
abtiter_tasks = ['IgG-PT_day14', 'IgG-FHA_day14', 'IgG-PRN_day14']
cytof_tasks = ['Monocytes_day1', 'ASCs (Plasmablasts)_day7', 'CD4Tcells_day3']
rnaseq_tasks = ['CCL3_day3', 'IL-6_day3', 'NFKBIA_day7', 'XIST_day14']

## Make predictions for the Ab Titers

In [10]:
ctrain_features = train_features['abtiters']
ctest_features = test_features['abtiters']

In [11]:
for task in abtiter_tasks:
    
    print(task)

    # get the outcome vector 
    ctrain_outcome = train_outcomes[['subject_id', task]]
    
    # get the shared subjects 
    shared_subjects = set(ctrain_features.subject_id.tolist()).intersection(ctrain_outcome.subject_id)
 
    # extract the shared subjects  
    xdata = ctrain_features.loc[ctrain_features.subject_id.isin(shared_subjects)]
    ydata = ctrain_outcome.loc[ctrain_outcome.subject_id.isin(shared_subjects)]   
        
    # building the model
    lr_model = LinearRegression()
    lr_model.fit(xdata.drop('subject_id', axis=1).values, ydata.iloc[:, 1].values)
    preds = lr_model.predict(ctest_features.drop('subject_id', axis=1).values)
    
    # create the ranks df
    ranks = [ctest_features.subject_id.tolist(), np.argsort(preds)]
    ranks = list(zip(*ranks))
    ranks = pd.DataFrame(ranks, columns=['subject_id', 'rank'])
    test_preds[task] = ranks


IgG-PT_day14
IgG-FHA_day14
IgG-PRN_day14


## Make predictions for the CyTOF

In [12]:
ctrain_features = train_features['cytof']
ctest_features = test_features['cytof']

In [13]:
for task in cytof_tasks:
    
    print(task)

    # get the outcome vector 
    ctrain_outcome = train_outcomes[['subject_id', task]].dropna(subset=[task])
    
    # get the shared subjects 
    shared_subjects = set(ctrain_features.subject_id.tolist()).intersection(ctrain_outcome.subject_id)
 
    # extract the shared subjects  
    xdata = ctrain_features.loc[ctrain_features.subject_id.isin(shared_subjects)]
    ydata = ctrain_outcome.loc[ctrain_outcome.subject_id.isin(shared_subjects)]   
        
    # building the model
    lr_model = LinearRegression()
    lr_model.fit(xdata.drop('subject_id', axis=1).values, ydata.iloc[:, 1].values)
    preds = lr_model.predict(ctest_features.drop('subject_id', axis=1).values)
    
    # create the ranks df
    ranks = [ctest_features.subject_id.tolist(), np.argsort(preds)]
    ranks = list(zip(*ranks))
    ranks = pd.DataFrame(ranks, columns=['subject_id', 'rank'])
    test_preds[task] = ranks


Monocytes_day1
ASCs (Plasmablasts)_day7
CD4Tcells_day3


## Make predictions for the RNA-seq based tasks

In [16]:
ctrain_features = train_features['rnaseq']
ctest_features = test_features['rnaseq']

In [17]:
for task in rnaseq_tasks:
    
    print(task)

    ctrain_outcome = train_outcomes[['subject_id', task]]
    ctrain_outcome = ctrain_outcome.loc[ctrain_outcome.subject_id.isin(ctrain_features.subject_id.tolist())]
        
    # building the model
    lr_model = LinearRegression()
    lr_model.fit(ctrain_features.drop('subject_id', axis=1).values, ctrain_outcome.iloc[:, 1].values)
    preds = lr_model.predict(ctest_features.drop('subject_id', axis=1).values)
    
    # create the ranks df
    ranks = [ctest_features.subject_id.tolist(), np.argsort(preds)]
    ranks = list(zip(*ranks))
    ranks = pd.DataFrame(ranks, columns=['subject_id', 'rank'])
    test_preds[task] = ranks


CCL3_day3
IL-6_day3
NFKBIA_day7
XIST_day14


## Save predictions to the Excel File

In [18]:
form_fn = 'results/main/submissions/forms/2020_submission_format.tsv'
form = pd.read_table(form_fn)

In [19]:
# creating a mapper between the task names for the data and the form
task_form_mapper = {'IgG-PT_day14': 'B1.1.a) Pertussis Toxin ',
                    'IgG-FHA_day14': 'B1.1.b) FHA',       
                    'IgG-PRN_day14': 'B1.1.c) Pertactin',  
                    'Monocytes_day1': 'B.2.c) Monocytes on day 1',
                    'ASCs (Plasmablasts)_day7': 'B.2.a) Plasmablast cells on day 7',
                    'CD4Tcells_day3': 'B.2.b) CD4 TCM cells on 3 days',
                    'CCL3_day3': 'B.3.a) CCL3 on day 3',
                    'IL-6_day3': 'B.3.b) IL-6 on day 3',
                    'NFKBIA_day7': 'B.3.c) NFKBIA at day 7',
                    'XIST_day14': 'B.3.d) XIST on day 14'}

In [20]:
# filling in the form
complete_form = form.copy()

In [26]:
for (task_name, form_name) in task_form_mapper.items():
    print(task_name, '-----------------', form_name)
     
    if task_name in test_preds:
    
        cranks = test_preds[task_name]

        # locate the indexes of the subjects within the form
        form_subject_indexes = form.Subject_ID.isin(cranks.subject_id.tolist())
        form_subject_indexes = form.Subject_ID[form_subject_indexes].index.tolist()

        # update the form for the current taskname
        complete_form.loc[form_subject_indexes, form_name] = cranks['rank'].astype(int)
    

IgG-PT_day14 ----------------- B1.1.a) Pertussis Toxin 
IgG-FHA_day14 ----------------- B1.1.b) FHA
IgG-PRN_day14 ----------------- B1.1.c) Pertactin
Monocytes_day1 ----------------- B.2.c) Monocytes on day 1
ASCs (Plasmablasts)_day7 ----------------- B.2.a) Plasmablast cells on day 7
CD4Tcells_day3 ----------------- B.2.b) CD4 TCM cells on 3 days
CCL3_day3 ----------------- B.3.a) CCL3 on day 3
IL-6_day3 ----------------- B.3.b) IL-6 on day 3
NFKBIA_day7 ----------------- B.3.c) NFKBIA at day 7
XIST_day14 ----------------- B.3.d) XIST on day 14


In [27]:
complete_form

Unnamed: 0,Subject_ID,Age,Biological_Sex_at_Birth,Vaccine_Priming_Status,B1.1.a) Pertussis Toxin,B1.1.b) FHA,B1.1.c) Pertactin,B.1.2.a) IgG1 - Pertussis toxin,B.1.2.b) IgG1 - FHA,B.1.2.c) IgG4 - Pertussis toxin,B.1.2.d) IgG4 - FHA,B.2.a) Plasmablast cells on day 7,B.2.b) CD4 TCM cells on 3 days,B.2.c) Monocytes on day 1,B.3.a) CCL3 on day 3,B.3.b) IL-6 on day 3,B.3.c) NFKBIA at day 7,B.3.d) XIST on day 14
0,61,32,Female,wP,7.0,7.0,27.0,,,,,,,,29,29,1,29
1,62,25,Female,wP,2.0,2.0,16.0,,,,,,,,1,1,29,1
2,63,23,Female,wP,10.0,10.0,3.0,,,,,11.0,5.0,11.0,5,5,5,5
3,64,25,Male,wP,4.0,4.0,12.0,,,,,9.0,11.0,19.0,11,11,11,6
4,65,28,Male,wP,0.0,0.0,29.0,,,,,5.0,9.0,24.0,6,30,6,30
5,66,42,Female,wP,21.0,21.0,30.0,,,,,24.0,4.0,22.0,30,6,30,33
6,67,47,Female,wP,15.0,15.0,18.0,,,,,1.0,24.0,13.0,12,12,12,26
7,68,47,Male,wP,14.0,27.0,31.0,,,,,19.0,16.0,9.0,33,33,33,12
8,69,29,Female,wP,22.0,14.0,1.0,,,,,0.0,13.0,6.0,32,32,32,14
9,70,21,Male,aP,9.0,22.0,5.0,,,,,,,,14,14,23,21


In [28]:
outfn = os.path.join(output_dir, 'Completed_Predictions.tsv')

In [31]:
complete_form.to_csv(outfn, sep='\t', float_format='%.0f', index=False, header=True)

In [30]:
complete_form.

Unnamed: 0,Subject_ID,Age,Biological_Sex_at_Birth,Vaccine_Priming_Status,B1.1.a) Pertussis Toxin,B1.1.b) FHA,B1.1.c) Pertactin,B.1.2.a) IgG1 - Pertussis toxin,B.1.2.b) IgG1 - FHA,B.1.2.c) IgG4 - Pertussis toxin,B.1.2.d) IgG4 - FHA,B.2.a) Plasmablast cells on day 7,B.2.b) CD4 TCM cells on 3 days,B.2.c) Monocytes on day 1,B.3.a) CCL3 on day 3,B.3.b) IL-6 on day 3,B.3.c) NFKBIA at day 7,B.3.d) XIST on day 14
0,61,32,Female,wP,7.0,7.0,27.0,,,,,,,,29,29,1,29
1,62,25,Female,wP,2.0,2.0,16.0,,,,,,,,1,1,29,1
2,63,23,Female,wP,10.0,10.0,3.0,,,,,11.0,5.0,11.0,5,5,5,5
3,64,25,Male,wP,4.0,4.0,12.0,,,,,9.0,11.0,19.0,11,11,11,6
4,65,28,Male,wP,0.0,0.0,29.0,,,,,5.0,9.0,24.0,6,30,6,30
5,66,42,Female,wP,21.0,21.0,30.0,,,,,24.0,4.0,22.0,30,6,30,33
6,67,47,Female,wP,15.0,15.0,18.0,,,,,1.0,24.0,13.0,12,12,12,26
7,68,47,Male,wP,14.0,27.0,31.0,,,,,19.0,16.0,9.0,33,33,33,12
8,69,29,Female,wP,22.0,14.0,1.0,,,,,0.0,13.0,6.0,32,32,32,14
9,70,21,Male,aP,9.0,22.0,5.0,,,,,,,,14,14,23,21
