### Notebook
- Creates dataframe ready for input into R gamm4/mgcv::gam functions for both discovery and validation cohorts

In [10]:
import pandas as pd
import numpy as np
import os

### 1. Discovery cohort
- TCGA

In [11]:
# variables
# Tags that are substituted into the file-name templates of the (currently commented-out)
# confident-patient loads further down.
MSS_only = 'MSS_only'
remove_sex_specific = 'remove_sex_specific.'  # the trailing '.' is part of the tag: the template is '{}.{}tsv'
# NOTE(review): not read by any cell visible in this notebook — confirm it is still needed.
use_expression = True

##### 1A. Load data
- Use confidently typed patients 
- Use only patients with MHC-I and MHC-II affinities (some types not compatible with NetMHCpan)
- Add sex and age data

In [12]:
# Driver mutation IDs read as strings; they become the columns of the binary mutation matrix below.
driver_mut_ids = np.loadtxt('../data/driver_mut_IDs.txt', dtype=str)

In [13]:
# load patients
# (previous per-class loads, disabled; they are the only users of MSS_only / remove_sex_specific)
# confident_patients_I = pd.read_csv('../generated_data/confident_patient_I.age_sex_disease.{}.{}tsv'.format(MSS_only, remove_sex_specific), sep='\t', index_col=0)
# confident_patients_II = pd.read_csv('../generated_data/confident_patient_II.age_sex_disease.{}.{}tsv'.format(MSS_only, remove_sex_specific), sep='\t', index_col=0)
# No index_col is given, so the patient barcode stays in a regular column named 'index'.
# NOTE(review): the preview suggests one row per (patient, driver mutation) pair — confirm.
comprehensive_patient_df = pd.read_csv('../data/comprehensive_patient_df.tsv', sep='\t')
comprehensive_patient_df.head(2)

Unnamed: 0,index,PHBR-II_score,PHBR-I_score,Reference_Allele_readcounts,Total_readcounts,Tumor_Seq_Allele1_readcounts,Tumor_Seq_Allele2_readcounts,age_at_initial_pathologic_diagnosis,confidently_typed_MHC-I,confidently_typed_MHC-II,disease,driver_mutation,gender,num_tools_that_confirm,VAF,VAF_percentile_rank,t_alt_count
0,TCGA-02-0003,64.73698128865948,4.564100226222489,,,,,50.0,True,True,TCGA-GBM,M_TP53_R282W,MALE,4,0.444254,42.500799,48.0
1,TCGA-02-0003,23.79529180594853,0.5903186887087578,,,,,50.0,True,True,TCGA-GBM,M_PIK3R1_G376R,MALE,4,0.279602,89.727463,26.0


In [14]:
# Use only expressed mutations
print('Using expression')

# Binary matrix (patients x driver mutations) flagging each patient's FIRST expressed driver.
# A driver counts as expressed when 'Tumor_Seq_Allele2_readcounts' >= min_alt_reads; rows with a
# missing read count fail that comparison and are ignored.
min_alt_reads = 5

# Every patient gets a row (all zeros when none of their drivers passes the read filter),
# in order of first appearance in the input table.
patient_id_list = comprehensive_patient_df['index'].unique().tolist()

# Filter the whole table once instead of re-scanning it for every patient, then keep only the
# first qualifying row of each patient (same row the per-patient `.iloc[[0]]` used to pick).
expressed = comprehensive_patient_df[comprehensive_patient_df['Tumor_Seq_Allele2_readcounts'] >= min_alt_reads]
first_driver = (expressed.drop_duplicates(subset='index', keep='first')
                         .set_index('index')['driver_mutation']
                         .reindex(patient_id_list))  # patients without an expressed driver become NaN

# Broadcast comparison: cell (p, m) is 1 when patient p's first expressed driver equals driver ID m.
# NaN never equals a driver ID, so those patients keep an all-zero row.
first_driver_col = np.asarray(first_driver.values, dtype=object).reshape(-1, 1)
driver_id_row = np.asarray(driver_mut_ids, dtype=object).reshape(1, -1)
binary_mut_matrix = (first_driver_col == driver_id_row).astype(np.int64)

# save
mut = pd.DataFrame(binary_mut_matrix, index=patient_id_list, columns=driver_mut_ids)

print(len(mut))

Using expression
5234


In [15]:
# Distinct patient IDs that have a row in the binary mutation matrix.
all_patients = mut.index.unique()
print(all_patients.size)

5234


In [16]:
# subset phbr affinities
# Read the class I and class II affinity tables (first column used as the index) and keep only
# the rows whose index is one of the patients in the mutation matrix.
affinity_paths = ('../data/driver_mut.class_i.affinities.tsv.gz',
                  '../data/driver_mut.class_ii.affinities.tsv.gz')

affinity_tables = []
for affinity_path in affinity_paths:
    affinity_table = pd.read_csv(affinity_path, sep='\t', index_col=0)
    affinity_tables.append(affinity_table[affinity_table.index.isin(all_patients)])

phbrI, phbrII = affinity_tables

len(phbrI), len(phbrII)

(5233, 3789)

In [19]:
# Patients that have class I affinities, class II affinities AND a row in the mutation matrix.
intersect_patients = list(set(phbrI.index.values).intersection(set(phbrII.index.values).intersection(set(mut.index.values))))

# Restrict ALL three tables to the shared patients. Previously only `mut` was filtered, so
# `phbrI` / `phbrII` kept extra rows and the flattened model vectors built from them later
# would not have the same length as the ones built from `mut`.
mut = mut[mut.index.isin(intersect_patients)]
phbrI = phbrI[phbrI.index.isin(intersect_patients)]
phbrII = phbrII[phbrII.index.isin(intersect_patients)]

# The later cells align the tables purely by sorted position, so they must have identical row counts.
assert len(mut) == len(phbrI) == len(phbrII), \
    'row counts differ after intersecting patients: mut={}, phbrI={}, phbrII={}'.format(len(mut), len(phbrI), len(phbrII))

print('{} patients in both I and II'.format(len(intersect_patients)))

3788 patients in both I and II


In [20]:
# binarize gender
tcga_clinical = pd.read_csv('../data/all_clinical_tcga.txt.gz', sep='\t', index_col='bcr_patient_barcode')

# Clinical rows of the retained patients, selected once and shared by the three frames below.
clinical_subset = tcga_clinical[tcga_clinical.index.isin(intersect_patients)]

# Explicit .copy() before assigning a column: writing into a boolean-indexed slice is what
# triggers pandas' SettingWithCopyWarning.
sex = clinical_subset[['gender']].copy()
sex['gender'] = sex['gender'].replace({'FEMALE': 1, 'MALE': 0})  # FEMALE -> 1, MALE -> 0
sex = sex.sort_index()

age = clinical_subset[['age_at_initial_pathologic_diagnosis']].copy()
# values that cannot be parsed as numbers become NaN ...
age['age'] = pd.to_numeric(age['age_at_initial_pathologic_diagnosis'], errors='coerce')
# ... and those rows are dropped from `age` only.
# NOTE(review): `sex` and `disease` keep such patients, so any dropped row leaves `age` shorter
# than the other tables — confirm every retained patient has a numeric age.
age = age.dropna().sort_index()

disease = clinical_subset[['disease']].sort_index()

##### 1B. Collapse into single dataframe
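- Use mutations whose count in the binary mutation matrix is strictly greater than `thresh` (with `thresh = 2`, mutations seen ≥3 times)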

In [21]:
# filter mutations by threshold
thresh = 2

# Column sums of the 0/1 matrix = how many patients carry each mutation (single column named 0).
thresh_mut_df = mut.sum().to_frame()
# Strict comparison: a mutation must occur MORE than `thresh` times to be kept.
keep_mask = (thresh_mut_df[0] > thresh).values
thresh_mut = thresh_mut_df.index[keep_mask].values
print('{} mutations left'.format(thresh_mut.size))

# Apply the same column selection to the mutation matrix and both affinity tables.
mut, phbrI, phbrII = [table[thresh_mut] for table in (mut, phbrI, phbrII)]

202 mutations left


In [69]:
# make sure all sorted by patient the same way (rows) and by mutation (columns),
# because the tables are later flattened and matched purely by position
mut = mut.sort_index().sort_index(axis=1)
phbrI = phbrI.sort_index().sort_index(axis=1)
phbrII = phbrII.sort_index().sort_index(axis=1)

# log PHBR values
# np.log applied to the whole frame is the vectorised equivalent of applymap(np.log)
# (applymap calls the function once per cell and is deprecated in recent pandas).
phbrI_log = np.log(phbrI)
phbrII_log = np.log(phbrII)

In [70]:
# collapse patient IDs and sex
# Each per-patient vector is expanded into an (n_patients x n_mutations) table in which every
# column is a copy of that vector, so it lines up cell-for-cell with the mutation matrix.
n_mutation_columns = len(mut.columns)


def _repeat_per_mutation(per_patient_values):
    """Return a DataFrame with one row per patient and the value repeated across all mutation columns."""
    stacked = np.tile(per_patient_values, (n_mutation_columns, 1))  # shape: (n_mutations, n_patients)
    return pd.DataFrame(stacked.T)


patients_repeated = _repeat_per_mutation(mut.index.values)
sex_repeated = _repeat_per_mutation(sex.gender.values)
age_repeated = _repeat_per_mutation(age.age_at_initial_pathologic_diagnosis.values)
disease_repeated = _repeat_per_mutation(disease.disease.values)

In [71]:
# prepare columns
y = mut.values.reshape((len(mut.index)*len(mut.columns))).tolist()
x_phbrI_log = phbrI_log.values.reshape((len(phbrI_log.index)*len(phbrI_log.columns))).tolist()
x_phbrII_log = phbrII_log.values.reshape((len(phbrII_log.index)*len(phbrII_log.columns))).tolist()

x_phbrI = phbrI.values.reshape((len(phbrI.index)*len(phbrI.columns))).tolist()
x_phbrII = phbrII.values.reshape((len(phbrII.index)*len(phbrII.columns))).tolist()

patient_ids = patients_repeated.values.reshape((len(patients_repeated.index)*len(patients_repeated.columns))).tolist()
sex = sex_repeated.values.reshape((len(sex_repeated.index)*len(sex_repeated.columns))).tolist()
age = age_repeated.values.reshape((len(age_repeated.index)*len(age_repeated.columns))).tolist()
disease = disease_repeated.values.reshape((len(disease_repeated.index)*len(disease_repeated.columns))).tolist()

In [72]:
# One row per (patient, mutation) pair: response, PHBR predictors and patient covariates.
output_df = pd.DataFrame({
    'y (has_mutation)': y,
    'log_phbrI': x_phbrI_log,
    'log_phbrII': x_phbrII_log,
    'phbrI': x_phbrI,
    'phbrII': x_phbrII,
    'patient_ids': patient_ids,
    'sex': sex,
    'age': age,
    'disease': disease,
})

# Mean-centre each predictor (plain mean over all rows); the loop order fixes the column order.
for column in ('log_phbrI', 'log_phbrII', 'sex', 'phbrI', 'phbrII'):
    output_df['centered_' + column] = output_df[column] - np.mean(output_df[column].values)

# age is taken from the raw clinical column, so convert it to a number before centring
output_df['age'] = pd.to_numeric(output_df['age'])
output_df['centered_age'] = output_df['age'] - np.mean(output_df['age'].values)

output_df.head(2)

Unnamed: 0,y (has_mutation),log_phbrI,log_phbrII,phbrI,phbrII,patient_ids,sex,age,disease,centered_log_phbrI,centered_log_phbrII,centered_sex,centered_phbrI,centered_phbrII,centered_age
0,0,4.101671,4.37053,60.441227,79.085534,TCGA-02-0047,0,78,TCGA-GBM,3.890015,1.754338,-0.40795,57.648637,56.889547,19.165272
1,0,0.790288,3.373374,2.204031,29.176802,TCGA-02-0047,0,78,TCGA-GBM,0.578631,0.757182,-0.40795,-0.58856,6.980815,19.165272


In [73]:
# save dataframe
# The threshold value is embedded in the file name; the table is written gzip-compressed
# and without the row index.
savepath = '../generated_data/gam_input.expressed_mutations.{}.csv.gz'.format(thresh)
print('Saving to {}'.format(savepath))
output_df.to_csv(savepath, index=False, compression='gzip')

Saving to ../generated_data/gam_input.expressed_mutations.2.csv.gz


### 2. Validation cohort

##### 2A. Load data
- Use only patients with MHC-I and MHC-II affinities (some types not compatible with NetMHCpan)
- Add sex and age data

In [38]:
val_dir = '../data/validation_data'
# Project name = file name minus its last two dot-separated parts, taken from every file in
# val_dir whose name contains 'PHBR'.
# sorted() makes the processing order deterministic: iterating a bare set of strings can change
# between interpreter runs, and the later per-patient de-duplication keeps the FIRST row it sees.
project_list = sorted(set('.'.join(x.split('.')[:-2]) for x in os.listdir(val_dir) if 'PHBR' in x))

In [42]:
# join affinities and mutations
# Read the three per-project tables, collect them in lists and concatenate once at the end.
# (DataFrame.append was removed in pandas 2.0, and appending inside a loop re-copies the
# accumulated frame on every iteration.)
mut_frames = []
phbr_i_frames = []
phbr_ii_frames = []

for project in project_list:
    mut_path = os.path.join(val_dir, 'binary_driver_mut.{}.matrix'.format(project))
    mut_frames.append(pd.read_csv(mut_path, sep='\t', index_col=0))

    phbr_i_path = os.path.join(val_dir, 'driver_mut.class_i.{}.affinities'.format(project))
    phbr_i_frames.append(pd.read_csv(phbr_i_path, index_col=0, sep='\t'))

    phbr_ii_path = os.path.join(val_dir, 'driver_mut.class_ii.{}.affinities'.format(project))
    phbr_ii_frames.append(pd.read_csv(phbr_ii_path, index_col=0, sep='\t'))


def _stack_projects(frames):
    """Row-concatenate per-project tables on the sorted union of their columns (empty frame if none)."""
    # pd.concat raises on an empty list; the append-based version produced an empty DataFrame
    return pd.concat(frames, sort=True) if frames else pd.DataFrame()


binary_mut_df = _stack_projects(mut_frames)
phbr_i_df = _stack_projects(phbr_i_frames)
phbr_ii_df = _stack_projects(phbr_ii_frames)

In [43]:
# filter mutations by threshold
# (disabled for the validation cohort; the original filter is kept below for reference)
# thresh = 2

# thresh_mut_df = pd.DataFrame(binary_mut_df.sum())
# thresh_mut = thresh_mut_df[thresh_mut_df[0]>thresh].index.values
# print(len(thresh_mut))

# binary_mut_df = binary_mut_df[thresh_mut]
# phbr_i_df = phbr_i_df[thresh_mut]
# phbr_ii_df = phbr_ii_df[thresh_mut]

# make sure index is string
for table in (binary_mut_df, phbr_i_df, phbr_ii_df):
    table.index = table.index.map(str)

# keep only patients present in all three tables (class I, class II and the mutation matrix);
# sex/age availability is handled in a later cell
shared_ids = set(phbr_i_df.index.values)
shared_ids = shared_ids.intersection(set(phbr_ii_df.index.values))
shared_ids = shared_ids.intersection(set(binary_mut_df.index.values))
intersect_patients = list(shared_ids)

binary_mut_df, phbr_i_df, phbr_ii_df = [
    table[table.index.isin(intersect_patients)]
    for table in (binary_mut_df, phbr_i_df, phbr_ii_df)
]
print('{} patients in both I and II'.format(len(intersect_patients)))

937 patients in both I and II


In [46]:
# gather sex and age dataframes

# load sex data and binarize
sex = pd.read_csv('../data/validation_data/patient_sex.tsv', sep='\t', index_col=0)
# .copy() so the column assignment below writes to an independent frame rather than to a
# boolean-indexed slice (which triggers SettingWithCopyWarning)
sex = sex[sex.index.isin(intersect_patients)].copy()
sex['sex'] = sex['sex'].replace({'female': 1, 'male': 0}) # female 1, male 0

# age
age = pd.read_csv('../data/validation_data/patient_age.tsv', sep='\t', index_col=0)
age = age[age.index.isin(intersect_patients)]

# drop age/sex if no data: keep only patients that have a row in BOTH tables
patients_with_both_age_sex = set(sex.index).intersection(set(age.index))

# Exact duplicate (patient, value) rows are collapsed, then both tables are sorted by patient.
# reset_index()/set_index('index') relies on the patient column being called 'index' after reset.
sex = sex[sex.index.isin(patients_with_both_age_sex)]
sex = sex.reset_index().drop_duplicates().set_index('index').sort_index()

age = age[age.index.isin(patients_with_both_age_sex)]
age = age.reset_index().drop_duplicates().set_index('index').sort_index()

In [48]:
def _one_row_per_patient(table):
    """Keep patients that have both age and sex, then keep only the first row of each patient ID."""
    with_covariates = table[table.index.isin(patients_with_both_age_sex)]
    return with_covariates.reset_index().drop_duplicates(subset=['index']).set_index('index')


binary_mut_df = _one_row_per_patient(binary_mut_df)
phbr_i_df = _one_row_per_patient(phbr_i_df)
phbr_ii_df = _one_row_per_patient(phbr_ii_df)

# all five tables should report the same number of patients
for table in (binary_mut_df, phbr_i_df, phbr_ii_df, sex, age):
    print(len(table.index.values))

937
937
937
937
937


In [49]:
# make sure all sorted by patient the same way (rows) and by mutation (columns),
# because the tables are later flattened and matched purely by position
binary_mut_df = binary_mut_df.sort_index().sort_index(axis=1)
phbr_i_df = phbr_i_df.sort_index().sort_index(axis=1)
phbr_ii_df = phbr_ii_df.sort_index().sort_index(axis=1)

# log PHBR values
# np.log applied to the whole frame is the vectorised equivalent of applymap(np.log)
# (applymap calls the function once per cell and is deprecated in recent pandas).
phbr_i_log = np.log(phbr_i_df)
phbr_ii_log = np.log(phbr_ii_df)

In [50]:
# repeat patient IDs and sex
# Each per-patient vector is expanded into an (n_patients x n_mutations) table in which every
# column is a copy of that vector, so it lines up cell-for-cell with the mutation matrix.
n_val_mutation_columns = len(binary_mut_df.columns)


def _repeat_per_val_mutation(per_patient_values):
    """Return a DataFrame with one row per patient and the value repeated across all mutation columns."""
    stacked = np.tile(per_patient_values, (n_val_mutation_columns, 1))  # shape: (n_mutations, n_patients)
    return pd.DataFrame(stacked.T)


patients_repeated = _repeat_per_val_mutation(binary_mut_df.index.values)
sex_repeated = _repeat_per_val_mutation(sex['sex'].values)
age_repeated = _repeat_per_val_mutation(age['age'].values)
print(len(patients_repeated))

937


In [51]:
# prepare model vectors
def _flatten_val_table(table):
    """Flatten a DataFrame row by row (patient-major) into a plain Python list."""
    return table.values.reshape(-1).tolist()


# response and PHBR predictors (log-scale and raw)
y = _flatten_val_table(binary_mut_df)
x_phbr_i_log = _flatten_val_table(phbr_i_log)
x_phbr_ii_log = _flatten_val_table(phbr_ii_log)

x_phbr_i = _flatten_val_table(phbr_i_df)
x_phbr_ii = _flatten_val_table(phbr_ii_df)

# per-patient covariates; note that `sex` and `age` are rebound here from DataFrames to flat
# lists, so this cell cannot be re-run without re-running the cells above
patient_ids = _flatten_val_table(patients_repeated)
sex = _flatten_val_table(sex_repeated)
age = _flatten_val_table(age_repeated)

In [52]:
# sanity check: the model vectors must all have the same length
for model_vector in (y, x_phbr_i_log, age, sex):
    print(len(model_vector))

952929
952929
952929
952929


In [55]:
# One row per (patient, mutation) pair: response, PHBR predictors and patient covariates.
val_output_df = pd.DataFrame({'y (has_mutation)': y, 'log_phbr_i': x_phbr_i_log, 'log_phbr_ii': x_phbr_ii_log, 
                              'phbr_i': x_phbr_i, 'phbr_ii': x_phbr_ii, 
                              'patient_ids': patient_ids, 'sex': sex, 'age': age})

# Mean-centre the predictors. The log-PHBR columns use the plain mean; sex and age use nanmean,
# so missing sex/age values are ignored when computing the centre.
val_output_df['centered_log_phbr_i'] = val_output_df['log_phbr_i'] - np.mean(val_output_df['log_phbr_i'].values)
val_output_df['centered_log_phbr_ii'] = val_output_df['log_phbr_ii'] - np.mean(val_output_df['log_phbr_ii'].values)
val_output_df['centered_sex'] = val_output_df['sex'] - np.nanmean(val_output_df['sex'].values)
val_output_df['centered_age'] = val_output_df['age'] - np.nanmean(val_output_df['age'].values)

# The file name has no placeholder, so the former `.format(thresh)` was a no-op that only made
# this cell depend on `thresh` from the discovery section; it has been removed.
val_output_df.to_csv('../generated_data/gam_input.validation.all.csv.gz',
                     index=False, compression='gzip')
val_output_df.head(2)

Unnamed: 0,y (has_mutation),log_phbr_i,log_phbr_ii,phbr_i,phbr_ii,patient_ids,sex,age,centered_log_phbr_i,centered_log_phbr_ii,centered_sex,centered_age
0,0,0.436009,1.22079,1.546523,3.389865,1,1.0,48.0,0.107013,-1.411619,0.580128,-11.021324
1,0,1.096289,1.008966,2.993039,2.742764,1,1.0,48.0,0.767293,-1.623443,0.580128,-11.021324
