### Notebook
- Generates dataframes with patient barcode, driver mutation ID, driver mutation affinity, and patient sex and age for easy processing.
- Choose between excluding/including:
    - MSI-H patients
    - Patients with sex-specific cancers
    - Expressed-only driver mutations

In [1]:
import pandas as pd
import numpy as np
import os

# Global variables

In [2]:
# Cohort selection. 'MSS_only' keeps only patients listed in the MSS patient file
# (i.e. drops MSI-H patients); 'MSS_and_MSI' skips that filter.
MSS = 'MSS_only'

# Sex-specific cancers. 'remove_sex_specific' drops patients whose disease is in the
# sex-specific list; 'kept_sex_specific' skips that filter.
RM_SEX_SPECIFIC = 'kept_sex_specific'

# When True, the expressed/unexpressed driver-mutation tables are loaded.
EXPRESSED_MUT_ONLY = False

# Fail fast on typos: both strings are compared by equality further down and are also
# interpolated into the output filenames, so an unrecognised value would silently skip
# a filter while still producing a file labelled with that value.
if MSS not in ('MSS_only', 'MSS_and_MSI'):
    raise ValueError("MSS must be 'MSS_only' or 'MSS_and_MSI', got {!r}".format(MSS))
if RM_SEX_SPECIFIC not in ('remove_sex_specific', 'kept_sex_specific'):
    raise ValueError("RM_SEX_SPECIFIC must be 'remove_sex_specific' or 'kept_sex_specific', "
                     "got {!r}".format(RM_SEX_SPECIFIC))

In [3]:
# Barcodes of confidently typed patients, one array per MHC class
# (the first CSV column is used as the index and only the index is kept).
confident_patients_I = pd.read_csv(
    '../data/confidently_typed_patients.class_i.csv', index_col=0
).index.values
confident_patients_II = pd.read_csv(
    '../data/confidently_typed_patients.class_ii.csv', index_col=0
).index.values

# Driver-mutation affinity tables; downstream cells look rows up by patient barcode
# and columns by mutation ID.
driver_phbr_affinities_I = pd.read_csv(
    '../data/driver_mut.class_i.affinities.tsv.gz', sep='\t', index_col=0
)
driver_phbr_affinities_II = pd.read_csv(
    '../data/driver_mut.class_ii.affinities.tsv.gz', sep='\t', index_col=0
)

# Report the number of distinct confidently typed patients for each class.
print('{} confident MHC-I, {} MHC-II patients'.format(
    *(len(set(patients)) for patients in (confident_patients_I, confident_patients_II))
))

6545 confident MHC-I, 7567 MHC-II patients


In [4]:
# load expressed patients/mutations
# Reads the expressed and unexpressed driver-mutation tables (first CSV column becomes
# the index), but only when EXPRESSED_MUT_ONLY is True.
# NOTE(review): later cells use exp_mut_df unconditionally, so with
# EXPRESSED_MUT_ONLY = False a fresh "Restart & Run All" stops with a NameError.
# The intended input for the non-expressed case is not visible here -- TODO confirm.
if EXPRESSED_MUT_ONLY:
    print('Loading mutation expression info')
    exp_mut_df = pd.read_csv('../data/expressed_driver_mutations.csv', index_col=0)
    # NOTE(review): unexp_mut_df is not referenced by any cell visible in this notebook.
    unexp_mut_df = pd.read_csv('../data/unexpressed_driver_mutations.csv', index_col=0)

## MHC-I
- Create expressed dataframe with patients and their respective driver mutation affinities

In [5]:
# create expressed PHBR df
# One score per row of exp_mut_df: the affinity table is indexed by patient barcode
# (rows) and by the mutation named in the 'exp_mutations' column (columns).
# NOTE: exp_mut_df is only defined when EXPRESSED_MUT_ONLY is True.
exp_phbrI_scores = [driver_phbr_affinities_I.loc[patient, mutation]
                    for patient, mutation in zip(exp_mut_df.index.values,
                                                 exp_mut_df['exp_mutations'].values)]
exp_phbrI_df = pd.DataFrame({'scores': exp_phbrI_scores}, index=exp_mut_df.index)

# retain confidently typed patients only
# .copy() makes the filtered result an independent frame, so the column assignment
# below does not write into a slice (avoids pandas' SettingWithCopyWarning).
exp_phbrI_df = exp_phbrI_df[exp_phbrI_df.index.isin(confident_patients_I)].copy()

# add disease column
# Build a barcode -> disease lookup without in-place mutation, so re-running the cell
# always starts from the file contents.
barcode_disease_df = (
    pd.read_csv('../data/uuid_barcode_map.txt.gz',
                sep='\t', usecols=['barcode', 'disease'])
    .drop_duplicates()
    .set_index('barcode')
)

# The disease string is split on '-' and its second token is kept (e.g. 'LUSC').
exp_phbrI_df['disease'] = [barcode_disease_df.loc[x, 'disease'].split('-')[1]
                           for x in exp_phbrI_df.index.values]
exp_phbrI_df.head(2)

Unnamed: 0,scores,disease
TCGA-85-8580,0.120879,LUSC
TCGA-A5-A0GV,3.524218,UCEC


In [11]:
# load clinical data
tcga_clinical = pd.read_csv('../data/all_clinical_tcga.txt.gz', sep='\t', index_col='bcr_patient_barcode')
# Non-numeric ages are coerced to NaN and those rows are removed, so "clinical info"
# below means "has a clinical record with a numeric age at diagnosis".
tcga_clinical['age_at_initial_pathologic_diagnosis'] = pd.to_numeric(
    tcga_clinical['age_at_initial_pathologic_diagnosis'], errors='coerce')
tcga_clinical = tcga_clinical.dropna(subset=['age_at_initial_pathologic_diagnosis'])

# how many patients have clinical info?
# Counts are over distinct barcodes because a patient can appear on several rows.
before_count = len(set(exp_phbrI_df.index.values))
# .copy() makes the filtered result an independent frame, so the column assignments
# below do not write into a slice (avoids pandas' SettingWithCopyWarning).
exp_phbrI_df = exp_phbrI_df[exp_phbrI_df.index.isin(tcga_clinical.index)].copy()
after_count = len(set(exp_phbrI_df.index.values))
print('Dropped {} patients - no clinical info'.format(before_count-after_count))

# add sex/age columns
exp_phbrI_df['age'] = [tcga_clinical.loc[x, 'age_at_initial_pathologic_diagnosis']
                       for x in exp_phbrI_df.index.values]
exp_phbrI_df['sex'] = [tcga_clinical.loc[x, 'gender'].lower() for x in exp_phbrI_df.index.values]
exp_phbrI_df.head(2)

In [7]:
# Optional MSI-H exclusion: only applied for the 'MSS_only' configuration.
if MSS == 'MSS_only':
    # Barcodes listed in the MSS patient file, read as strings.
    mss_patients = np.loadtxt('../data/mss_tcga_patients.txt', dtype=str)
    is_mss = exp_phbrI_df.index.isin(mss_patients)

    # Distinct patients before and after the filter (a patient may span several rows).
    before_count = len(exp_phbrI_df.index.unique())
    exp_phbrI_df = exp_phbrI_df[is_mss]
    after_count = len(exp_phbrI_df.index.unique())

    print('Dropped {} MSI-H patients'.format(before_count - after_count))

Dropped 194 MSI-H patients


In [8]:
# Optional exclusion of sex-specific cancers: only applied for 'remove_sex_specific'.
if RM_SEX_SPECIFIC == 'remove_sex_specific':
    sex_specific_diseases = ['UCS', 'PRAD', 'BRCA', 'UCEC', 'CESC', 'TGCT', 'OV']
    is_sex_specific = exp_phbrI_df['disease'].isin(sex_specific_diseases)

    # Distinct patients before and after the filter (a patient may span several rows).
    before_count = len(exp_phbrI_df.index.unique())
    exp_phbrI_df = exp_phbrI_df[~is_sex_specific]
    after_count = len(exp_phbrI_df.index.unique())

    print('Dropped {} patients with sex-specific diseases'.format(before_count - after_count))

In [9]:
# save to file
# The filename encodes the MSS and RM_SEX_SPECIFIC settings (EXPRESSED_MUT_ONLY is not
# part of the name).
savepath = '../generated_data/confident_patient_I.age_sex_disease.{}.{}.tsv'.format(MSS, RM_SEX_SPECIFIC)
# Create the output directory on a fresh checkout instead of failing inside to_csv.
os.makedirs(os.path.dirname(savepath), exist_ok=True)
exp_phbrI_df.to_csv(savepath, sep='\t')
# Report success only after the write has actually completed.
print('Saved to {}'.format(savepath))

Saved to ../generated_data/confident_patient_I.age_sex_disease.MSS_only.kept_sex_specific.tsv


## MHC-II
- Create expressed dataframe with patients and their respective driver mutation affinities
- Some patients do not have MHC-II driver affinities because their typed MHC-II alleles are not compatible with NetMHCIIpan.

In [None]:
# Intentionally empty.
# This cell used to hold a verbatim copy of the MHC-I score construction from the MHC-I
# section. It was never executed, and running it here would have replaced the filtered
# exp_phbrI_df with the unfiltered frame. The MHC-II frame is built in the next cell.

In [8]:
# create expressed PHBR-II df
# Patients missing from the MHC-II affinity table are skipped, so scores and barcodes
# are collected in parallel lists and the frame is built from both.
# NOTE: exp_mut_df is only defined when EXPRESSED_MUT_ONLY is True.
exp_phbrII_scores, total_patients = [], []

for patient, mutation in zip(exp_mut_df.index.values, exp_mut_df['exp_mutations'].values):
    if patient in driver_phbr_affinities_II.index:
        exp_phbrII_scores.append(driver_phbr_affinities_II.loc[patient, mutation])
        total_patients.append(patient)

exp_phbrII_df = pd.DataFrame({'scores': exp_phbrII_scores}, index=total_patients)

# retain confidently typed patients only
# .copy() makes the filtered result an independent frame, so the column assignment
# below does not write into a slice (avoids pandas' SettingWithCopyWarning).
exp_phbrII_df = exp_phbrII_df[exp_phbrII_df.index.isin(confident_patients_II)].copy()

# add disease column
# Build a barcode -> disease lookup without in-place mutation, so re-running the cell
# always starts from the file contents.
barcode_disease = (
    pd.read_csv('../data/uuid_barcode_map.txt.gz',
                sep='\t', usecols=['barcode', 'disease'])
    .drop_duplicates()
    .set_index('barcode')
)

# The disease string is split on '-' and its second token is kept (e.g. 'UCEC').
exp_phbrII_df['disease'] = [barcode_disease.loc[x, 'disease'].split('-')[1]
                            for x in exp_phbrII_df.index.values]
exp_phbrII_df.head(2)

Unnamed: 0,scores,disease
TCGA-A5-A0GV,11.085044,UCEC
TCGA-A5-A0GV,2.904973,UCEC


In [12]:
# how many patients have clinical info?
# Relies on tcga_clinical loaded in the MHC-I section. Counts are over distinct
# barcodes because a patient can appear on several rows.
before_count = len(set(exp_phbrII_df.index.values))
# .copy() makes the filtered result an independent frame, so the column assignments
# below do not write into a slice (avoids pandas' SettingWithCopyWarning).
exp_phbrII_df = exp_phbrII_df[exp_phbrII_df.index.isin(tcga_clinical.index)].copy()
after_count = len(set(exp_phbrII_df.index.values))
print('Dropped {} patients - no clinical info'.format(before_count-after_count))

# add age/sex column
exp_phbrII_df['age'] = [tcga_clinical.loc[x, 'age_at_initial_pathologic_diagnosis']
                        for x in exp_phbrII_df.index.values]
exp_phbrII_df['sex'] = [tcga_clinical.loc[x, 'gender'].lower() for x in exp_phbrII_df.index.values]
exp_phbrII_df.head(2)

Dropped 28 patients - no clinical info


Unnamed: 0,scores,disease,age,sex
TCGA-A5-A0GV,11.085044,UCEC,67.0,female
TCGA-A5-A0GV,2.904973,UCEC,67.0,female


In [13]:
# Optional MSI-H exclusion for the MHC-II frame: only applied for 'MSS_only'.
if MSS == 'MSS_only':
    # Barcodes listed in the MSS patient file, read as strings.
    mss_patients = np.loadtxt('../data/mss_tcga_patients.txt', dtype=str)
    is_mss = exp_phbrII_df.index.isin(mss_patients)

    # Distinct patients before and after the filter (a patient may span several rows).
    before_count = len(exp_phbrII_df.index.unique())
    exp_phbrII_df = exp_phbrII_df[is_mss]
    after_count = len(exp_phbrII_df.index.unique())

    print('Dropped {} MSI-H patients'.format(before_count - after_count))

Dropped 141 MSI-H patients


In [14]:
# Optional exclusion of sex-specific cancers for the MHC-II frame:
# only applied for 'remove_sex_specific'.
if RM_SEX_SPECIFIC == 'remove_sex_specific':
    sex_specific_diseases = ['UCS', 'PRAD', 'BRCA', 'UCEC', 'CESC', 'TGCT', 'OV']
    is_sex_specific = exp_phbrII_df['disease'].isin(sex_specific_diseases)

    # Distinct patients before and after the filter (a patient may span several rows).
    before_count = len(exp_phbrII_df.index.unique())
    exp_phbrII_df = exp_phbrII_df[~is_sex_specific]
    after_count = len(exp_phbrII_df.index.unique())

    print('Dropped {} patients with sex-specific diseases'.format(before_count - after_count))

In [15]:
# save to file
# The filename encodes the MSS and RM_SEX_SPECIFIC settings (EXPRESSED_MUT_ONLY is not
# part of the name).
savepath = '../generated_data/confident_patient_II.age_sex_disease.{}.{}.tsv'.format(MSS, RM_SEX_SPECIFIC)
# Create the output directory on a fresh checkout instead of failing inside to_csv.
os.makedirs(os.path.dirname(savepath), exist_ok=True)
exp_phbrII_df.to_csv(savepath, sep='\t')
# Report success only after the write has actually completed.
print('Saved to {}'.format(savepath))

Saved to ../generated_data/confident_patient_II.age_sex_disease.MSS_only.kept_sex_specific.tsv
