### Notebook
- Prepares validation patient data for figure generation

In [1]:
import pandas as pd
import os

### 0. Load data

In [2]:
# Directory holding the per-project validation tables. The loading cell reads
# '<project>.PHBR_I.csv' and '<project>.PHBR_II.csv' from it for every project.
val_dir = '../data/validation_data'

# Project name = file name minus its last two dot-separated fields (table kind and
# extension). Only PHBR tables are considered, so stray directory entries such as
# '.ipynb_checkpoints' or '.DS_Store' no longer produce an empty project name that
# would make the loading loop fail on a non-existent file.
project_list = {
    fname.rsplit('.', 2)[0]
    for fname in os.listdir(val_dir)
    if fname.endswith(('.PHBR_I.csv', '.PHBR_II.csv'))
}

In [16]:
# Stack the per-project PHBR tables into one MHC-I frame and one MHC-II frame.
#
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling it
# inside a loop re-copies the accumulated frame on every iteration. Each table is
# therefore read once and the whole list is concatenated in a single pd.concat call.
def _stack_project_tables(table_kind):
    """Read '<project>.<table_kind>.csv' for every project and concatenate them.

    Projects are visited in sorted order so the row order of the result does not
    depend on set iteration order. Columns are sorted (sort=True), as in the
    original append calls. Returns an empty DataFrame when there are no projects.
    """
    tables = [
        pd.read_csv(os.path.join(val_dir, '{}.{}.csv'.format(project, table_kind)),
                    index_col=0)
        for project in sorted(project_list)
    ]
    if not tables:
        # pd.concat raises on an empty list; the original loop left an empty frame.
        return pd.DataFrame()
    return pd.concat(tables, sort=True)


phbr_i_df = _stack_project_tables('PHBR_I')
phbr_ii_df = _stack_project_tables('PHBR_II')

phs001451.v1.p1.c1_PhaseIMelanomaVac
['Pt1' 'Pt1' 'Pt2' 'Pt2' 'Pt2' 'Pt2' 'Pt3' 'Pt3' 'Pt4' 'Pt5' 'Pt7' 'Pt7'
 'Pt8']


phs000452.v2.p1.c1_MelanomaGenomeSeq
['Pt1' 'Pt1' 'Pt2' 'Pt2' 'Pt2' 'Pt2' 'Pt3' 'Pt3' 'Pt4' 'Pt5' 'Pt7' 'Pt7'
 'Pt8' '13468' '13575' '13591' '13600' 'JWCI-WGS-11' 'JWCI-WGS-11'
 'JWCI-WGS-12' 'JWCI-WGS-14' 'JWCI-WGS-14' 'JWCI-WGS-15' 'JWCI-WGS-15'
 'JWCI-WGS-19' 'JWCI-WGS-19' 'JWCI-WGS-23' 'JWCI-WGS-25' 'JWCI-WGS-25'
 'JWCI-WGS-27' 'JWCI-WGS-27' 'JWCI-WGS-27' 'JWCI-WGS-29' 'JWCI-WGS-3'
 'JWCI-WGS-3' 'JWCI-WGS-3' 'JWCI-WGS-32' 'JWCI-WGS-35' 'JWCI-WGS-37'
 'JWCI-WGS-39' 'JWCI-WGS-39' 'JWCI-WGS-4' 'JWCI-WGS-4' 'JWCI-WGS-43'
 'JWCI-WGS-8']


phs000980.v1.p1.c1_NSCLC
['Pt1' 'Pt1' 'Pt2' 'Pt2' 'Pt2' 'Pt2' 'Pt3' 'Pt3' 'Pt4' 'Pt5' 'Pt7' 'Pt7'
 'Pt8' '13468' '13575' '13591' '13600' 'JWCI-WGS-11' 'JWCI-WGS-11'
 'JWCI-WGS-12' 'JWCI-WGS-14' 'JWCI-WGS-14' 'JWCI-WGS-15' 'JWCI-WGS-15'
 'JWCI-WGS-19' 'JWCI-WGS-19' 'JWCI-WGS-23' 'JWCI-WGS-25' 'JWCI-WGS-25'
 'JWCI-WGS-27' 'JWCI-WGS-27' 

EGA_WGS
['Pt1' 'Pt1' 'Pt2' 'Pt2' 'Pt2' 'Pt2' 'Pt3' 'Pt3' 'Pt4' 'Pt5' 'Pt7' 'Pt7'
 'Pt8' '13468' '13575' '13591' '13600' 'JWCI-WGS-11' 'JWCI-WGS-11'
 'JWCI-WGS-12' 'JWCI-WGS-14' 'JWCI-WGS-14' 'JWCI-WGS-15' 'JWCI-WGS-15'
 'JWCI-WGS-19' 'JWCI-WGS-19' 'JWCI-WGS-23' 'JWCI-WGS-25' 'JWCI-WGS-25'
 'JWCI-WGS-27' 'JWCI-WGS-27' 'JWCI-WGS-27' 'JWCI-WGS-29' 'JWCI-WGS-3'
 'JWCI-WGS-3' 'JWCI-WGS-3' 'JWCI-WGS-32' 'JWCI-WGS-35' 'JWCI-WGS-37'
 'JWCI-WGS-39' 'JWCI-WGS-39' 'JWCI-WGS-4' 'JWCI-WGS-4' 'JWCI-WGS-43'
 'JWCI-WGS-8' 'AL4602' 'AL4602' 'AU5884' 'BL3403' 'CA9903' 'CA9903'
 'DI6359' 'HE3202' 'HE3202' 'HE3202' 'KA3947' 'KA3947' 'LO3793' 'LO5004'
 'M4945' 'RI1933' 'RI1933' 'RO3338' 'RO3338' 'SC0899' 'SC6470' 'SR070761'
 'TU0428' 'ZA6965' 'BLADDER-15330_CCPM_0700691'
 'BLADDER-15330_CCPM_0700692' 'BLADDER-15330_CCPM_0700694'
 'BLADDER-15330_CCPM_0700694' 'BLCA-001' 'BLCA-002' 'BLCA-006' 'BLCA-010'
 'BLCA-012' 'DFCI_MM_2' 'HNSCC-258' 'HNSCC-287' 'HNSCC-305' 'HNSCC-323'
 'LUNG-2630' 'DM005' 'DM005' 'DM00

### 1. Add age categorical column

In [4]:
# Age boundaries used as the two inner bin edges when pd.cut builds the
# 'age_categorical_3070' column (younger / middle / older).
young_thresh, old_thresh = 52, 68

In [5]:
# Label every row of both tables with an age group. pd.cut bins are right-closed
# by default, so the groups are (0, young_thresh], (young_thresh, old_thresh] and
# (old_thresh, 200]; ages outside (0, 200] get NaN.
age_bin_edges = [0, young_thresh, old_thresh, 200]
age_bin_labels = ['younger', 'middle', 'older']

for phbr_table in (phbr_i_df, phbr_ii_df):  # MHC-I first, then MHC-II
    phbr_table['age_categorical_3070'] = pd.cut(phbr_table['age'], age_bin_edges,
                                                labels=age_bin_labels)

In [6]:
# Write both annotated tables as tab-separated files (index included).
savepath_i = '../generated_data/validation_patients_I.age_sex_disease.tsv'
savepath_ii = '../generated_data/validation_patients_II.age_sex_disease.tsv'

for annotated_table, destination in ((phbr_i_df, savepath_i), (phbr_ii_df, savepath_ii)):
    annotated_table.to_csv(destination, sep='\t')

In [7]:
# Number of rows in the combined MHC-II table.
phbr_ii_df.shape[0]

654

In [17]:
# Index labels of `l` that are absent from the MHC-II table's index.
# NOTE(review): in the visible notebook `l` is only read from disk in a cell
# further down, so a fresh top-to-bottom run reaches this cell before `l` exists.
phbr_ii_labels = phbr_ii_df.index
[label for label in l.index.values if label not in phbr_ii_labels]

['1',
 '4',
 '4',
 '5',
 '6',
 '18',
 '18',
 '20',
 '21',
 '25',
 '28',
 '33',
 '47',
 '49',
 '49',
 '50',
 '50',
 '50',
 '54',
 '54']

In [19]:
# Load the MHC-I boxplot table (tab-separated, first column as index), then show
# its shape and first rows.
# NOTE(review): this is an absolute, user-specific path; the notebook will not run
# on another machine unless the same file exists at this location.
boxplot_mhc_i_path = '/cellar/users/andreabc/Data/hla_sex/validation/clean_data/boxplot_data.MHC-I.tsv'
l = pd.read_csv(boxplot_mhc_i_path, sep='\t', index_col=0)
print(l.shape)
l.head()

(732, 7)


Unnamed: 0,PHBR_score,age,age_categorical_3070,mutation_id,project_code,sex,tumor_type
DO27765,0.985454,4.0,younger,M_TP53_S215R,,male,MALY
DO27775,5.304091,4.0,younger,M_TP53_R273H,,male,MALY
DO27819,0.283287,5.0,younger,M_TP53_P190L,,male,MALY
DO27821,3.406682,10.0,younger,M_SMARCA4_G1232S,,male,MALY
DO27833,7.790538,64.0,middle,M_TP53_R273H,,male,MALY


In [14]:
# Display the frame currently bound to `l`.
# NOTE(review): this cell's execution count (14) is lower than that of the cell
# that reads `l` from disk (19), so what is shown depends on run order — confirm
# which table is intended here.
l

Unnamed: 0,PHBR_score,age,age_categorical_3070,mutation_id,project_code,sex,tumor_type
Pt1,30.995876,26.0,younger,M_BRAF_V600E,,male,Melanoma
Pt1,2.061328,26.0,younger,M_LUM_G248E,,male,Melanoma
Pt2,13.886261,68.0,middle,M_FGFR2_R210Q,,female,Melanoma
Pt2,13.886261,68.0,middle,M_FGFR2_R210Q,,female,Melanoma
Pt2,13.886261,68.0,middle,M_FGFR2_R210Q,,female,Melanoma
Pt2,6.696794,68.0,middle,M_TP53_R248Q,,female,Melanoma
Pt3,28.391047,51.0,younger,M_NRAS_Q61K,,female,Melanoma
Pt3,56.371040,51.0,younger,M_CTNNB1_T41I,,female,Melanoma
Pt4,28.503359,56.0,middle,M_NRAS_Q61H,,male,Melanoma
Pt5,3.087546,58.0,middle,M_NRAS_G13R,,female,Melanoma
