In [1]:
%load_ext autoreload
%autoreload 2
#%matplotlib widget
#%matplotlib ipympl

#%reload_ext tensorboard
#%matplotlib qt

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from hnc_project.pytorch.run_model_torch import RunModel

from sklearn import svm, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTENC

In [33]:
clinical_dummies = {}

cancer_subsite_to_site_map = {
    'unknown': 'nos',
    'benign tumor': 'nos',
    'other': 'nos',
    'base of tongue': 'oropharynx',
    'tonsil': 'oropharynx',
    'soft palate': 'oropharynx',
    'glossopharyngeal sulcus': 'medulla oblongata',
    'major salivary gland': 'salivary gland',
    'tonsillar pillar': 'oropharynx',
    'lacrimal gland': 'orbit',
    'lip & oral cavity': 'oral cavity',
    
}


cancer_site_map = {
    'nos': 0,
    'esophagus': 1,
    'hypopharynx': 2,
    'larynx': 3,
    'nasal cavity': 4,
    'nasopharynx': 5,
    'oropharynx': 6,
    'paranasal sinus': 8,
    'salivary glands': 9,
    'sarcoma': 10,
    'skin': 11,
    'medulla oblongata': 12,
    'orbit': 13,
    'paraganglioma': 14,
    'oral cavity': 15,
}

### HNSCC clinical features

In [34]:
from hnc_project.pytorch.gen_params_torch_cfg import model_config
data_directory = '../../data/HNSCC'
patch_path = Path('../../data/HNSCC/HNSCC_Nii_222_50_50_60_Crop_v2')
data_path = Path(data_directory)
clinical_info = pd.read_csv(data_path.joinpath('Radiomics_Outcome_Prediction_in_OPC_ASRM_corrected.csv'))
clinical_info.set_index('TCIA Radiomics dummy ID of To_Submit_Final', inplace=True)
#model = RunModel(model_config) 
#model.set_data(radiomics_dir=radiomics_directory, version='image_only_noaug')

In [36]:
features_to_pull = [
    'Age at Diag',
    'Total prescribed Radiation treatment dose',
    'Cancer subsite of origin',
    'Gender',
    #'Tumor laterality',
    'T-category',
    'N-category',
    'Therapeutic Combination',
    #'AJCC Stage (7th edition)',
    #'Smoking status',
    #'Therapeutic Combination',
]
categorical_features = [
    'Gender',
    #'Tumor laterality',
    #'Cancer subsite of origin',
    'T-category',
    'N-category',
    'AJCC Stage (7th edition)',
    'Therapeutic Combination',
]

pulled_features = clinical_info[features_to_pull].copy(deep=True)
for pat in pulled_features.index:
    subsite = pulled_features.loc[pat, 'Cancer subsite of origin'].lower()
    if subsite in cancer_subsite_to_site_map.keys():
        site = cancer_subsite_to_site_map[subsite]
    else:
        site = subsite
    pulled_features.loc[pat, 'Cancer subsite of origin'] = cancer_site_map[site]
    ther_comb = pulled_features.loc[pat, 'Therapeutic Combination'].lower()
    if 'chemo' in ther_comb:
        pulled_features.loc[pat, 'Therapeutic Combination'] = 'chemo_radio'
    else:
        pulled_features.loc[pat, 'Therapeutic Combination'] = 'radio'
pulled_features[['T-category', 'N-category']] = pulled_features[['T-category', 'N-category']].astype(str)
pulled_features['Cancer subsite of origin'] = pulled_features['Cancer subsite of origin'].astype(int)

#pulled_features['N-category'] = pulled_features['N-category'].astype(str)
clinical_dummies['HNSCC'] = pd.get_dummies(pulled_features)
#model.data.clinical_features = pulled_features.loc[model.data.patients]
name_mapping = {
'Age at Diag': 'age',
'Total prescribed Radiation treatment dose': 'prescribed_dose',
'Gender_Female': 'gender_female',
'Gender_Male': 'gender_male',
#'Tumor laterality_L': 'laterality_l',
#'Tumor laterality_Midline': 'laterality_m',
#'Tumor laterality_NOS': 'laterality_nos',
#'Tumor laterality_R': 'laterality_r',
'Cancer subsite of origin': 'primary_site',
#'Cancer subsite of origin_Base of tongue': 'primary_bot',
#'Cancer subsite of origin_Glossopharyngeal sulcus': 'primary_gloss_sulc',
#'Cancer subsite of origin_NOS': 'primary_nos',
#'Cancer subsite of origin_Soft palate': 'primary_soft_palate',
#'Cancer subsite of origin_Tonsil': 'primary_tonsil',
'T-category_1': 'tcat_1',
'T-category_2': 'tcat_2',
'T-category_3': 'tcat_3',
'T-category_4': 'tcat_4',
'N-category_0': 'ncat_0',
'N-category_1': 'ncat_1',
'N-category_2': 'ncat_2',
'N-category_3': 'ncat_3',
'Therapeutic Combination_chemo_radio': 'therapy_chemo_radio',
'Therapeutic Combination_radio': 'therapy_radio',
#'Smoking status_Former': 'smoking_status_former',
#'Smoking status_Never': 'smoking_status_never',
#'Smoking status_Current': 'smoking_status_current',
#'Therapeutic Combination_Concurrent chemoradiotherapy',
#'Therapeutic Combination_Induction chemotherapy + concurrent chemoradiotherapy',
#'Therapeutic Combination_Induction chemotherapy+Radiation alone',
#'Therapeutic Combination_Radiation alone', 
'AJCC Stage (7th edition)_I': 'stage_I',
'AJCC Stage (7th edition)_II': 'stage_II', 
'AJCC Stage (7th edition)_III': 'stage_III',
'AJCC Stage (7th edition)_IV': 'stage_IV',
}
clinical_dummies['HNSCC'].rename(columns=name_mapping, inplace=True)

In [8]:
clinical_dummies['HNSCC'].to_pickle(data_path.joinpath('clinical_features_v7.pkl'))

In [38]:
clinical_dummies['HNSCC']

Unnamed: 0_level_0,age,prescribed_dose,primary_site,gender_female,gender_male,tcat_1,tcat_2,tcat_3,tcat_4,ncat_0,ncat_1,ncat_2,ncat_3,therapy_chemo_radio,therapy_radio
TCIA Radiomics dummy ID of To_Submit_Final,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
HNSCC-01-0003,48,66.00,6,False,True,False,True,False,False,False,False,True,False,True,False
HNSCC-01-0024,80,66.00,6,True,False,False,False,True,False,False,False,True,False,False,True
HNSCC-01-0026,59,69.96,6,False,True,False,True,False,False,False,False,True,False,True,False
HNSCC-01-0030,63,69.96,6,False,True,False,False,False,True,False,True,False,False,True,False
HNSCC-01-0033,44,70.00,0,True,False,False,False,True,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HNSCC-01-0626,60,66.00,6,False,True,False,True,False,False,False,False,True,False,True,False
HNSCC-01-0627,65,70.00,6,False,True,True,False,False,False,False,False,True,False,False,True
HNSCC-01-0628,69,70.00,6,True,False,False,False,False,True,False,False,True,False,True,False
HNSCC-01-0629,63,70.00,6,True,False,False,False,False,True,False,False,True,False,True,False


In [37]:
len(clinical_dummies['HNSCC'].columns)

15

### UTSW clinical features

In [39]:
from hnc_project.pytorch.gen_params_torch_cfg import model_config
data_path = Path('../../data/UTSW_HNC')
patch_path = Path('../../data/UTSW_HNC/Nii_222_50_50_60_Crop')
dicom_path = Path('../../data/UTSW_HNC/dicom')
dicom_free_path = Path('../../data/UTSW_HNC/dicom_free')
clinical_info = pd.read_excel(data_path.joinpath('final_list_clinical.xlsx'))
clinical_info['IDA'] = clinical_info['IDA'].astype(str)
clinical_info.set_index('IDA', inplace=True)
#model = RunModel(model_config) 
#model.set_data(radiomics_dir=radiomics_directory, version='image_only_noaug')

In [40]:
features_to_pull = [
    #'Age',
    'Planned Dose (cGy)',
    'Gender',
    #'Laterality',
    'Primary site',
    'TNM Clinical T',
    'TNM Clinical N',
    #'TNM Clinical Stage Group',
    'RX Summary-Chemo',
    #'AJCC Stage (7th edition)',
    #'Therapeutic Combination',
]
categorical_features = [
    'Gender',
    #'Laterality',
    #'Primary site',
    'TNM Clinical T',
    'TNM Clinical N',
    'RX Summary-Chemo', 
    #'AJCC Stage (7th edition)',
    #'Therapeutic Combination',
]

pulled_features = clinical_info[features_to_pull].copy(deep=True)
treatment_start = pd.to_datetime(clinical_info['Treatment Start Date'], format='%m/%d/%Y')
age = {}

for pat in pulled_features.index:
    subsite = pulled_features.loc[pat, 'Primary site'].lower()
    if subsite in cancer_subsite_to_site_map.keys():
        site = cancer_subsite_to_site_map[subsite]
    else:
        site = subsite
    pulled_features.loc[pat, 'Primary site'] = cancer_site_map[site]
    tnm_t = ''.join(filter(str.isdigit, str(pulled_features.loc[pat, 'TNM Clinical T'])))
    tnm_n = ''.join(filter(str.isdigit, str(pulled_features.loc[pat, 'TNM Clinical N'])))
    rx_chemo = ''.join(filter(str.isdigit, str(pulled_features.loc[pat, 'RX Summary-Chemo'])))
    #laterality = ''.join(filter(str.isdigit, str(pulled_features.loc[pat, 'Laterality'])))
    
    if int(rx_chemo) == 0:
        rx_chemo_condense = 'radio'
    elif 0 < int(rx_chemo) <= 3:
        rx_chemo_condense = 'chemo_radio'
    else:
        rx_chemo_condense = 'radio'

    #if int(laterality) == 0:
    #    lat_condense = 'Midline'
    #elif int(laterality) == 1:
    #    lat_condense = 'R'
    #elif int(laterality) == 2:
    #    lat_condense = 'L'
    #else:
    #    lat_condense = 'NOS'
    try:    
        if int(tnm_t) == 88:
            tnm_t = 'NOS'
    except:
        tnm_t = 'NOS'
    try:
        if int(tnm_n) == 88:
            tnm_n = 'NOS'
    except:
        tnm_n = 'NOS'
        
    pulled_features.loc[pat, 'TNM Clinical T'] = tnm_t
    pulled_features.loc[pat, 'TNM Clinical N'] = tnm_n
    pulled_features.loc[pat, 'RX Summary-Chemo'] = rx_chemo_condense
    #pulled_features.loc[pat, 'Laterality'] = lat_condense
    pulled_features.loc[pat, 'Planned Dose (cGy)'] =  pulled_features.loc[pat, 'Planned Dose (cGy)'] / 100.

    try:
        meta_df = pd.read_excel(dicom_path.joinpath(pat).joinpath(f"patient.{pat}.xlsx"))
    except:
        meta_df = pd.read_excel(dicom_free_path.joinpath(pat).joinpath(f"patient.{pat}.xlsx"))

    birth_date = pd.to_datetime(meta_df['PatientBirthDate'][0], format='%Y%m%d')
    start_date = treatment_start.loc[pat]

    age[pat] = (start_date - birth_date) / pd.Timedelta("365 days")
    
pulled_features['Primary site'] = pulled_features['Primary site'].astype(int)
pulled_features['Age'] = pulled_features.index.map(age).astype(int)     
pulled_features[['TNM Clinical T', 'TNM Clinical N']] = pulled_features[['TNM Clinical T', 'TNM Clinical N']].astype(str)
clinical_dummies['UTSW'] = pd.get_dummies(pulled_features)
#model.data.clinical_features = pulled_features.loc[model.data.patients]
name_mapping = {
    'Age': 'age',
    'Planned Dose (cGy)': 'prescribed_dose',
    'Gender_Male': 'gender_male',
    'Gender_Female': 'gender_female',
    #'Laterality_L': 'laterality_l',
    'Laterality_R': 'laterality_r',
    'Laterality_Midline': 'laterality_m',
    'Laterality_NOS': 'laterality_nos',
    'Primary site': 'primary_site',
    #'Primary site_Base of tongue': 'primary_bot',
    #'Primary site_Larynx': 'primary_larynx',
    #'Primary site_Major salivary gland': 'primary_salivary',
    #'Primary site_Nasal cavity': 'primary_nasal_cavity',
    #'Primary site_Nasopharynx': 'primary_nasopharynx',
    #'Primary site_Tonsil': 'primary_tonsil',
    #'Primary site_Tonsillar pillar': 'primary_tonsil_pillar',
    'TNM Clinical T_0': 'tcat_0',
    'TNM Clinical T_1': 'tcat_1',
    'TNM Clinical T_2': 'tcat_2',
    'TNM Clinical T_3': 'tcat_3',
    'TNM Clinical T_4': 'tcat_4',
    'TNM Clinical T_NOS': 'tcat_nos',
    'TNM Clinical N_0': 'ncat_0',
    'TNM Clinical N_1': 'ncat_1',
    'TNM Clinical N_2': 'ncat_2',
    'TNM Clinical N_3': 'ncat_3',
    'TNM Clinical N_NOS': 'ncat_nos',
    'RX Summary-Chemo_chemo_radio': 'therapy_chemo_radio',
    'RX Summary-Chemo_nos': 'therapy_nos',
    'RX Summary-Chemo_radio': 'therapy_radio',
}
clinical_dummies['UTSW'].rename(columns=name_mapping, inplace=True)

In [41]:
clinical_dummies['UTSW']

Unnamed: 0_level_0,prescribed_dose,primary_site,age,gender_female,gender_male,tcat_0,tcat_1,tcat_2,tcat_3,tcat_4,tcat_nos,ncat_0,ncat_1,ncat_2,ncat_3,ncat_nos,therapy_chemo_radio,therapy_radio
IDA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
70539409,70.00,6,57,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False
72909995,69.96,6,55,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False
91414847,29.68,5,50,False,True,False,False,False,True,False,False,False,False,False,True,False,True,False
93878339,66.00,6,53,False,True,False,False,True,False,False,False,False,False,True,False,False,True,False
70366494,69.30,6,50,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94804387,70.00,6,70,False,True,False,False,True,False,False,False,False,False,True,False,False,True,False
94993694,70.00,6,69,False,True,False,False,False,True,False,False,False,False,True,False,False,True,False
95061731,70.00,6,51,False,True,False,False,True,False,False,False,False,True,False,False,False,True,False
95396233,32.00,6,61,False,True,False,False,False,True,False,False,False,False,True,False,False,True,False


### RADCURE Clinical Features

In [42]:
from hnc_project.pytorch.gen_params_torch_cfg import model_config
data_directory = '../../data/RADCURE'
patch_path = Path('../../data/RADCURE/Nii_222_50_50_60_Crop')
data_path = Path(data_directory)
clinical_info = pd.read_excel(data_path.joinpath('RADCURE-DA-CLINICAL-2.xlsx'))
clinical_info.set_index('patient_id', inplace=True)
#model = RunModel(model_config) 
#model.set_data(radiomics_dir=radiomics_directory, version='image_only_noaug')

In [43]:
features_to_pull = [
    'Age',
    'Dose',
    'Ds Site',
    'Sex',
    #'Tumor laterality',
    'T',
    'N',
    #'Stage',
    'Tx Modality',
    #'AJCC Stage (7th edition)',
    #'Smoking status',
    #'Therapeutic Combination',
]
categorical_features = [
    'Sex',
    #'Tumor laterality',
    #'Cancer subsite of origin',
    'T-category',
    'N-category',
    'AJCC Stage (7th edition)',
    'Tx Modality',
]

pulled_features = clinical_info[features_to_pull].copy(deep=True)
for pat in pulled_features.index:
    subsite = pulled_features.loc[pat, 'Ds Site'].lower()
    if subsite in cancer_subsite_to_site_map.keys():
        site = cancer_subsite_to_site_map[subsite]
    else:
        site = subsite
    pulled_features.loc[pat, 'Ds Site'] = cancer_site_map[site]
    ther_comb = pulled_features.loc[pat, 'Tx Modality'].lower()
    if 'chemo' in ther_comb:
        pulled_features.loc[pat, 'Tx Modality'] = 'chemo_radio'
    else:
        pulled_features.loc[pat, 'Tx Modality'] = 'radio'
        
    if pulled_features.loc[pat, 'T'] == 'nan':
        tnm_t = 'nos'
    elif pulled_features.loc[pat, 'T'] == 'Tis':
        tnm_t = '0'
    else:
        tnm_t = ''.join(filter(str.isdigit, str(pulled_features.loc[pat, 'T'])))
        
    if pulled_features.loc[pat, 'N'] == 'nan':
        tnm_n = 'nos' 
    else:
        tnm_n = ''.join(filter(str.isdigit, str(pulled_features.loc[pat, 'N'])))

    if tnm_t == '':
        tnm_t = 'nos'
    if tnm_n == '':
        tnm_n = 'nos'
    pulled_features.loc[pat, 'T'] = tnm_t
    pulled_features.loc[pat, 'N'] = tnm_n
        
pulled_features[['T', 'N']] = pulled_features[['T', 'N']].astype(str)
pulled_features['Ds Site'] = pulled_features['Ds Site'].astype(int)

#pulled_features['N-category'] = pulled_features['N-category'].astype(str)
clinical_dummies['RADCURE'] = pd.get_dummies(pulled_features)
#model.data.clinical_features = pulled_features.loc[model.data.patients]
name_mapping = {
'Age at Diag': 'age',
'Age': 'age',
'Dose': 'prescribed_dose',
'Total prescribed Radiation treatment dose': 'prescribed_dose',
'Gender_Female': 'gender_female',
'Gender_Male': 'gender_male',
'Sex_Female': 'gender_female',
'Sex_Male': 'gender_male',
'Tumor laterality_L': 'laterality_l',
'Tumor laterality_Midline': 'laterality_m',
'Tumor laterality_NOS': 'laterality_nos',
'Tumor laterality_R': 'laterality_r',
'Cancer subsite of origin': 'primary_site',
'Ds Site': 'primary_site',
#'Cancer subsite of origin_Base of tongue': 'primary_bot',
#'Cancer subsite of origin_Glossopharyngeal sulcus': 'primary_gloss_sulc',
#'Cancer subsite of origin_NOS': 'primary_nos',
#'Cancer subsite of origin_Soft palate': 'primary_soft_palate',
#'Cancer subsite of origin_Tonsil': 'primary_tonsil',
'T_0': 'tcat_0',
'T_1': 'tcat_1',
'T_2': 'tcat_2',
'T_3': 'tcat_3',
'T_4': 'tcat_4',
'T_nos': 'tcat_nos',
'N_0': 'ncat_0',
'N_1': 'ncat_1',
'N_2': 'ncat_2',
'N_3': 'ncat_3',
'N_nos': 'ncat_nos',
'Therapeutic Combination_chemo_radio': 'therapy_chemo_radio',
'Therapeutic Combination_radio': 'therapy_radio',
'Tx Modality_chemo_radio': 'therapy_chemo_radio',
'Tx Modality_radio': 'therapy_radio',
#'Smoking status_Former': 'smoking_status_former',
#'Smoking status_Never': 'smoking_status_never',
#'Smoking status_Current': 'smoking_status_current',
#'Therapeutic Combination_Concurrent chemoradiotherapy',
#'Therapeutic Combination_Induction chemotherapy + concurrent chemoradiotherapy',
#'Therapeutic Combination_Induction chemotherapy+Radiation alone',
#'Therapeutic Combination_Radiation alone', 
#'AJCC Stage (7th edition)_I',
#'AJCC Stage (7th edition)_II', 
#'AJCC Stage (7th edition)_III',
#'AJCC Stage (7th edition)_IV',
}
clinical_dummies['RADCURE'].rename(columns=name_mapping, inplace=True)

In [44]:
clinical_dummies['RADCURE']

Unnamed: 0_level_0,age,prescribed_dose,primary_site,gender_female,gender_male,tcat_0,tcat_1,tcat_2,tcat_3,tcat_4,tcat_nos,ncat_0,ncat_1,ncat_2,ncat_3,ncat_nos,therapy_chemo_radio,therapy_radio
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
RADCURE-0005,62.6,60.0,6,True,False,False,False,False,False,True,False,False,False,True,False,False,False,True
RADCURE-0006,87.3,51.0,3,False,True,False,True,False,False,False,False,True,False,False,False,False,False,True
RADCURE-0007,49.9,64.0,6,False,True,False,False,False,True,False,False,False,False,True,False,False,False,True
RADCURE-0009,72.3,70.0,0,False,True,True,False,False,False,False,False,False,False,True,False,False,False,True
RADCURE-0010,59.7,64.0,6,True,False,False,False,False,False,True,False,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RADCURE-4126,58.3,70.0,6,False,True,False,False,True,False,False,False,True,False,False,False,False,False,True
RADCURE-4127,52.4,70.0,6,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False
RADCURE-4128,71.3,70.0,6,False,True,False,False,True,False,False,False,True,False,False,False,False,False,True
RADCURE-4129,53.9,70.0,6,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False


In [53]:
#combined_clinical = pd.concat([clinical_dummies['HNSCC'], clinical_dummies['UTSW']])
combined_clinical = pd.concat(reversed([*clinical_dummies.values()]))
combined_clinical.fillna(False, inplace=True)
clinical_sorted = pd.concat([combined_clinical.iloc[:,:3], combined_clinical.iloc[:,3:].reindex(sorted(combined_clinical.columns[3:]), axis=1)], axis=1) 
#combined_clinical.to_pickle(data_path.joinpath('clinical_features_v2.pkl'))
clinical_sorted.to_pickle(data_path.joinpath('clinical_features_sorted_v9.pkl'))

In [54]:
combined_clinical

Unnamed: 0,age,prescribed_dose,primary_site,gender_female,gender_male,tcat_0,tcat_1,tcat_2,tcat_3,tcat_4,tcat_nos,ncat_0,ncat_1,ncat_2,ncat_3,ncat_nos,therapy_chemo_radio,therapy_radio
RADCURE-0005,62.6,60.0,6,True,False,False,False,False,False,True,False,False,False,True,False,False,False,True
RADCURE-0006,87.3,51.0,3,False,True,False,True,False,False,False,False,True,False,False,False,False,False,True
RADCURE-0007,49.9,64.0,6,False,True,False,False,False,True,False,False,False,False,True,False,False,False,True
RADCURE-0009,72.3,70.0,0,False,True,True,False,False,False,False,False,False,False,True,False,False,False,True
RADCURE-0010,59.7,64.0,6,True,False,False,False,False,False,True,False,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HNSCC-01-0626,60.0,66.0,6,False,True,False,False,True,False,False,False,False,False,True,False,False,True,False
HNSCC-01-0627,65.0,70.0,6,False,True,False,True,False,False,False,False,False,False,True,False,False,False,True
HNSCC-01-0628,69.0,70.0,6,True,False,False,False,False,False,True,False,False,False,True,False,False,True,False
HNSCC-01-0629,63.0,70.0,6,True,False,False,False,False,False,True,False,False,False,True,False,False,True,False


### ML on clinical only

In [5]:
gender_dummies = pd.get_dummies(model.data.clinical_features['Gender'])

In [6]:
test_y = gender_dummies.join(pd.DataFrame(model.data.y))

In [134]:
X_train, X_test, y_train, y_test = train_test_split(model.data.clinical_features, model.data.y, test_size=0.33, random_state=42, stratify=test_y)

In [8]:
sm = SMOTENC(categorical_features=categorical_features, sampling_strategy=1., random_state=42, k_neighbors=6)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [135]:
X_train = pd.get_dummies(X_train.astype({'T-category': 'str', 'N-category': 'str'}))
X_test = pd.get_dummies(X_test.astype({'T-category': 'str', 'N-category': 'str'}))

In [131]:
X_train

Unnamed: 0,Age at Diag,Total prescribed Radiation treatment dose,Gender_Female,Gender_Male,Tumor laterality_L,Tumor laterality_Midline,Tumor laterality_NOS,Tumor laterality_R,Cancer subsite of origin_Base of tongue,Cancer subsite of origin_Glossopharyngeal sulcus,...,N-category_2,N-category_3,AJCC Stage (7th edition)_I,AJCC Stage (7th edition)_II,AJCC Stage (7th edition)_III,AJCC Stage (7th edition)_IV,Therapeutic Combination_Concurrent chemoradiotherapy,Therapeutic Combination_Induction chemotherapy + concurrent chemoradiotherapy,Therapeutic Combination_Induction chemotherapy+Radiation alone,Therapeutic Combination_Radiation alone
0,0.180648,0.576291,False,True,True,False,False,False,False,False,...,False,False,False,False,True,False,False,False,True,False
1,0.303673,0.576291,True,False,False,False,False,True,True,False,...,True,False,False,False,False,True,False,True,False,False
2,-0.065403,0.576291,False,True,True,False,False,False,True,False,...,True,False,False,False,False,True,False,True,False,False
3,-1.541705,-1.117012,False,True,True,False,False,False,True,False,...,True,False,False,False,False,True,False,False,True,False
4,-2.033806,0.576291,False,True,True,False,False,False,False,False,...,True,False,False,False,False,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.549723,0.599173,False,True,True,False,False,False,True,False,...,True,False,False,False,False,True,True,False,False,False
502,-2.033806,0.607322,False,True,False,False,False,True,False,False,...,True,False,False,False,False,True,True,False,False,False
503,0.426698,-0.944838,False,True,True,False,False,False,True,False,...,True,False,False,False,False,True,False,True,False,False
504,-1.418680,0.625657,False,True,True,False,False,False,False,False,...,True,False,False,False,False,True,True,False,False,False


In [136]:
scaler = StandardScaler()
scaled = scaler.fit_transform(X_train[['Age at Diag', 'Total prescribed Radiation treatment dose']])
X_train['Age at Diag'] = scaled[:,0]
X_train['Total prescribed Radiation treatment dose'] = scaled[:,1]

X_test['Age at Diag'] = scaler.transform(X_test[['Age at Diag', 'Total prescribed Radiation treatment dose']])[:,0]
X_test['Total prescribed Radiation treatment dose'] = scaler.transform(X_test[['Age at Diag', 'Total prescribed Radiation treatment dose']])[:,0]

In [129]:
clf = RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_leaf=2)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
roc_auc = roc_auc_score(y_test.values, pred)
print(roc_auc)

0.40800000000000003


In [137]:
clf = svm.SVC(kernel='poly', degree=3, class_weight='balanced')
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
roc_auc = roc_auc_score(y_test.values, pred)
train_pred = clf.predict(X_train)
train_roc_auc = roc_auc_score(y_train.values, train_pred)

print(roc_auc)
print(train_roc_auc)


0.5925714285714285
0.9229249011857708


In [114]:
tree_clf = tree.DecisionTreeClassifier(criterion='gini', max_depth=7, min_samples_split=3, min_samples_leaf=2, class_weight='balanced')
tree_clf.fit(X_train, y_train)
pred = tree_clf.predict(X_test)
roc_auc = roc_auc_score(y_test.values, pred)
print(roc_auc)


0.5354285714285714


In [13]:
print(pred)

[ True  True False False  True  True  True  True  True  True  True  True
  True  True  True  True  True False  True  True  True  True  True False
  True  True  True False  True  True  True  True  True  True  True  True
 False False False False  True False  True  True False  True  True  True
  True False  True  True  True False False  True  True False  True False
  True False  True  True  True  True  True False False  True  True  True
  True False False False  True False False False  True  True  True  True
  True  True  True  True  True False  True False False  True  True  True
  True False  True  True  True False  True False  True  True  True False
 False False False  True  True False  True False  True False  True  True
 False False False False  True  True False  True  True False  True False]
