In [None]:
import pandas as pd
import numpy as np
import math as m
import random as rand
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime
from sklearn import linear_model as lm, metrics, ensemble as ens
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.svm import SVC
from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
import random

In [None]:
#PLOT RF FI FUNCTION I FOUND ONLINE
def plot_feature_importance(importance, names, model_type, name, threshold):
    """Plot feature importances as a bar chart and save the figure to disk.

    Parameters
    ----------
    importance : array-like
        Importance score of each feature.
    names : array-like
        Feature names, in the same order as ``importance``.
    model_type : str
        Prefix used in the chart title (``"<model_type> Feature Importance"``).
    name : str or path-like
        Destination passed to ``plt.savefig``.
    threshold : float
        Features whose importance is below this value are left out of the chart.
    """
    # Pair every feature name with its score in a two-column frame.
    ranked = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    })

    # Largest importance first, then keep only the rows at or above the threshold.
    ranked = ranked.sort_values(by = ['feature_importance'], ascending = False)
    ranked = ranked[ranked['feature_importance'] >= threshold]

    # Fixed-size canvas; importance on the x axis, one bar per feature name.
    plt.figure(figsize = (5, 6))
    sns.barplot(x = ranked['feature_importance'], y = ranked['feature_names'])

    # Title and axis labels, then write the figure out with a tight bounding box.
    plt.title(model_type + ' Feature Importance')
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature Names')
    plt.savefig(name, bbox_inches='tight')

# Updated Data Set Up (From George)

### Clinical Data

In [None]:
#DEFINING A FUNCTION TO UPDATE COLUMN NAMES LATER
def lower_no_space(word):
    """Turn a raw column label into a lowercase, underscore-separated name.

    Spaces and forward slashes are replaced with underscores; apostrophes,
    parentheses and question marks are removed; the result is lower-cased.
    All other characters (for example ``+``, ``%``, ``:`` or ``,``) are kept.

    Parameters
    ----------
    word : str
        Original column label.

    Returns
    -------
    str
        The normalized label, e.g. ``"LDH (U/L)"`` becomes ``"ldh_u_l"``.
    """
    # Raw strings throughout: the former '\?' pattern was an invalid escape
    # sequence in a normal string literal and triggers a warning on newer Pythons.
    word = re.sub(r'[ /]', '_', word)
    word = re.sub(r"['()?]", '', word)
    return word.lower()

In [None]:
# Load the original clinical export (converted to .csv in Google Sheets) for later use.
# header = 1: the column labels are read from the second row of the file.
df_clin = pd.read_csv("Homebase.csv", header = 1)

In [None]:
# Normalize every column label with lower_no_space, then rename the sample key
# to 'sample_id', the column the genetic data is merged on.
df_clin = (
    df_clin
    .rename(mapper = lower_no_space, axis = 1)
    .rename(columns = {'subject_sample_id': 'sample_id'})
)

In [None]:
# Preview the first rows of the renamed clinical table as a quick sanity check.
df_clin.head()

In [None]:
# The clinical table is very wide; only the columns listed here are kept.
clin_keep_cols = [
    'gender', 'race', 'country_of_residence', 'sample_id', 'ethnicity',
    'age_at_initial_diagnosis', 't', 'n', 'm', 'b',
    'predominant_lesion_type_at_diagnosis', 'lymph_node_biopsy_performed',
    'family_history_of_leukemia_lymphoma',
    'has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical',
    'cd4+:cd8+_ratio', 'total_lymphocyte_count', 'absolute_cd4+_count_per_ul',
    '%cd4+cd26-', '%cd4+cd7-', 'tcr_clonality', 'tumor_cell_cd30+',
    'large_cell_transformation', 'ldh_u_l', 'wbc_103_μl', 'rbc_106_μl',
    'hematocrit_%', 'mcv_fl', 'mchc_g_dl', 'rdw_%', 'platelet_count_103_μl',
    'segmented_neutrophil,_absolute_103_μl', 'lymphocyte,_absolute_103_μl',
    'monocytes,_absolute_103_μl', 'eosinophils,_absolute_103_μl',
    'basophils,_absolute_103_μl', 'segmented_neutrophils_%', 'lymphocytes_%',
    'monocytes_%', 'eosinophils_%', 'basophils_%',
]

# Drop every column that is not in the keep-list (drop returns a new frame).
clin_drop_cols = [col for col in df_clin.columns if col not in clin_keep_cols]
df_clin_lean = df_clin.drop(columns = clin_drop_cols)

In [None]:
# Binary-encode the Yes/No and Positive/Negative columns: the answer listed
# here becomes 1, every other value (a missing entry included) becomes 0.
clin_flag_positive = {
    'lymph_node_biopsy_performed': 'Yes',
    'family_history_of_leukemia_lymphoma': 'Yes',
    'tumor_cell_cd30+': 'Yes',
    'large_cell_transformation': 'Yes',
    'tcr_clonality': 'Positive',
    'has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical': 'Yes',
}

for flag_col, positive in clin_flag_positive.items():
    df_clin_lean[flag_col] = df_clin_lean[flag_col].apply(lambda x: 1 if x == positive else 0)

# Clinical Data updated_ Alex

In [None]:
# Load the updated clinical export (converted to .csv in Google Sheets) for later use.
# header = 1: the column labels are read from the second row of the file.
df_clin_updated = pd.read_csv("Homebase_updated.csv", header = 1)

In [None]:
# Normalize every column label with lower_no_space, then rename the sample key
# to 'sample_id', the column the genetic data is merged on.
df_clin_updated = (
    df_clin_updated
    .rename(mapper = lower_no_space, axis = 1)
    .rename(columns = {'subject_sample_id': 'sample_id'})
)

In [None]:
# Preview the first rows of the renamed, updated clinical table as a quick sanity check.
df_clin_updated.head()

In [None]:
# Compute the age at initial diagnosis from date_of_birth and date_of_initial_diagnosis.
birth_date = pd.to_datetime(df_clin_updated['date_of_birth'])
diagnosis_date = pd.to_datetime(df_clin_updated['date_of_initial_diagnosis'])
df_clin_updated['date_of_birth'] = birth_date
df_clin_updated['date_of_initial_diagnosis'] = diagnosis_date

# A plain difference of calendar years overstates the age by one whenever the
# diagnosis falls before that year's birthday, so those rows are reduced by one.
# Rows with a missing date compare as False here and stay NaN in the result.
diagnosed_before_birthday = (
    (diagnosis_date.dt.month < birth_date.dt.month)
    | (
        (diagnosis_date.dt.month == birth_date.dt.month)
        & (diagnosis_date.dt.day < birth_date.dt.day)
    )
)
df_clin_updated["age_at_initial_diagnosis"] = (
    diagnosis_date.dt.year - birth_date.dt.year - diagnosed_before_birthday.astype(int)
)

In [None]:
# Cast columns 't' and 'b' to object dtype; both are later imputed with
# 'unknown' and one-hot encoded together with the other categorical columns.
df_clin_updated = df_clin_updated.astype({'t':'object', 'b':'object'})


In [None]:
# The updated clinical table is very wide; only the columns listed here are kept.
clin_updated_keep_cols = [
    'gender', 'race', 'country_of_residence', 'sample_id', 'ethnicity',
    'age_at_initial_diagnosis', 't', 'n', 'm', 'b',
    'predominant_lesion_type_at_diagnosis', 'lymph_node_biopsy_performed',
    'family_history_of_leukemia_lymphoma',
    'has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical',
    'cd4+:cd8+_ratio', 'total_lymphocyte_count', 'absolute_cd4+_count_per_ul',
    '%cd4+cd26-', '%cd4+cd7-', 'tcr_clonality', 'tumor_cell_cd30+',
    'large_cell_transformation', 'ldh_u_l', 'wbc_103_μl', 'rbc_106_μl',
    'hematocrit_%', 'mcv_fl', 'mchc_g_dl', 'rdw_%', 'platelet_count_103_μl',
    'segmented_neutrophil,_absolute_103_μl', 'lymphocyte,_absolute_103_μl',
    'monocytes,_absolute_103_μl', 'eosinophils,_absolute_103_μl',
    'basophils,_absolute_103_μl', 'segmented_neutrophils_%', 'lymphocytes_%',
    'monocytes_%', 'eosinophils_%', 'basophils_%',
]

# Drop every column that is not in the keep-list (drop returns a new frame).
clin_updated_drop_cols = [col for col in df_clin_updated.columns if col not in clin_updated_keep_cols]
df_clin_updated_lean = df_clin_updated.drop(columns = clin_updated_drop_cols)

In [None]:
# Binary-encode the Yes/No and Positive/Negative columns: the answer listed
# here becomes 1, every other value (a missing entry included) becomes 0.
clin_updated_flag_positive = {
    'lymph_node_biopsy_performed': 'Yes',
    'family_history_of_leukemia_lymphoma': 'Yes',
    'tumor_cell_cd30+': 'Yes',
    'large_cell_transformation': 'Yes',
    'tcr_clonality': 'Positive',
    'has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical': 'Yes',
}

for flag_col, positive in clin_updated_flag_positive.items():
    df_clin_updated_lean[flag_col] = df_clin_updated_lean[flag_col].apply(lambda x: 1 if x == positive else 0)

### df_lean: Preprocessed Genetic Data

In [None]:
# Read in the preprocessed genetic data (one row per sample, keyed by 'sample_id').
df_lean = pd.read_csv ('stats_by_sample.csv')

In [None]:
#CHECK GENETIC DATA BASICS
# Preview the frame that was just loaded in this section, not the clinical table.
df_lean.head()

In [None]:
# Rewrite the genetic sample ids into the form used by the clinical table so
# the two frames can be joined on 'sample_id'.
def _to_clinical_sample_id(raw_id):
    """Map one genetic-data sample id onto its clinical-table spelling."""
    if 'WES' in raw_id:
        # Keep the first five characters and swap underscores for dashes.
        return re.sub('_', '-', raw_id[:5])
    if 'CTCL' in raw_id:
        # Strip the trailing ten characters.
        return raw_id[:-10]
    if 'SPZ' in raw_id:
        # Strip the trailing ten characters, then lower-case each dash-separated
        # part and remove its leading zeros.
        pieces = raw_id[:-10].split('-')
        return '-'.join([piece.lstrip('0').lower() for piece in pieces])
    # Any other id is left untouched.
    return raw_id


df_lean['sample_id'] = df_lean['sample_id'].apply(_to_clinical_sample_id)

# Merge (Original)

In [None]:
# Attach the clinical columns to each genetic sample. A left join keeps every
# genetic row, including those without a clinical match.
df_all = df_lean.merge(df_clin_lean, how = 'left', on = 'sample_id')

In [None]:
# Imputation: categorical clinical columns get the label 'unknown', every other
# clinical column (except the join key) gets its column mean.
clin_categorical_cols = ['race', 'gender', 'country_of_residence', 'ethnicity',
                         'predominant_lesion_type_at_diagnosis']

for col in df_clin_lean.columns:
    if col in clin_categorical_cols:
        fill_value = 'unknown'
    elif col != 'sample_id':
        fill_value = np.mean(df_all[col])
    else:
        # The join key is never imputed.
        continue
    df_all[col] = df_all[col].fillna(fill_value)

In [None]:
# One-hot encode the categorical clinical columns.
dummy_cols = ['race', 'gender', 'country_of_residence', 'ethnicity',
              'predominant_lesion_type_at_diagnosis']
df_all = pd.get_dummies(df_all, columns = dummy_cols)


# Merge (Updated)

In [None]:
# Attach the updated clinical columns to each genetic sample. A left join keeps
# every genetic row, including those without a clinical match.
df_all_updated = df_lean.merge(df_clin_updated_lean, how = 'left', on = 'sample_id')

In [None]:
# Imputation: categorical clinical columns (now including t, n, m, b) get the
# label 'unknown', every other clinical column (except the join key) gets its
# column mean.
clin_updated_categorical_cols = ['race', 'gender', 'country_of_residence', 'ethnicity',
                                 'predominant_lesion_type_at_diagnosis', 't', 'n', 'm', 'b']

for col in df_clin_updated_lean.columns:
    if col in clin_updated_categorical_cols:
        fill_value = 'unknown'
    elif col != 'sample_id':
        fill_value = np.mean(df_all_updated[col])
    else:
        # The join key is never imputed.
        continue
    df_all_updated[col] = df_all_updated[col].fillna(fill_value)

In [None]:
# One-hot encode the categorical clinical columns, including t, n, m and b.
dummy_cols_updated = ['race', 'gender', 'country_of_residence', 'ethnicity',
                      'predominant_lesion_type_at_diagnosis', 't', 'n', 'm', 'b']
df_all_updated = pd.get_dummies(df_all_updated, columns = dummy_cols_updated)


# Defining Features and Labels

In [None]:
# Scaler used below to build the standardized copy of the feature matrix.
std_scl = StandardScaler()

In [None]:
# Feature matrix: every column except the label and the sample identifier.
X = df_all.drop(columns = ['outcome', 'sample_id'], errors = 'ignore')

# Two rescaled copies of X, both computed on the full data set:
# standardized columns and unit-norm rows.
X_scaled = std_scl.fit_transform(X)
X_norm = normalize(X)

# Label frame: a one-column DataFrame holding only 'outcome'.
y = df_all.drop(columns = df_all.columns[df_all.columns != 'outcome'])

In [None]:
# Class balance of the label: relative frequency of each 'outcome' value.
df_all['outcome'].value_counts(normalize = True)

## For updated data

In [None]:
# Fresh scaler for the updated data; this rebinds the std_scl name defined earlier.
std_scl = StandardScaler()

In [None]:
# Feature matrix for the updated data: every column except the label and the
# sample identifier.
X_new = df_all_updated.drop(columns = ['outcome', 'sample_id'], errors = 'ignore')

# Two rescaled copies of X_new, both computed on the full data set:
# standardized columns and unit-norm rows.
X_new_scaled = std_scl.fit_transform(X_new)
X_new_norm = normalize(X_new)

# Label frame: a one-column DataFrame holding only 'outcome'.
y_new = df_all_updated.drop(columns = df_all_updated.columns[df_all_updated.columns != 'outcome'])

In [None]:
# Class balance of the label in the updated data: relative frequency of each 'outcome' value.
df_all_updated['outcome'].value_counts(normalize = True)

# Model Training - (Original Data)

###  Repeated Stratified K-fold / Repeated K-fold

In [None]:
# 10x repeated 3-fold cross-validation, for a more robust evaluation on a small sample.
# random_state is fixed so that every cross_val_score call using this splitter
# gets the same folds; without it, accuracy and precision are measured on
# different random partitions and the numbers change on every run.
rkf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=0)

In [None]:
# Stratified version of the 10x repeated 3-fold splitter.
from sklearn.model_selection import RepeatedStratifiedKFold

# random_state is fixed (same seed as the grid-search cells) so that accuracy
# and precision are scored on identical folds and results are reproducible.
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=0)

### Logistic Regression

In [None]:
# Logistic regression on the standardized features, scored with the
# non-stratified repeated K-fold splitter (rkf).
log = lm.LogisticRegression()
labels = y.values.ravel()

acc_scores = cross_val_score(log, X_scaled, labels, cv = rkf, scoring = 'accuracy', n_jobs = -1)
prec_scores = cross_val_score(log, X_scaled, labels, cv = rkf, scoring = 'precision', n_jobs = -1)

# Report the mean over all folds and repeats.
print('accuracy: ', acc_scores.mean())
print('precision: ', prec_scores.mean())

In [None]:
# Logistic regression on the standardized features, scored with the
# stratified repeated K-fold splitter (rskf).
log = lm.LogisticRegression()
labels = y.values.ravel()

acc_scores = cross_val_score(log, X_scaled, labels, cv = rskf, scoring = 'accuracy', n_jobs = -1)
prec_scores = cross_val_score(log, X_scaled, labels, cv = rskf, scoring = 'precision', n_jobs = -1)

# Report the mean over all folds and repeats.
print('accuracy: ', acc_scores.mean())
print('precision: ', prec_scores.mean())

### Random Forest

In [None]:
# Random forest with default settings on the unscaled features,
# scored with the non-stratified splitter (rkf).
rf = ens.RandomForestClassifier()
labels = y.values.ravel()

acc_scores = cross_val_score(rf, X, labels, cv = rkf, scoring = 'accuracy', n_jobs = -1)
prec_scores = cross_val_score(rf, X, labels, cv = rkf, scoring = 'precision', n_jobs = -1)

print('accuracy: ', acc_scores.mean())
print('precision: ', prec_scores.mean())

In [None]:
# Random forest with default settings on the unscaled features,
# scored with the stratified splitter (rskf).
rf = ens.RandomForestClassifier()
labels = y.values.ravel()

acc_scores = cross_val_score(rf, X, labels, cv = rskf, scoring = 'accuracy', n_jobs = -1)
prec_scores = cross_val_score(rf, X, labels, cv = rskf, scoring = 'precision', n_jobs = -1)

print('accuracy: ', acc_scores.mean())
print('precision: ', prec_scores.mean())

### Ridge Classifier

In [None]:
# Ridge classifier with default settings on the row-normalized features,
# scored with the stratified splitter (rskf).
rdg = lm.RidgeClassifier()
labels = y.values.ravel()

acc_scores = cross_val_score(rdg, X_norm, labels, cv = rskf, scoring = 'accuracy', n_jobs = -1)
prec_scores = cross_val_score(rdg, X_norm, labels, cv = rskf, scoring = 'precision', n_jobs = -1)

print('ridge accuracy: ', acc_scores.mean())
print('ridge precision: ', prec_scores.mean())

### Support Vector Classifier (SVC)

In [None]:
# Support vector classifier: compare the four kernels on the row-normalized
# features using the stratified splitter (rskf).
labels = y.values.ravel()

for kern in ('linear', 'poly', 'rbf', 'sigmoid'):
    svc = SVC(kernel = kern, probability = True)

    acc_scores = cross_val_score(svc, X_norm, labels, cv = rskf, scoring = 'accuracy', n_jobs = -1)
    prec_scores = cross_val_score(svc, X_norm, labels, cv = rskf, scoring = 'precision', n_jobs = -1)

    # One accuracy line and one precision line per kernel.
    print(kern, ' accuracy: ', acc_scores.mean())
    print(kern, ' precision: ', prec_scores.mean())

### XGBoost

In [None]:
from numpy import loadtxt
from xgboost import XGBClassifier

# XGBoost with default settings on the unscaled features,
# scored with the stratified splitter (rskf).
model = XGBClassifier()
labels = y.values.ravel()

acc_scores = cross_val_score(model, X, labels, cv = rskf, scoring = 'accuracy', n_jobs = -1)
prec_scores = cross_val_score(model, X, labels, cv = rskf, scoring = 'precision', n_jobs = -1)

print('accuracy: ', acc_scores.mean())
print('precision: ', prec_scores.mean())

### CATBoost

In [None]:
from catboost import CatBoostClassifier

# CatBoost (100 iterations, fixed seed) on the unscaled features,
# scored with the stratified splitter (rskf).
model = CatBoostClassifier(iterations = 100, verbose = 10, random_state = 123)
labels = y.values.ravel()

acc_scores = cross_val_score(model, X, labels, cv = rskf, scoring = 'accuracy', n_jobs = -1)
prec_scores = cross_val_score(model, X, labels, cv = rskf, scoring = 'precision', n_jobs = -1)

print('accuracy: ', acc_scores.mean())
print('precision: ', prec_scores.mean())

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost (default settings, fixed seed) on the unscaled features,
# scored with the stratified splitter (rskf).
model = AdaBoostClassifier(random_state = 123)
labels = y.values.ravel()

acc_scores = cross_val_score(model, X, labels, cv = rskf, scoring = 'accuracy', n_jobs = -1)
prec_scores = cross_val_score(model, X, labels, cv = rskf, scoring = 'precision', n_jobs = -1)

print('accuracy: ', acc_scores.mean())
print('precision: ', prec_scores.mean())

# Model Training - (Updated Data)

###  Repeated Stratified K-fold 

In [None]:
# Stratified 10x repeated 3-fold splitter for the updated-data models.
from sklearn.model_selection import RepeatedStratifiedKFold

# random_state is fixed (same seed as the grid-search cells) so that accuracy
# and precision are scored on identical folds and results are reproducible.
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=0)

### Logistic Regression

In [None]:
# Logistic regression on the standardized updated features,
# scored with the stratified splitter (rskf).
log = lm.LogisticRegression()
labels_new = y_new.values.ravel()

acc_scores = cross_val_score(log, X_new_scaled, labels_new, cv = rskf, scoring = 'accuracy', n_jobs = -1)
prec_scores = cross_val_score(log, X_new_scaled, labels_new, cv = rskf, scoring = 'precision', n_jobs = -1)

print('accuracy: ', acc_scores.mean())
print('precision: ', prec_scores.mean())

### Random Forest

In [None]:
# Random forest with default settings on the unscaled updated features,
# scored with the stratified splitter (rskf).
rf = ens.RandomForestClassifier()
labels_new = y_new.values.ravel()

acc_scores = cross_val_score(rf, X_new, labels_new, cv = rskf, scoring = 'accuracy', n_jobs = -1)
prec_scores = cross_val_score(rf, X_new, labels_new, cv = rskf, scoring = 'precision', n_jobs = -1)

print('accuracy: ', acc_scores.mean())
print('precision: ', prec_scores.mean())

### Ridge Classifier

In [None]:
# Ridge classifier with default settings on the row-normalized updated
# features, scored with the stratified splitter (rskf).
rdg = lm.RidgeClassifier()
labels_new = y_new.values.ravel()

acc_scores = cross_val_score(rdg, X_new_norm, labels_new, cv = rskf, scoring = 'accuracy', n_jobs = -1)
prec_scores = cross_val_score(rdg, X_new_norm, labels_new, cv = rskf, scoring = 'precision', n_jobs = -1)

print('ridge accuracy: ', acc_scores.mean())
print('ridge precision: ', prec_scores.mean())

### Support Vector Classifier (SVC)

In [None]:
# Support vector classifier: compare the four kernels on the row-normalized
# updated features using the stratified splitter (rskf).
labels_new = y_new.values.ravel()

for kern in ('linear', 'poly', 'rbf', 'sigmoid'):
    svc = SVC(kernel = kern, probability = True)

    acc_scores = cross_val_score(svc, X_new_norm, labels_new, cv = rskf, scoring = 'accuracy', n_jobs = -1)
    prec_scores = cross_val_score(svc, X_new_norm, labels_new, cv = rskf, scoring = 'precision', n_jobs = -1)

    # One accuracy line and one precision line per kernel.
    print(kern, ' accuracy: ', acc_scores.mean())
    print(kern, ' precision: ', prec_scores.mean())

### XGBoost

In [None]:
from numpy import loadtxt
from xgboost import XGBClassifier

# XGBoost with default settings on the unscaled updated features,
# scored with the stratified splitter (rskf).
model = XGBClassifier()
acc_scores = cross_val_score(model, X_new, y_new.values.ravel(), scoring='accuracy', cv=rskf, n_jobs=-1)
prec_scores = cross_val_score(model, X_new, y_new.values.ravel(), scoring='precision', cv=rskf, n_jobs=-1)
print('accuracy: ', np.mean(acc_scores))
print('precision: ', np.mean(prec_scores))
# cross_val_score fits clones of `model`, so `model` itself is never fitted.
# The former `print(model.best_score)` therefore raised instead of printing a
# score, and it has been removed.

### CATBoost

In [None]:
from catboost import CatBoostClassifier

# CatBoost (100 iterations, fixed seed) on the unscaled updated features,
# scored with the stratified splitter (rskf).
model = CatBoostClassifier(iterations = 100, verbose = 2, random_state = 123)
labels_new = y_new.values.ravel()

acc_scores = cross_val_score(model, X_new, labels_new, cv = rskf, scoring = 'accuracy', n_jobs = -1)
prec_scores = cross_val_score(model, X_new, labels_new, cv = rskf, scoring = 'precision', n_jobs = -1)

print('accuracy: ', acc_scores.mean())
print('precision: ', prec_scores.mean())

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost (default settings, fixed seed) on the unscaled updated features,
# scored with the stratified splitter (rskf).
model = AdaBoostClassifier(random_state = 123)
labels_new = y_new.values.ravel()

acc_scores = cross_val_score(model, X_new, labels_new, cv = rskf, scoring = 'accuracy', n_jobs = -1)
prec_scores = cross_val_score(model, X_new, labels_new, cv = rskf, scoring = 'precision', n_jobs = -1)

print('accuracy: ', acc_scores.mean())
print('precision: ', prec_scores.mean())

# Hyperparameter Tuning

### AdaBoost

In [None]:
from sklearn.model_selection import GridSearchCV

# Seeded 10x repeated stratified 3-fold splitter and a seeded base estimator.
rskf = RepeatedStratifiedKFold(n_splits = 3, n_repeats = 10, random_state = 0)
model = AdaBoostClassifier(random_state = 123)

# Candidate values for each tuned hyperparameter
# (learning rates are the powers of ten from 1e-7 up to 1).
space = {
    'n_estimators': [10, 50, 100, 150, 200, 250, 300],
    'learning_rate': [10 ** power for power in range(-7, 1)],
    'algorithm': ['SAMME', 'SAMME.R'],
}

# Exhaustive search over the grid, ranked by cross-validated accuracy.
search = GridSearchCV(model, space, scoring = 'accuracy', cv = rskf, n_jobs = -1)
result = search.fit(X, y.values.ravel())

# Best mean score and the parameter combination that produced it.
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
# Re-score AdaBoost with fixed hyperparameters on the unscaled features,
# using the current rskf splitter.
model_ada = AdaBoostClassifier(n_estimators = 200, learning_rate = 1, algorithm = "SAMME.R")
labels = y.values.ravel()

acc_scores = cross_val_score(model_ada, X, labels, cv = rskf, scoring = 'accuracy', n_jobs = -1)
prec_scores = cross_val_score(model_ada, X, labels, cv = rskf, scoring = 'precision', n_jobs = -1)

print('accuracy: ', acc_scores.mean())
print('precision: ', prec_scores.mean())

### Ridge Classifier

In [None]:
# Coarse ridge search: seeded 10x repeated stratified 3-fold splitter,
# alpha swept over the powers of ten from 1e-4 up to 100.
rskf = RepeatedStratifiedKFold(n_splits = 3, n_repeats = 10, random_state = 0)
model_rdg = lm.RidgeClassifier()

space = {'alpha': [10 ** power for power in range(-4, 3)]}

# Exhaustive search on the row-normalized features, ranked by accuracy.
search = GridSearchCV(model_rdg, space, scoring = 'accuracy', cv = rskf, n_jobs = -1)
result = search.fit(X_norm, y.values.ravel())

# Best mean score and the alpha that produced it.
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
# Fine ridge search: same seeded splitter, alpha swept from 0.1 upward in
# steps of 0.1 (np.arange stops before 1.1).
rskf = RepeatedStratifiedKFold(n_splits = 3, n_repeats = 10, random_state = 0)
model_rdg = lm.RidgeClassifier()

space = {'alpha': list(np.arange(0.1, 1.1, 0.1))}

# Exhaustive search on the row-normalized features, ranked by accuracy.
search = GridSearchCV(model_rdg, space, scoring = 'accuracy', cv = rskf, n_jobs = -1)
result = search.fit(X_norm, y.values.ravel())

# Best mean score and the alpha that produced it.
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
# Re-score the ridge classifier at alpha = 0.7 on the row-normalized features.
# The splitter is seeded like the grid-search cells, so accuracy and precision
# are measured on the same folds and the numbers are reproducible.
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=0)
rdg = lm.RidgeClassifier(alpha = 0.7)
acc_scores = cross_val_score(rdg, X_norm, y.values.ravel(), scoring='accuracy', cv=rskf, n_jobs=-1)
prec_scores = cross_val_score(rdg, X_norm, y.values.ravel(), scoring='precision', cv=rskf, n_jobs=-1)
print('alpha=0.7, ridge accuracy: ', np.mean(acc_scores))
print('alpha=0.7, ridge precision: ', np.mean(prec_scores))

### SVC

In [None]:
# SVC search: seeded 10x repeated stratified 3-fold splitter and a seeded estimator.
rskf = RepeatedStratifiedKFold(n_splits = 3, n_repeats = 10, random_state = 0)
model_svc = SVC(random_state = 123)

# Candidate values for each tuned hyperparameter.
space = {
    'C': [50, 10, 1, 0.1, 0.001],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'probability': [True, False],
    'gamma': ['auto', 'scale'],
}

# Exhaustive search on the row-normalized features, ranked by accuracy.
search = GridSearchCV(model_svc, space, scoring = 'accuracy', cv = rskf, n_jobs = -1)
result = search.fit(X_norm, y.values.ravel())

# Best mean score and the parameter combination that produced it.
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
# Re-score the SVC with fixed hyperparameters on the row-normalized features,
# using the current rskf splitter.
svc = SVC(C = 10, kernel = 'sigmoid', probability = True, gamma = 'scale')
labels = y.values.ravel()

acc_scores = cross_val_score(svc, X_norm, labels, cv = rskf, scoring = 'accuracy', n_jobs = -1)
prec_scores = cross_val_score(svc, X_norm, labels, cv = rskf, scoring = 'precision', n_jobs = -1)

print('SVC accuracy: ', acc_scores.mean())
print('SVC precision: ', prec_scores.mean())

### Random Forest

In [None]:
# Random-forest search: seeded 10x repeated stratified 3-fold splitter and a seeded estimator.
rskf = RepeatedStratifiedKFold(n_splits = 3, n_repeats = 10, random_state = 0)
model_rf = ens.RandomForestClassifier(random_state = 123)

# Candidate values for each tuned hyperparameter.
space = {
    'n_estimators': [50, 100, 150, 200, 300, 400, 500],
    'max_depth': [None, 5, 10, 14, 19, 25, 30],
    'criterion': ['gini', 'entropy'],
}

# Exhaustive search on the unscaled features, ranked by accuracy.
search = GridSearchCV(model_rf, space, scoring = 'accuracy', cv = rskf, n_jobs = -1)
result = search.fit(X, y.values.ravel())

# Best mean score and the parameter combination that produced it.
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
# Re-score a 300-tree gini random forest on the unscaled features,
# using the current rskf splitter.
rf = ens.RandomForestClassifier(n_estimators = 300, criterion = "gini")
labels = y.values.ravel()

acc_scores = cross_val_score(rf, X, labels, cv = rskf, scoring = 'accuracy', n_jobs = -1)
prec_scores = cross_val_score(rf, X, labels, cv = rskf, scoring = 'precision', n_jobs = -1)

print('accuracy: ', acc_scores.mean())
print('precision: ', prec_scores.mean())

# Try PCA - Original data 

## BootStrap sampling


## standard deviation (Fro

In [None]:
## Try PCA for Dimensionality Reduction
from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import Pipeline

In [None]:
# Stratified 10x repeated 3-fold splitter for the PCA experiments.
# random_state is fixed (same seed as the grid-search cells) so that accuracy
# and precision are scored on identical folds and results are reproducible.
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=0)

### Logistic Regression

In [None]:
# Pipeline: project onto 3 principal components, then fit a logistic regression.
steps = [
    ('pca', PCA(n_components = 3)),
    ('m', lm.LogisticRegression()),
]
model_lg = Pipeline(steps = steps)
labels = y.values.ravel()

test_acc_scores = cross_val_score(model_lg, X, labels, cv = rskf, scoring = 'accuracy', n_jobs = -1)
test_prec_scores = cross_val_score(model_lg, X, labels, cv = rskf, scoring = 'precision', n_jobs = -1)

print('accuracy: ', test_acc_scores.mean())
print('precision: ', test_prec_scores.mean())

### Random Forest

In [None]:
# Random forest behind PCA: sweep the number of components from 1 to 20 and
# record the mean cross-validated accuracy and precision for each setting.
AcRate, PreScore = [], []
rskf = RepeatedStratifiedKFold(n_splits = 3, n_repeats = 10, random_state = 0)
labels = y.values.ravel()

for i in range(1, 21):
    steps = [('pca', PCA(n_components = i)), ('m', ens.RandomForestClassifier(random_state = 123))]
    model_rf = Pipeline(steps = steps)

    test_acc_scores = cross_val_score(model_rf, X, labels, cv = rskf, scoring = 'accuracy', n_jobs = -1)
    test_prec_scores = cross_val_score(model_rf, X, labels, cv = rskf, scoring = 'precision', n_jobs = -1)

    # Keep the per-setting means for the plot that follows, and log them.
    mean_acc, mean_prec = np.mean(test_acc_scores), np.mean(test_prec_scores)
    AcRate.append(mean_acc)
    PreScore.append(mean_prec)
    print('n_components:', i, 'accuracy: ', mean_acc)
    print('n_components:', i, 'precision: ', mean_prec)

In [None]:
# Plot the mean accuracy and precision collected in AcRate / PreScore
# against the number of PCA components (1-20).
n_components = list(range(1, 21))

for curve, curve_label in ((AcRate, "Accuracy"), (PreScore, "Precision")):
    plt.plot(n_components, curve, label = curve_label)
plt.legend()
plt.show()

In [None]:
# Pipeline: 11 principal components followed by a 300-tree random forest,
# scored with the current rskf splitter.
steps = [
    ('pca', PCA(n_components = 11)),
    ('m', ens.RandomForestClassifier(n_estimators = 300)),
]
model_rf = Pipeline(steps = steps)
labels = y.values.ravel()

test_acc_scores = cross_val_score(model_rf, X, labels, cv = rskf, scoring = 'accuracy', n_jobs = -1)
test_prec_scores = cross_val_score(model_rf, X, labels, cv = rskf, scoring = 'precision', n_jobs = -1)

print('n_components: 11, accuracy: ', test_acc_scores.mean())
print('n_components: 11, precision: ', test_prec_scores.mean())

### Ridge Classifier

In [None]:
#RIDGE
# Pipeline: 10 principal components followed by a ridge classifier (alpha = 0.7)
# on the row-normalized features. The splitter is seeded like the other cells so
# accuracy and precision are scored on the same folds and are reproducible.
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=0)
steps = [('pca', PCA(n_components=10)), ('m', lm.RidgeClassifier(alpha = 0.7))]
model_rid = Pipeline(steps = steps)

test_acc_scores = cross_val_score(model_rid, X_norm, y.values.ravel(), scoring='accuracy', cv=rskf, n_jobs=-1)
test_prec_scores = cross_val_score(model_rid, X_norm, y.values.ravel(), scoring='precision', cv=rskf, n_jobs=-1)
print(' ridge accuracy: ', np.mean(test_acc_scores))
print(' ridge precision: ', np.mean(test_prec_scores))

### SVC

In [None]:
# Pipeline: 10 principal components followed by a sigmoid-kernel SVC,
# on the row-normalized features, scored with the current rskf splitter.
steps = [
    ('pca', PCA(n_components = 10)),
    ('svc', SVC(C = 10, kernel = 'sigmoid', probability = True, gamma = 'scale')),
]
model_svc = Pipeline(steps = steps)
labels = y.values.ravel()

acc_scores = cross_val_score(model_svc, X_norm, labels, cv = rskf, scoring = 'accuracy', n_jobs = -1)
prec_scores = cross_val_score(model_svc, X_norm, labels, cv = rskf, scoring = 'precision', n_jobs = -1)

print('accuracy: ', acc_scores.mean())
print('precision: ', prec_scores.mean())

### AdaBoost

In [None]:
# AdaBoost behind PCA: sweep the number of components from 1 to 20 and record
# the mean cross-validated accuracy and precision for each setting.
AcRate, PreScore = [], []
rskf = RepeatedStratifiedKFold(n_splits = 3, n_repeats = 10, random_state = 0)
labels = y.values.ravel()

for i in range(1, 21):
    steps = [('pca', PCA(n_components = i)), ('ada', AdaBoostClassifier(n_estimators = 200, learning_rate = 1, algorithm = "SAMME.R", random_state = 123))]
    model_ada = Pipeline(steps = steps)

    acc_scores = cross_val_score(model_ada, X, labels, cv = rskf, scoring = 'accuracy', n_jobs = -1)
    prec_scores = cross_val_score(model_ada, X, labels, cv = rskf, scoring = 'precision', n_jobs = -1)

    # Keep the per-setting means for the plot that follows, and log them.
    mean_acc, mean_prec = np.mean(acc_scores), np.mean(prec_scores)
    AcRate.append(mean_acc)
    PreScore.append(mean_prec)
    print('n_components:', i, 'accuracy: ', mean_acc)
    print('n_components:', i, 'precision: ', mean_prec)

In [None]:
import matplotlib.pyplot as plt

# Plot the mean accuracy and precision collected in AcRate / PreScore
# against the number of PCA components (1-20).
n_components = list(range(1, 21))

for curve, curve_label in ((AcRate, "Accuracy"), (PreScore, "Precision")):
    plt.plot(n_components, curve, label = curve_label)
plt.legend()
plt.show()

In [None]:
# Pipeline: 11 principal components followed by AdaBoost with fixed
# hyperparameters, scored with the current rskf splitter.
steps = [('pca', PCA(n_components=11)), ('ada', AdaBoostClassifier(n_estimators=200, learning_rate = 1, algorithm = "SAMME.R"))]
model_ada = Pipeline(steps = steps)

acc_scores = cross_val_score(model_ada, X, y.values.ravel(), scoring='accuracy', cv=rskf, n_jobs=-1)
prec_scores = cross_val_score(model_ada, X, y.values.ravel(), scoring='precision', cv=rskf, n_jobs=-1)
# The labels now state the component count; it was missing, leaving a dangling
# 'n_components:' prefix (the random-forest counterpart already prints it).
print('n_components: 11, accuracy: ', np.mean(acc_scores))
print('n_components: 11, precision: ', np.mean(prec_scores))

### XGBoost

In [None]:
# XGBoost behind PCA: sweep the number of components from 1 to 20 and record
# the mean cross-validated accuracy and precision for each setting.
AcRate, PreScore = [], []
rskf = RepeatedStratifiedKFold(n_splits = 3, n_repeats = 10, random_state = 0)
labels = y.values.ravel()

for i in range(1, 21):
    steps = [('pca', PCA(n_components = i)), ('xg', XGBClassifier())]
    model_xg = Pipeline(steps = steps)

    acc_scores = cross_val_score(model_xg, X, labels, cv = rskf, scoring = 'accuracy', n_jobs = -1)
    prec_scores = cross_val_score(model_xg, X, labels, cv = rskf, scoring = 'precision', n_jobs = -1)

    # Keep the per-setting means for the plot that follows, and log them.
    mean_acc, mean_prec = np.mean(acc_scores), np.mean(prec_scores)
    AcRate.append(mean_acc)
    PreScore.append(mean_prec)
    print('n_components:', i, 'accuracy: ', mean_acc)
    print('n_components:', i, 'precision: ', mean_prec)

In [None]:
# Plot the mean accuracy and precision collected in AcRate / PreScore
# against the number of PCA components (1-20).
n_components = list(range(1, 21))

for curve, curve_label in ((AcRate, "Accuracy"), (PreScore, "Precision")):
    plt.plot(n_components, curve, label = curve_label)
plt.legend()
plt.show()

In [None]:
#XGBoost
# Pipeline: 11 principal components followed by XGBoost with default settings.
# The splitter is seeded like the sweep cells so accuracy and precision are
# scored on the same folds and the numbers are reproducible.
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=0)
steps = [('pca', PCA(n_components=11)), ('xg', XGBClassifier())]
model_xg = Pipeline(steps = steps)

test_acc_scores = cross_val_score(model_xg, X, y.values.ravel(), scoring='accuracy', cv=rskf, n_jobs=-1)
test_prec_scores = cross_val_score(model_xg, X, y.values.ravel(), scoring='precision', cv=rskf, n_jobs=-1)
print(' XGBoost accuracy: ', np.mean(test_acc_scores))
print(' XGBoost precision: ', np.mean(test_prec_scores))

### CATBoost

In [None]:
#CATBoost
# Pipeline: 10 principal components followed by CatBoost (100 iterations).
# The splitter is seeded like the sweep cells so accuracy and precision are
# scored on the same folds and the numbers are reproducible.
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=0)
steps = [('pca', PCA(n_components=10)), ('cat', CatBoostClassifier(iterations = 100, verbose=10))]
model_cat = Pipeline(steps = steps)

test_acc_scores = cross_val_score(model_cat, X, y.values.ravel(), scoring='accuracy', cv=rskf, n_jobs=-1)
test_prec_scores = cross_val_score(model_cat, X, y.values.ravel(), scoring='precision', cv=rskf, n_jobs=-1)
# These are CatBoost results; the labels used to say 'ridge'.
print(' CatBoost accuracy: ', np.mean(test_acc_scores))
print(' CatBoost precision: ', np.mean(test_prec_scores))

In [None]:
# CatBoost behind PCA: sweep the number of components from 1 to 20 and record
# the mean cross-validated accuracy and precision for each setting.
AcRate, PreScore = [], []
rskf = RepeatedStratifiedKFold(n_splits = 3, n_repeats = 10, random_state = 0)
labels = y.values.ravel()

for i in range(1, 21):
    steps = [('pca', PCA(n_components = i)), ('cat', CatBoostClassifier(iterations = 100, verbose = 10))]
    model_cat = Pipeline(steps = steps)

    acc_scores = cross_val_score(model_cat, X, labels, cv = rskf, scoring = 'accuracy', n_jobs = -1)
    prec_scores = cross_val_score(model_cat, X, labels, cv = rskf, scoring = 'precision', n_jobs = -1)

    # Keep the per-setting means for the plot that follows, and log them.
    mean_acc, mean_prec = np.mean(acc_scores), np.mean(prec_scores)
    AcRate.append(mean_acc)
    PreScore.append(mean_prec)
    print('n_components:', i, 'accuracy: ', mean_acc)
    print('n_components:', i, 'precision: ', mean_prec)

In [None]:
# Plot the mean accuracy and precision collected in AcRate / PreScore
# against the number of PCA components (1-20).
n_components = list(range(1, 21))

for curve, curve_label in ((AcRate, "Accuracy"), (PreScore, "Precision")):
    plt.plot(n_components, curve, label = curve_label)
plt.legend()
plt.show()

In [None]:
#CATBoost
# Pipeline: 10 principal components followed by CatBoost (100 iterations).
# The splitter is seeded like the sweep cells so accuracy and precision are
# scored on the same folds and the numbers are reproducible.
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=0)
steps = [('pca', PCA(n_components=10)), ('cat', CatBoostClassifier(iterations = 100, verbose=10))]
model_cat = Pipeline(steps = steps)

test_acc_scores = cross_val_score(model_cat, X, y.values.ravel(), scoring='accuracy', cv=rskf, n_jobs=-1)
test_prec_scores = cross_val_score(model_cat, X, y.values.ravel(), scoring='precision', cv=rskf, n_jobs=-1)
# These are CatBoost results; the labels used to say 'ridge'.
print(' CatBoost accuracy: ', np.mean(test_acc_scores))
print(' CatBoost precision: ', np.mean(test_prec_scores))

# PCA then Hyperparameters Tuning - Updated Data

### Ada Boost

In [None]:
from sklearn.model_selection import GridSearchCV

# AdaBoost behind PCA, tuned jointly: seeded 10x repeated stratified 3-fold
# splitter and a seeded estimator.
# NOTE(review): the section heading says "Updated Data", but this search is
# fitted on X / y (the original-data features) - confirm which is intended.
rskf = RepeatedStratifiedKFold(n_splits = 3, n_repeats = 10, random_state = 0)
steps = [
    ('pca', PCA()),
    ('ada', AdaBoostClassifier(random_state = 123)),
]
model_ada = Pipeline(steps = steps)

# Candidate values; the '<step>__<param>' keys address the pipeline steps.
space = {
    'pca__n_components': list(range(2, 21)),
    'ada__n_estimators': [10, 50, 100, 150, 200, 250, 300],
    'ada__learning_rate': [10 ** power for power in range(-7, 1)],
    'ada__algorithm': ['SAMME', 'SAMME.R'],
}

# Exhaustive search, ranked by cross-validated accuracy.
search = GridSearchCV(model_ada, space, scoring = 'accuracy', cv = rskf, n_jobs = -1, verbose = 2)
result = search.fit(X, y.values.ravel())

# Best mean score and the parameter combination that produced it.
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
#ada_result = pd.DataFrame(result.cv_results_)
#ada_result.head()

In [None]:
# Re-evaluate AdaBoost with fixed settings: 10 PCA components, 250 SAMME estimators, learning rate 1
# Reuses whichever `rskf` splitter is currently defined in the kernel
steps = [
    ('pca', PCA(n_components=10)),
    ('ada', AdaBoostClassifier(n_estimators=250, learning_rate = 1, algorithm = "SAMME")),
]
model_ada = Pipeline(steps = steps)

# Mean cross-validated accuracy, then precision, on the same pipeline
acc_scores = cross_val_score(model_ada, X, y.values.ravel(), cv=rskf, scoring='accuracy', n_jobs=-1)
prec_scores = cross_val_score(model_ada, X, y.values.ravel(), cv=rskf, scoring='precision', n_jobs=-1)
print('n_components: accuracy: ', np.mean(acc_scores))
print('n_components: precision: ', np.mean(prec_scores))

### Random Forest

In [None]:
# Random Forest on PCA-reduced features: exhaustive grid search scored on accuracy
# CV splitter: stratified 3-fold repeated 10 times with a fixed seed
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=0)
# PCA() is left unparameterised: n_components is set by the grid below.
# (Previously this read a stale loop variable `i`, which fails on a fresh kernel.)
steps = [('pca', PCA()), ('rf', ens.RandomForestClassifier(random_state=123))]
model_rf = Pipeline(steps = steps)

# Hyperparameter grid; keys follow the pipeline's "<step name>__<parameter>" convention
space = dict()
space['pca__n_components'] = list(range(2, 21))
space['rf__n_estimators'] = [50, 100, 150, 200, 300, 400, 500]
space['rf__max_depth'] = [None, 5, 10, 14, 19, 25, 30]
space['rf__criterion'] = ['gini', 'entropy']

# Search
search = GridSearchCV(model_rf, space, scoring='accuracy', cv=rskf, n_jobs=-1, verbose = 2)
result = search.fit(X, y.values.ravel())

# Report the best mean CV accuracy and the parameters that produced it
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
# Re-display the most recent grid-search outcome held in `result`
print(
    'Best Score: %s' % result.best_score_,
    'Best Hyperparameters: %s' % result.best_params_,
    sep='\n',
)

In [None]:
#rf_result = pd.DataFrame(result.cv_results_)
#rf_result.head()

In [None]:
# Re-evaluate Random Forest with fixed settings: 12 PCA components, 50 trees, max depth 19, entropy criterion
# Reuses whichever `rskf` splitter is currently defined in the kernel
steps = [('pca', PCA(n_components=12)), ('m', ens.RandomForestClassifier(n_estimators = 50, max_depth = 19, criterion = "entropy"))]
model_rf = Pipeline(steps = steps)

test_acc_scores = cross_val_score(model_rf, X, y.values.ravel(), scoring='accuracy', cv=rskf, n_jobs=-1)
test_prec_scores = cross_val_score(model_rf, X, y.values.ravel(), scoring='precision', cv=rskf, n_jobs=-1)
# Labels now match the pipeline above (they previously said 11 while PCA used 12 components)
print('n_components: 12, accuracy: ', np.mean(test_acc_scores))
print('n_components: 12, precision: ', np.mean(test_prec_scores))

### Ridge Classifier

# Hyperparameter Tuning - Updated Data

### AdaBoost

In [None]:
from sklearn.model_selection import GridSearchCV

# AdaBoost on PCA-reduced features (X_new / y_new): exhaustive grid search scored on accuracy
# CV splitter: stratified 3-fold repeated 10 times with a fixed seed
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=0)
steps = [
    ('pca', PCA()),
    ('ada', AdaBoostClassifier(random_state=123)),
]
model_ada = Pipeline(steps = steps)

# Hyperparameter grid; keys follow the pipeline's "<step name>__<parameter>" convention
space = {
    'pca__n_components': list(range(2, 21)),
    'ada__n_estimators': [10, 50, 100, 150, 200, 250, 300],
    'ada__learning_rate': [10**power for power in range(-7, 1)],
    'ada__algorithm': ['SAMME', 'SAMME.R'],
}

# Fit every combination in the grid
ada_search = GridSearchCV(estimator = model_ada, param_grid = space, scoring='accuracy', cv=rskf, n_jobs=-1, verbose = 2)
ada_result = ada_search.fit(X_new, y_new.values.ravel())

# Report the best mean CV accuracy and the parameters that produced it
print('Best Score: %s' % ada_result.best_score_)
print('Best Hyperparameters: %s' % ada_result.best_params_)

In [None]:
# Re-display the AdaBoost grid-search outcome held in `ada_result`
print(
    'Best Score: %s' % ada_result.best_score_,
    'Best Hyperparameters: %s' % ada_result.best_params_,
    sep='\n',
)

In [None]:
# Tabulate the AdaBoost grid-search results and preview the first five rows
ada_result_tb = pd.DataFrame(data = ada_result.cv_results_)
ada_result_tb.head(n = 5)

In [None]:
# Re-evaluate AdaBoost with fixed settings: 15 PCA components, 300 SAMME.R estimators, learning rate 0.1
steps = [
    ('pca', PCA(n_components=15)),
    ('ada', AdaBoostClassifier(n_estimators=300, learning_rate = 0.1, algorithm = "SAMME.R")),
]
model_ada = Pipeline(steps = steps)

# CV splitter: stratified 5-fold repeated 20 times (unseeded, so splits vary between runs)
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=20)
acc_scores = cross_val_score(model_ada, X_new, y_new.values.ravel(), cv=rskf, scoring='accuracy', n_jobs=-1)
prec_scores = cross_val_score(model_ada, X_new, y_new.values.ravel(), cv=rskf, scoring='precision', n_jobs=-1)
print('n_components: accuracy: ', np.mean(acc_scores))
print('n_components: precision: ', np.mean(prec_scores))

In [None]:
# Re-evaluate the fixed AdaBoost pipeline with a heavier CV: stratified 5-fold repeated 100 times.
# A second `rskf` assignment (n_repeats=20), left over from the previous cell, used to
# silently override this splitter; it has been removed so the 100 repeats take effect.
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=100)
steps = [('pca', PCA(n_components=15)), ('ada', AdaBoostClassifier(n_estimators=300, learning_rate = 0.1, algorithm = "SAMME.R"))]
model_ada = Pipeline(steps = steps)

acc_scores = cross_val_score(model_ada, X_new, y_new.values.ravel(), scoring='accuracy', cv=rskf, n_jobs=-1)
prec_scores = cross_val_score(model_ada, X_new, y_new.values.ravel(), scoring='precision', cv=rskf, n_jobs=-1)
print('n_components 15: accuracy: ', np.mean(acc_scores))
print('n_components 15: precision: ', np.mean(prec_scores))
# Spread of THIS cell's scores; previously these lines read the Random Forest
# `test_acc_scores` / `test_prec_scores` left in the kernel by another cell.
print('n_components: 15, accuracy_std: ', np.std(acc_scores))
print('n_components: 15, precision_std: ', np.std(prec_scores))


### Random Forest

In [None]:
# Random Forest on PCA-reduced features (X_new / y_new): exhaustive grid search scored on accuracy
# CV splitter: stratified 3-fold repeated 10 times with a fixed seed
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=0)
# PCA() is left unparameterised: n_components is set by the grid below.
# (Previously this read a stale loop variable `i`, which fails on a fresh kernel.)
steps = [('pca', PCA()), ('rf', ens.RandomForestClassifier(random_state=123))]
model_rf = Pipeline(steps = steps)

# Hyperparameter grid; keys follow the pipeline's "<step name>__<parameter>" convention
space = dict()
space['pca__n_components'] = list(range(2, 21))
space['rf__n_estimators'] = [50, 100, 150, 200, 300, 400, 500]
space['rf__max_depth'] = [None, 5, 10, 14, 19, 25, 30]
space['rf__criterion'] = ['gini', 'entropy']

# Search
rf_search = GridSearchCV(model_rf, space, scoring='accuracy', cv=rskf, n_jobs=-1, verbose = 2)
# Fit the search object built above; the original fitted `search`, a stale object from an earlier cell
rf_result = rf_search.fit(X_new, y_new.values.ravel())

# Report the best mean CV accuracy and the parameters that produced it
print('Best Score: %s' % rf_result.best_score_)
print('Best Hyperparameters: %s' % rf_result.best_params_)

In [None]:
# Re-display the Random Forest grid-search outcome held in `rf_result`
print(
    'Best Score: %s' % rf_result.best_score_,
    'Best Hyperparameters: %s' % rf_result.best_params_,
    sep='\n',
)

In [None]:
# Tabulate the Random Forest grid-search results and preview the first rows.
# Reads `rf_result` (this section's search) rather than `result`, which belongs to an earlier section.
rf_result_tb = pd.DataFrame(rf_result.cv_results_)
rf_result_tb.head()

In [None]:
# Re-evaluate Random Forest with fixed settings: 14 PCA components, 50 trees, max depth 14, entropy criterion
# Reuses whichever `rskf` splitter is currently defined in the kernel
steps = [
    ('pca', PCA(n_components=14)),
    ('m', ens.RandomForestClassifier(n_estimators = 50, max_depth = 14, criterion = "entropy")),
]
model_rf = Pipeline(steps = steps)

# Mean cross-validated accuracy, then precision, on the same pipeline
test_acc_scores = cross_val_score(model_rf, X_new, y_new.values.ravel(), cv=rskf, scoring='accuracy', n_jobs=-1)
test_prec_scores = cross_val_score(model_rf, X_new, y_new.values.ravel(), cv=rskf, scoring='precision', n_jobs=-1)
print('n_components: 14, accuracy: ', np.mean(test_acc_scores))
print('n_components: 14, precision: ', np.mean(test_prec_scores))

In [None]:
# Heavier confirmation run for the fixed Random Forest pipeline
# CV splitter: stratified 5-fold repeated 100 times with a fixed seed
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=100, random_state=0)

steps = [
    ('pca', PCA(n_components=14)),
    ('m', ens.RandomForestClassifier(n_estimators = 50, max_depth = 14, criterion = "entropy")),
]
model_rf = Pipeline(steps = steps)

test_acc_scores = cross_val_score(model_rf, X_new, y_new.values.ravel(), cv=rskf, scoring='accuracy', n_jobs=-1)
test_prec_scores = cross_val_score(model_rf, X_new, y_new.values.ravel(), cv=rskf, scoring='precision', n_jobs=-1)

# Centre of the per-fold scores
print('n_components: 14, accuracy: ', np.mean(test_acc_scores))
print('n_components: 14, precision: ', np.mean(test_prec_scores))
# Spread of the per-fold scores
print('n_components: 14, accuracy_std: ', np.std(test_acc_scores))
print('n_components: 14, precision_std: ', np.std(test_prec_scores))

In [None]:
# 20th percentile of the per-fold Random Forest scores
# (labels previously said "_std", but the values printed are percentiles)
print('n_components: 14, accuracy 20th percentile: ', np.percentile(test_acc_scores, 20))
print('n_components: 14, precision 20th percentile: ', np.percentile(test_prec_scores, 20))

# Feature Importance

In [None]:
# Build a frame without any column whose name contains 'gene_'
# (`drop` returns a new frame, so `df_all` itself is left untouched)
df_sect_only = df_all.drop(columns = [col for col in df_all.columns if 'gene_' in col])

In [None]:
# More robust AdaBoost feature importance: refit on many random stratified 80/20 splits
# and stack the per-fit importances into `final_df` (one row per feature per iteration).
# NOTE(review): the original comment mentioned 10K iterations, but the loop runs 1000 — confirm the intended count.
X_sect = df_sect_only.drop(columns = [x for x in df_sect_only.columns if x == 'outcome' or x == 'sample_id'])

y_sect = df_sect_only.drop(columns = [x for x in df_sect_only.columns if x != 'outcome'])

# Feature names are the same on every iteration, so build the list once
feature_names = [x for x in df_sect_only.columns if x != 'outcome' and x != 'sample_id']

# Collect per-iteration frames and concatenate once at the end;
# growing a DataFrame with concat inside the loop copies all prior rows each time.
fi_frames = []

for i in range(1000):
    # Fresh random split each iteration (unseeded), stratified on the outcome
    X_train, X_test, y_train, y_test = train_test_split(X_sect, y_sect, 
                                                        stratify = y_sect, 
                                                        test_size = 0.2)

    model_ada = AdaBoostClassifier(n_estimators=200, learning_rate = 1, algorithm = "SAMME.R").fit(X_train, y_train.values.ravel())
    
    importances = model_ada.feature_importances_
    data = {'feature_names': feature_names, 'feature_importance': importances}
    fi_df = pd.DataFrame(data)
    
    fi_frames.append(fi_df)

final_df = pd.concat(fi_frames)

In [None]:
# Average each feature's importance over all iterations stored in `final_df`.
# A groupby mean is used instead of sum / 10000: the hard-coded divisor did not match
# the 1000-iteration loop, whereas the mean is correct for any number of iterations.
keep_df = (
    final_df
    .groupby('feature_names', as_index=False)['feature_importance']
    .mean()
    .sort_values(by='feature_importance', ascending=False)
)

#keep_df.to_csv("ada_fi_tmp.csv")

In [None]:
# Plot the 25 highest average AdaBoost feature importances.
# NOTE: this reloads a previously saved CSV and overwrites the in-memory `keep_df`;
# the file must already exist (the `to_csv` call that writes it is commented out).
keep_df = pd.read_csv("ada_fi_tmp.csv")
plt.figure(figsize = (5, 7))
plt.scatter(keep_df.head(25)['feature_importance'], keep_df.head(25)['feature_names'])
# plt.tick_params(color='white', labelcolor='white')
# Title now names AdaBoost (it said "RF", but the importances come from AdaBoostClassifier).
# NOTE(review): "10,000 Iterations" depends on how the saved CSV was produced — confirm.
plt.title("AdaBoost Feature Importance Avg. - 10,000 Iterations")#, color = 'white')
plt.savefig("fi_10k", bbox_inches='tight')