In [None]:
import math as m
import random
import random as rand
import re
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import ensemble as ens, linear_model as lm, metrics
from sklearn.decomposition import PCA, KernelPCA
from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
from sklearn.model_selection import (
    GridSearchCV,
    RepeatedKFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.svm import SVC
warnings.filterwarnings("ignore")


# Clinical Data updated_ Alex

In [None]:
#DEFINING A FUNCTION TO UPDATE COLUMN NAMES LATER
def lower_no_space(word):
    """Normalise a column header into a lowercase identifier-like string.

    Spaces and forward slashes become underscores; apostrophes, parentheses
    and question marks are removed; the result is lowercased. All other
    characters (e.g. '+', '%', ',', ':') are left untouched.

    Parameters
    ----------
    word : str
        Original column name.

    Returns
    -------
    str
        The cleaned, lowercased column name.
    """
    # A single translate pass replaces the chain of regex substitutions and
    # avoids the invalid '\?' escape the non-raw pattern string contained.
    char_map = str.maketrans({
        ' ': '_',
        '/': '_',
        "'": None,
        '(': None,
        ')': None,
        '?': None,
    })
    return word.translate(char_map).lower()

In [None]:
#READ IN Updated CLINICAL DATA FOR LATER USE (CONVERTED TO .csv IN GOOGLE SHEETS)
# header=1: the column names are taken from the second row of the file
clinical_csv_path = "Homebase_new_updated.csv"
df_clin_updated = pd.read_csv(clinical_csv_path, header=1)

In [None]:
#RENAMING COLUMNS
# Clean every header with lower_no_space, then shorten the sample-id column name
df_clin_updated = (
    df_clin_updated
    .rename(columns=lower_no_space)
    .rename(columns={'subject_sample_id': 'sample_id'})
)

In [None]:
#CHECK CLINICAL DATA BASICS
# Display the first five rows of the clinical table
df_clin_updated.head(n=5)

In [None]:
#Compute the age at initial diagnosis from date of birth and date_of_initial_diagnosis
df_clin_updated['date_of_birth'] = pd.to_datetime(df_clin_updated['date_of_birth'])
df_clin_updated['date_of_initial_diagnosis'] = pd.to_datetime(df_clin_updated['date_of_initial_diagnosis'])

birth = df_clin_updated['date_of_birth']
diagnosis = df_clin_updated['date_of_initial_diagnosis']

# Subtracting calendar years alone overstates the age by one whenever the
# diagnosis happened before that year's birthday, so flag those rows.
# Comparisons involving a missing date evaluate to False; the age for such
# rows is still NaN because the year difference itself is NaN.
birthday_pending = (diagnosis.dt.month < birth.dt.month) | (
    (diagnosis.dt.month == birth.dt.month) & (diagnosis.dt.day < birth.dt.day)
)

# Completed years between birth and diagnosis
df_clin_updated["age_at_initial_diagnosis"] = (
    diagnosis.dt.year - birth.dt.year - birthday_pending.astype(int)
)

In [None]:
# The dates of birth in the Stanford data are abnormal, so the derived age is
# blanked out (NaN) for every Stanford row and for any row whose age is negative.
age = df_clin_updated["age_at_initial_diagnosis"].astype("float64")
from_stanford = df_clin_updated['data_access_group'] == 'Stanford'
df_clin_updated["age_at_initial_diagnosis"] = age.mask(from_stanford | (age < 0))


In [None]:
# Store the 't' and 'b' columns with object dtype.
# NOTE(review): the earlier comment mentioned date_of_birth, n and m, but only
# 't' and 'b' are cast here - confirm which columns were intended.
object_dtype_cols = ['t', 'b']
df_clin_updated = df_clin_updated.astype({col: 'object' for col in object_dtype_cols})


In [None]:
#TONS OF DATA, PULL WHAT WE WANT
# Columns retained for modelling; every other clinical column is discarded.
clinical_keep_cols = [
    'gender', 'race',
    'country_of_residence', 'sample_id', 'ethnicity',
    'age_at_initial_diagnosis',
    'lymph_node_biopsy_performed',
    'family_history_of_leukemia_lymphoma',
    'has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical',
    'cd4+:cd8+_ratio', 'total_lymphocyte_count', 'absolute_cd4+_count_per_ul',
    '%cd4+cd26-', '%cd4+cd7-', 'tcr_clonality', 'tumor_cell_cd30+',
    'large_cell_transformation', 'ldh_u_l', 'wbc_103_μl', 'rbc_106_μl',
    'hematocrit_%', 'mcv_fl', 'mchc_g_dl', 'rdw_%', 'platelet_count_103_μl',
    'segmented_neutrophil,_absolute_103_μl', 'lymphocyte,_absolute_103_μl',
    'monocytes,_absolute_103_μl',
    'basophils,_absolute_103_μl', 'segmented_neutrophils_%', 'lymphocytes_%',
    'monocytes_%', 'eosinophils_%', 'basophils_%',
]

# Boolean column mask keeps the original column order; .copy() gives an
# independent frame that later cells can modify freely.
df_clin_updated_lean = df_clin_updated.loc[:, df_clin_updated.columns.isin(clinical_keep_cols)].copy()

#'predominant_lesion_type_at_diagnosis', 't', 'n', 'm', 'b', 'eosinophils,_absolute_103_μl'

In [None]:
# TURN YES/NO & POSITIVE/NEGATIVE TO DUMMIES
# Each listed column becomes 1 where it equals its positive label and 0 for
# every other value (missing entries therefore also become 0).
positive_label_by_col = {
    'lymph_node_biopsy_performed': 'Yes',
    'family_history_of_leukemia_lymphoma': 'Yes',
    'tumor_cell_cd30+': 'Yes',
    'large_cell_transformation': 'Yes',
    'tcr_clonality': 'Positive',
    'has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical': 'Yes',
}

for flag_col, positive_label in positive_label_by_col.items():
    is_positive = df_clin_updated_lean[flag_col] == positive_label
    df_clin_updated_lean[flag_col] = is_positive.astype('int64')

### df_lean: Preprocessed Genetic Data

In [None]:
# Read in the Preprocessed Genetic Data
genetic_csv_path = 'stats_by_sample.csv'
df_lean = pd.read_csv(genetic_csv_path)

In [None]:
#CHECK GENETIC DATA BASICS
# Preview the genetic table that was just loaded; this cell previously
# re-displayed the clinical table, which had already been shown earlier.
df_lean.head()

In [None]:
#TRANSFORM SAMPLE ID TO JOIN TO CLINICAL DATA
def _normalize_sample_id(raw_id):
    """Rewrite one genetic sample id according to the marker substring it contains.

    The markers are tested in a fixed order and the first match wins; ids
    containing none of the markers are returned unchanged.
    """
    if 'WES' in raw_id:
        # first five characters, underscores turned into hyphens
        return re.sub('_', '-', raw_id[:5])
    if 'CTCL' in raw_id:
        # drop the last 10 characters
        return raw_id[:-10]
    if 'almeida' in raw_id:
        # drop the last 13 characters
        return raw_id[:-13]
    if 'ungewickell' in raw_id:
        # move the final two characters to the front, then drop the last 15
        rotated = raw_id[-2:] + raw_id[:-2]
        return rotated[:-15]
    if 'SPZ' in raw_id:
        # drop the last 10 characters, strip leading zeros from each
        # hyphen-separated part and lowercase it
        parts = raw_id[:-10].split('-')
        return '-'.join([part.lstrip('0').lower() for part in parts])
    return raw_id

df_lean['sample_id'] = df_lean['sample_id'].apply(_normalize_sample_id)

# Merge (Updated)

In [None]:
#MERGE the updated CLINICAL, GENETIC DATA
# Left join: every genetic sample is kept; clinical columns are NaN where no
# clinical row shares the sample_id.
df_all_updated = df_lean.merge(df_clin_updated_lean, how='left', on='sample_id')

In [None]:
#IMPUTATION; "UNKNOWN" FOR CATEGORICAL, MEAN FILL-IN FOR CONTINUOUS
categorical_cols = {'race', 'gender', 'country_of_residence', 'ethnicity',
                    'predominant_lesion_type_at_diagnosis', 't', 'n', 'm', 'b'}

for col in df_clin_updated_lean.columns:
    if col in categorical_cols:
        fill_value = 'unknown'
    elif col == 'sample_id':
        # the join key is never imputed
        continue
    else:
        # column mean over the merged table
        fill_value = np.mean(df_all_updated[col])
    df_all_updated[col] = df_all_updated[col].fillna(fill_value)

In [None]:
#GET DUMMIES FOR CATEGORICALS
# One indicator column is created per distinct value of each listed column
dummy_cols = ['race', 'gender', 'country_of_residence', 'ethnicity']
df_all_updated = pd.get_dummies(df_all_updated, columns=dummy_cols)


# Defining Features and Labels - For updated data

In [None]:
#DEFINE STANDARDSCALER FOR LATER USE
# Unfitted scaler with default settings; it is fitted when the feature matrix is built.
std_scl = StandardScaler()

In [None]:
# Define (Scaled/Normalized) Features and Labels
# Features: every column except the label and the join key
non_feature_cols = ['outcome', 'sample_id']
X_new = df_all_updated.drop(columns=non_feature_cols, errors='ignore')
X_new_scaled = std_scl.fit_transform(X_new)   # scaler fitted on the full feature table
X_new_norm = normalize(X_new)                 # sklearn.preprocessing.normalize, default arguments

# Labels: a one-column frame holding only 'outcome'
y_new = df_all_updated.loc[:, df_all_updated.columns == 'outcome']

In [None]:
# Show the relative frequency of each value in the 'outcome' column
df_all_updated['outcome'].value_counts(normalize = True)

# Model Training without Lesion Type

### Repeated Stratified K-fold

In [None]:
# Stratified Version
# Import the repeated stratified K-fold splitter and build one with
# 3 folds repeated 10 times (30 train/test splits per evaluation).
from sklearn.model_selection import RepeatedStratifiedKFold
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10)

### Logistic Regression

In [None]:
# Log
# Pipeline: standardise -> PCA -> L1-penalised logistic regression.
# The scaler sits inside the pipeline and is fed the raw features, so every
# CV split fits it on its training folds only; scaling the whole table up
# front would let test-fold statistics leak into training.
n_components = 2
# Seeded so the fold assignment is repeatable between runs
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=42)
steps = [('scale', StandardScaler()),
         ('pca', PCA(n_components=n_components)),
         ('m', lm.LogisticRegression(solver = 'liblinear', penalty = 'l1'))]
model_log = Pipeline(steps = steps)

# One cross_validate call scores both metrics on exactly the same splits
# (two separate cross_val_score calls on an unseeded splitter used different
# random partitions for accuracy and precision, and fitted everything twice).
cv_results = cross_validate(model_log, X_new, y_new.values.ravel(),
                            scoring=['accuracy', 'precision'], cv=rskf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print("Logistic Regression")
print(f'n_components: {n_components}, accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print(f'n_components: {n_components}, precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

### Random Forest

In [None]:
# Pipeline: PCA -> random forest (500 trees, entropy criterion) on the raw features.
n_components = 12
# Seeded so the fold assignment is repeatable between runs
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=42)
steps = [('pca', PCA(n_components=n_components)),
         ('m', ens.RandomForestClassifier(n_estimators = 500, criterion = "entropy"))]
model_rf = Pipeline(steps = steps)

# One cross_validate call scores both metrics on exactly the same splits
# (two separate cross_val_score calls on an unseeded splitter used different
# random partitions for accuracy and precision, and fitted everything twice).
cv_results = cross_validate(model_rf, X_new, y_new.values.ravel(),
                            scoring=['accuracy', 'precision'], cv=rskf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print("Random Forest")
print(f'n_components: {n_components}, accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print(f'n_components: {n_components}, precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

### Support Vector Machine

In [None]:
# Pipeline: PCA -> linear-kernel SVC on the row-normalised features.
# The printed component count is taken from this variable; the messages used
# to say 2 while the PCA step actually kept 5 components.
n_components = 5
# Seeded so the fold assignment is repeatable between runs
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=42)
steps = [('pca', PCA(n_components=n_components)),
         ('svc', SVC(C = 50, gamma = 'auto', kernel = 'linear', probability = True))]
model_svc = Pipeline(steps = steps)

# One cross_validate call scores both metrics on exactly the same splits
# (two separate cross_val_score calls on an unseeded splitter used different
# random partitions for accuracy and precision, and fitted everything twice).
cv_results = cross_validate(model_svc, X_new_norm, y_new.values.ravel(),
                            scoring=['accuracy', 'precision'], cv=rskf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print("SVC")
print(f'n_components: {n_components}, accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print(f'n_components: {n_components}, precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

### Ridge Classifier

In [None]:
# Pipeline: PCA -> ridge classifier (alpha = 0.1) on the row-normalised features.
# The printed component count is taken from this variable; the messages used
# to say 2 while the PCA step actually kept 5 components.
n_components = 5
# Seeded so the fold assignment is repeatable between runs
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=42)
steps = [('pca', PCA(n_components=n_components)),
         ('m', lm.RidgeClassifier(alpha = 0.1))]
model_rdg = Pipeline(steps = steps)

# One cross_validate call scores both metrics on exactly the same splits
# (two separate cross_val_score calls on an unseeded splitter used different
# random partitions for accuracy and precision, and fitted everything twice).
cv_results = cross_validate(model_rdg, X_new_norm, y_new.values.ravel(),
                            scoring=['accuracy', 'precision'], cv=rskf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print("Ridge Classifier")
print(f'n_components: {n_components}, accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print(f'n_components: {n_components}, precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost (400 estimators, learning rate 1) on the raw features; no PCA step,
# so the result lines no longer report an "n_components" value.
# Seeded so the fold assignment is repeatable between runs
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=42)
# NOTE(review): algorithm="SAMME.R" is no longer passed explicitly. It was the
# library default in the scikit-learn releases that supported it, and newer
# releases reject the value - confirm against the installed version.
model_ada = AdaBoostClassifier(n_estimators=400, learning_rate = 1)

# One cross_validate call scores both metrics on exactly the same splits
# (two separate cross_val_score calls on an unseeded splitter used different
# random partitions for accuracy and precision, and fitted everything twice).
cv_results = cross_validate(model_ada, X_new, y_new.values.ravel(),
                            scoring=['accuracy', 'precision'], cv=rskf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print("AdaBoost")
print('accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

### XGBoost

In [None]:
from numpy import loadtxt
from xgboost import XGBClassifier
warnings.filterwarnings("ignore")

# XGBoost on the raw features; no PCA step, so the result lines no longer
# report an "n_components" value.
# Seeded so the fold assignment is repeatable between runs
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=42)
model_xg = XGBClassifier(eta = 0.1, max_depth = 6, scale_pos_weight = 1, eval_metric = "error", use_label_encoder = False)

# One cross_validate call scores both metrics on exactly the same splits
# (two separate cross_val_score calls on an unseeded splitter used different
# random partitions for accuracy and precision, and fitted everything twice).
cv_results = cross_validate(model_xg, X_new, y_new.values.ravel(),
                            scoring=['accuracy', 'precision'], cv=rskf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print("XGBoost")
print('accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

In [None]:
# Re-print the XGBoost scores held in acc_scores / prec_scores.
# The XGBoost model has no PCA step, so no "n_components" value is reported.
print("XGBoost")
print('accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

### CATBoost

In [None]:
from catboost import CatBoostClassifier
warnings.filterwarnings("ignore")

# Pipeline: PCA (12 components) -> CatBoost (250 iterations) on the raw features.
# Seeded so the fold assignment is repeatable between runs
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=42)
steps = [('pca', PCA(n_components=12)), ('cat', CatBoostClassifier(iterations = 250))]
model_cat = Pipeline(steps = steps)

# One cross_validate call scores both metrics on exactly the same splits
# (two separate cross_val_score calls on an unseeded splitter used different
# random partitions for accuracy and precision, and fitted everything twice).
cv_results = cross_validate(model_cat, X_new, y_new.values.ravel(),
                            scoring=['accuracy', 'precision'], cv=rskf, n_jobs=-1)
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision']
print('accuracy: ', np.mean(acc_scores))
print('std for accuracy: ', np.std(acc_scores))
print('precision: ', np.mean(prec_scores))
print('std for precision: ', np.std(prec_scores))

In [None]:
# Re-print the scores currently held in acc_scores / prec_scores under a CATBoost header
print('CATBoost:')
report_lines = [
    ('accuracy: ', np.mean(acc_scores)),
    ('std for accuracy: ', np.std(acc_scores)),
    ('precision: ', np.mean(prec_scores)),
    ('std for precision: ', np.std(prec_scores)),
]
for label, value in report_lines:
    print(label, value)