In [None]:
import pandas as pd
import numpy as np
import math as m
import random as rand
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime
from sklearn import linear_model as lm, metrics, ensemble as ens
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.svm import SVC
from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
import random

# Preprocessing - from George's code

In [None]:
#DEFINING A FUNCTION TO UPDATE COLUMN NAMES LATER
def lower_no_space(word):
    """Normalise a column label into a lowercase, underscore-separated name.

    Spaces and forward slashes become underscores; apostrophes, parentheses
    and question marks are removed; the result is lowercased.

    Parameters
    ----------
    word : str
        Original column label.

    Returns
    -------
    str
        The normalised label.
    """
    # Raw-string character classes: the previous non-raw '\?' pattern was an
    # invalid escape sequence, which newer Python versions warn about.
    word = re.sub(r"[ /]", "_", word)
    word = re.sub(r"['()?]", "", word)
    return word.lower()

In [None]:
# Load stats_by_sample.csv; this frame is the left side of the merge with the clinical data further down.
df_lean = pd.read_csv ('stats_by_sample.csv')

In [None]:
# Show the loaded frame as the cell output for a visual check.
df_lean

In [None]:
# Load the clinical table; header = 1 takes the column names from row index 1 (0-based),
# so the row before it is not used as data.
df_clin = pd.read_csv("Homebase.csv", header = 1)

In [None]:
#RENAMING COLUMNS
# First normalise every label with lower_no_space, then give the join key the
# same name ('sample_id') that the other table uses.
df_clin = (
    df_clin
    .rename(mapper=lower_no_space, axis=1)
    .rename(columns={'subject_sample_id': 'sample_id'})
)

In [None]:
# Show the clinical frame after the column renaming as the cell output.
df_clin

In [None]:
#TRANSFORM SAMPLE ID TO JOIN TO CLINICAL DATA
def _to_clinical_sample_id(raw_id):
    """Rewrite one sample id according to the marker substring it contains.

    - contains 'WES':  keep the first 5 characters and turn '_' into '-'
    - contains 'CTCL': drop the last 10 characters
    - contains 'SPZ':  drop the last 10 characters, then for each '-'-separated
      piece strip leading zeros and lowercase it
    - otherwise:       return the id unchanged
    """
    if 'WES' in raw_id:
        return re.sub('_', '-', raw_id[:5])
    if 'CTCL' in raw_id:
        return raw_id[:-10]
    if 'SPZ' in raw_id:
        pieces = raw_id[:-10].split('-')
        return '-'.join([piece.lstrip('0').lower() for piece in pieces])
    return raw_id


df_lean['sample_id'] = df_lean['sample_id'].apply(_to_clinical_sample_id)



In [None]:
#TONS OF DATA, PULL WHAT WE WANT
# Clinical columns that are retained; every other column of df_clin is dropped.
clinical_keep_cols = [
    'gender', 'race',
    'country_of_residence', 'sample_id', 'ethnicity',
    'age_at_initial_diagnosis', 't', 'n', 'm', 'b',
    'predominant_lesion_type_at_diagnosis', 'lymph_node_biopsy_performed',
    'family_history_of_leukemia_lymphoma',
    'has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical',
    'cd4+:cd8+_ratio', 'total_lymphocyte_count', 'absolute_cd4+_count_per_ul',
    '%cd4+cd26-', '%cd4+cd7-', 'tcr_clonality', 'tumor_cell_cd30+',
    'large_cell_transformation', 'ldh_u_l', 'wbc_103_μl', 'rbc_106_μl',
    'hematocrit_%', 'mcv_fl', 'mchc_g_dl', 'rdw_%', 'platelet_count_103_μl',
    'segmented_neutrophil,_absolute_103_μl', 'lymphocyte,_absolute_103_μl',
    'monocytes,_absolute_103_μl', 'eosinophils,_absolute_103_μl',
    'basophils,_absolute_103_μl', 'segmented_neutrophils_%', 'lymphocytes_%',
    'monocytes_%', 'eosinophils_%', 'basophils_%',
]
clinical_drop_cols = [name for name in df_clin.columns if name not in clinical_keep_cols]
df_clin_lean = df_clin.drop(columns=clinical_drop_cols)
                                       
                                       

In [None]:
# TURN YES/NO & POSITIVE/NEGATIVE TO DUMMIES
# Column -> the answer encoded as 1. Any other value (missing entries included,
# since they never compare equal to the answer) is encoded as 0.
positive_answer_by_col = {
    'lymph_node_biopsy_performed': 'Yes',
    'family_history_of_leukemia_lymphoma': 'Yes',
    'tumor_cell_cd30+': 'Yes',
    'large_cell_transformation': 'Yes',
    'tcr_clonality': 'Positive',
    'has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical': 'Yes',
}

for flag_col, positive_answer in positive_answer_by_col.items():
    df_clin_lean[flag_col] = df_clin_lean[flag_col].apply(
        lambda answer: 1 if answer == positive_answer else 0
    )

In [None]:
#MERGE CLINICAL, GENETIC DATA
# Left join: every row of df_lean is kept; clinical columns are NaN where no
# clinical row shares the sample_id.
df_all = df_lean.merge(df_clin_lean, how='left', on='sample_id')

In [None]:
# Show the merged frame as the cell output.
df_all

In [None]:
#IMPUTATION; "UNKNOWN" FOR CATEGORICAL, MEAN FILL-IN FOR CONTINUOUS
categorical_cols = ['race', 'gender', 'country_of_residence', 'ethnicity', 'predominant_lesion_type_at_diagnosis']

# Only the columns that came from the clinical table are imputed.
for column in df_clin_lean.columns:
    if column in categorical_cols:
        fill_value = 'unknown'
    elif column == 'sample_id':
        continue  # the join key is left untouched
    else:
        fill_value = np.mean(df_all[column])
    df_all[column] = df_all[column].fillna(fill_value)

In [None]:
#GET DUMMIES FOR CATEGORICALS
# Each listed column is replaced by one indicator column per distinct value.
dummy_source_cols = [
    'race',
    'gender',
    'country_of_residence',
    'ethnicity',
    'predominant_lesion_type_at_diagnosis',
]
df_all = pd.get_dummies(df_all, columns=dummy_source_cols)

#DEFINE STANDARDSCALER FOR LATER USE
std_scl = StandardScaler()

## Logistic Regression - From George

In [None]:
# Target: a one-column frame holding only 'outcome'.
y = df_all.loc[:, df_all.columns == 'outcome']

# Features: everything except the target and the sample identifier.
X = df_all.drop(columns=['outcome', 'sample_id'], errors='ignore')

# Two alternative rescalings of the same feature matrix:
X_scaled = std_scl.fit_transform(X)  # StandardScaler fitted on all rows of X
X_norm = normalize(X)                # sklearn.preprocessing.normalize with its defaults

In [None]:
# Class balance: share of rows for each distinct outcome value.
y.value_counts(normalize=True)

In [None]:
# 10X ITERATED 3-FOLD CROSS-VALIDATED ACCURACY AND PRECISION FOR MOST ROBUST EVAL W/SMALL SAMPLE
# random_state is fixed so that every use of rkf yields the same 30 train/test
# partitions. Without it the accuracy and precision calls below would each draw
# their own random splits, and no run could be reproduced.
rkf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=42)

# NOTE(review): X_scaled was standardised on the full data set before
# cross-validation, so each test fold contributed to the scaling statistics.
# Consider scaling inside the CV loop (e.g. a Pipeline) - confirm intent.
log = lm.LogisticRegression()
acc_scores = cross_val_score(log, X_scaled, y.values.ravel(), scoring='accuracy', cv=rkf, n_jobs=-1)
prec_scores = cross_val_score(log, X_scaled, y.values.ravel(), scoring='precision', cv=rkf, n_jobs=-1)
print('accuracy: ', np.mean(acc_scores))
print('precision: ', np.mean(prec_scores))

## Random Forest

In [None]:
# NOW RANDOM FOREST
# The forest is seeded (as the AdaBoost and CatBoost models in this notebook are)
# so that its bootstrap samples and feature draws, and therefore the reported
# scores, do not change from one run to the next.
rf = ens.RandomForestClassifier(random_state=0)
acc_scores = cross_val_score(rf, X, y.values.ravel(), scoring='accuracy', cv=rkf, n_jobs=-1)
prec_scores = cross_val_score(rf, X, y.values.ravel(), scoring='precision', cv=rkf, n_jobs=-1)
print('accuracy: ', np.mean(acc_scores))
print('precision: ', np.mean(prec_scores))

## Ridge Regression

In [None]:
#RIDGE
# Sweep the regularisation strength alpha over 1, 11, ..., 101 and report the
# mean cross-validated accuracy and precision for each value.
for a in np.arange(1, 102, 10):
    rdg = lm.RidgeClassifier(alpha=a)
    ridge_cv = {
        metric: cross_val_score(rdg, X_norm, y.values.ravel(), scoring=metric, cv=rkf, n_jobs=-1)
        for metric in ('accuracy', 'precision')
    }
    acc_scores = ridge_cv['accuracy']
    prec_scores = ridge_cv['precision']
    print(a, ' ridge accuracy: ', np.mean(acc_scores))
    print(a, ' ridge precision: ', np.mean(prec_scores))
    


## SVM

In [None]:
#Support Vector Machine
# Try each kernel in turn and report mean cross-validated accuracy and precision.
svm_kernels = ('linear', 'poly', 'rbf', 'sigmoid')

for kern in svm_kernels:
    svc = SVC(kernel=kern, probability=True)
    svm_cv = {
        metric: cross_val_score(svc, X_norm, y.values.ravel(), scoring=metric, cv=rkf, n_jobs=-1)
        for metric in ('accuracy', 'precision')
    }
    acc_scores = svm_cv['accuracy']
    prec_scores = svm_cv['precision']
    print(kern, ' accuracy: ', np.mean(acc_scores))
    print(kern, ' precision: ', np.mean(prec_scores))

## XGBoost

In [None]:
#XG Boost
from numpy import loadtxt
from xgboost import XGBClassifier

# Default-parameter XGBoost classifier, scored on the unscaled feature frame X
# with the shared rkf splitter.
model = XGBClassifier()
xgb_cv_args = dict(cv=rkf, n_jobs=-1)
acc_scores = cross_val_score(model, X, y.values.ravel(), scoring='accuracy', **xgb_cv_args)
prec_scores = cross_val_score(model, X, y.values.ravel(), scoring='precision', **xgb_cv_args)
print('accuracy: ', np.mean(acc_scores))
print('precision: ', np.mean(prec_scores))

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 100 estimators and a fixed seed, scored on the unscaled feature
# frame X with the shared rkf splitter.
model = AdaBoostClassifier(n_estimators=100, random_state=0)
ada_cv_args = dict(cv=rkf, n_jobs=-1)
acc_scores = cross_val_score(model, X, y.values.ravel(), scoring='accuracy', **ada_cv_args)
prec_scores = cross_val_score(model, X, y.values.ravel(), scoring='precision', **ada_cv_args)
print('accuracy: ', np.mean(acc_scores))
print('precision: ', np.mean(prec_scores))

## CatBoost

In [None]:
from catboost import CatBoostClassifier

# CatBoost limited to 100 iterations with a fixed seed, scored on the unscaled
# feature frame X with the shared rkf splitter.
model = CatBoostClassifier(iterations=100, verbose=10, random_state=123)
catboost_cv_args = dict(cv=rkf, n_jobs=-1)
acc_scores = cross_val_score(model, X, y.values.ravel(), scoring='accuracy', **catboost_cv_args)
prec_scores = cross_val_score(model, X, y.values.ravel(), scoring='precision', **catboost_cv_args)
print('accuracy: ', np.mean(acc_scores))
print('precision: ', np.mean(prec_scores))