In [None]:
import pandas as pd
import numpy as np
import math as m
import random as rand
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime
from sklearn import linear_model as lm, metrics, ensemble as ens
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.svm import SVC
from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
import random
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")


# Clinical Data updated_ Alex

In [None]:
# Helper used below to normalise the clinical table's column names.
def lower_no_space(word):
    """Return a lower-case, identifier-friendly version of a column label.

    Spaces and forward slashes become underscores; apostrophes, parentheses
    and question marks are removed; everything else is kept and lower-cased.

    Parameters
    ----------
    word : str
        The original column label.

    Returns
    -------
    str
        The normalised label.
    """
    # Raw-string character classes: the previous non-raw '\?' pattern was an
    # invalid escape sequence, which newer Python versions warn about.
    word = re.sub(r"[ /]", "_", word)
    word = re.sub(r"['()?]", "", word)
    return word.lower()

In [None]:
# Load the updated clinical export (a .csv produced from Google Sheets).
# header=1: the second line of the file supplies the column names.
clinical_csv_path = "Homebase_new_updated.csv"
df_clin_updated = pd.read_csv(clinical_csv_path, header=1)

In [None]:
# Normalise every column label with lower_no_space, then give the sample
# identifier column the name used for the later join ('sample_id').
df_clin_updated = (
    df_clin_updated
    .rename(columns=lower_no_space)
    .rename(columns={'subject_sample_id': 'sample_id'})
)

In [None]:
# Sanity check: display the first rows of the clinical table after renaming.
df_clin_updated.head()

In [None]:
# Compute the age (in completed years) at initial diagnosis from
# date_of_birth and date_of_initial_diagnosis.
df_clin_updated['date_of_birth'] = pd.to_datetime(df_clin_updated['date_of_birth'])
df_clin_updated['date_of_initial_diagnosis'] = pd.to_datetime(df_clin_updated['date_of_initial_diagnosis'])

birth = df_clin_updated['date_of_birth'].dt
diagnosis = df_clin_updated['date_of_initial_diagnosis'].dt

# Subtracting calendar years alone overstates the age by one whenever the
# diagnosis date falls before that year's birthday, so take one year off in
# that case. Missing dates compare as False here and the year difference is
# already NaN, so rows with a missing date stay NaN.
birthday_not_reached = (diagnosis.month < birth.month) | (
    (diagnosis.month == birth.month) & (diagnosis.day < birth.day)
)
df_clin_updated["age_at_initial_diagnosis"] = (
    diagnosis.year - birth.year - birthday_not_reached.astype(int)
)

In [None]:
# The dates of birth in the Stanford records are abnormal, so the derived age
# is blanked (NaN) for every Stanford row and for any row with a negative age.
age_at_diagnosis = df_clin_updated["age_at_initial_diagnosis"]
is_stanford = df_clin_updated['data_access_group'] == 'Stanford'
is_negative_age = age_at_diagnosis < 0
# Cast to float first so the column is float64 even when nothing is masked.
df_clin_updated["age_at_initial_diagnosis"] = age_at_diagnosis.astype(float).mask(
    is_stanford | is_negative_age
)


In [None]:
# Cast the 't' and 'b' columns to object dtype.
# NOTE(review): only 't' and 'b' are cast here; 'n' and 'm' are left as read.
# Presumably the cast makes them behave as categories downstream. Confirm
# whether 'n' and 'm' need the same treatment.
object_dtype_columns = {column: 'object' for column in ('t', 'b')}
df_clin_updated = df_clin_updated.astype(object_dtype_columns)


In [None]:
# The clinical table is very wide; keep only the columns listed here.
# Labels in the list that are absent from the table are simply ignored.
clinical_columns_to_keep = [
    'gender', 'race',
    'country_of_residence', 'sample_id', 'ethnicity',
    'age_at_initial_diagnosis',
    'lymph_node_biopsy_performed', 'predominant_lesion_type_at_diagnosis',
    'family_history_of_leukemia_lymphoma', 't', 'n', 'm', 'b',
    'has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical',
    'cd4+:cd8+_ratio', 'total_lymphocyte_count', 'absolute_cd4+_count_per_ul',
    '%cd4+cd26-', '%cd4+cd7-', 'tcr_clonality', 'tumor_cell_cd30+',
    'large_cell_transformation', 'ldh_u_l', 'wbc_103_μl', 'rbc_106_μl',
    'hematocrit_%', 'mcv_fl', 'mchc_g_dl', 'rdw_%', 'platelet_count_103_μl',
    'segmented_neutrophil,_absolute_103_μl', 'lymphocyte,_absolute_103_μl',
    'monocytes,_absolute_103_μl', 'eosinophils,_absolute_103_μl',
    'basophils,_absolute_103_μl', 'segmented_neutrophils_%', 'lymphocytes_%',
    'monocytes_%', 'eosinophils_%', 'basophils_%',
]
is_kept_column = df_clin_updated.columns.isin(clinical_columns_to_keep)
# .copy() keeps the lean table independent of df_clin_updated.
df_clin_updated_lean = df_clin_updated.loc[:, is_kept_column].copy()

# 'predominant_lesion_type_at_diagnosis', 
# 't', 'n', 'm', 'b', 
# 'eosinophils,_absolute_103_μl'


#wbc_103mul
#rbc_106ml
#nuetrophil_number

In [None]:
# Encode Yes/No and Positive/Negative answers as 1/0 indicator columns.
# Each column maps to the answer that counts as 1; every other value,
# including a missing one, becomes 0.
# NOTE(review): because the originals are overwritten, running this cell a
# second time would turn every value into 0.
positive_answer_by_column = {
    'lymph_node_biopsy_performed': 'Yes',
    'family_history_of_leukemia_lymphoma': 'Yes',
    'tumor_cell_cd30+': 'Yes',
    'large_cell_transformation': 'Yes',
    'tcr_clonality': 'Positive',
    'has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical': 'Yes',
}
for flag_column, positive_answer in positive_answer_by_column.items():
    df_clin_updated_lean[flag_column] = (
        df_clin_updated_lean[flag_column].eq(positive_answer).astype('int64')
    )

### df_lean: Preprocessed Genetic Data

In [None]:
# Load the preprocessed genetic data (one CSV of per-sample statistics).
genetic_csv_path = 'stats_by_sample.csv'
df_lean = pd.read_csv(genetic_csv_path)

In [None]:
# Sanity check: display the first rows of the genetic table that was just
# loaded. This cell previously re-displayed the clinical table.
df_lean.head()

In [None]:
# Transform the genetic sample ids so they can be joined to the clinical data.
# NOTE(review): this overwrites sample_id in place, so running the cell twice
# would trim the already-trimmed ids again.
def _to_clinical_sample_id(raw_id):
    """Rewrite one genetic sample id; the first matching marker decides how."""
    if 'WES' in raw_id:
        # Keep the first five characters, with underscores turned into dashes.
        return re.sub('_', '-', raw_id[:5])
    if 'CTCL' in raw_id:
        return raw_id[:-10]
    if 'almeida' in raw_id:
        return raw_id[:-13]
    if 'ungewickell' in raw_id:
        # Move the last two characters to the front, then drop the final 15.
        rotated = raw_id[-2:] + raw_id[:-2]
        return rotated[:-15]
    if 'SPZ' in raw_id:
        # Drop the final 10 characters, then strip leading zeros from each
        # dash-separated part and lower-case it.
        parts = raw_id[:-10].split('-')
        return '-'.join([part.lstrip('0').lower() for part in parts])
    return raw_id


df_lean['sample_id'] = df_lean['sample_id'].apply(_to_clinical_sample_id)

# Merge (Updated)

In [None]:
# Merge the updated clinical columns onto the genetic data by sample_id.
# A left join keeps every genetic row; rows without a clinical match get NaN
# in the clinical columns.
df_all_updated = df_lean.merge(df_clin_updated_lean, how='left', on='sample_id')

In [None]:
# Imputation for the clinical columns: categorical columns get the literal
# 'unknown', every other clinical column (except the id) gets its column mean.
# NOTE(review): the means are taken over all rows, before any train/test
# split. Confirm that is acceptable for the evaluation below.
categorical_clinical_columns = ('race', 'gender', 'country_of_residence', 'ethnicity',
                                'predominant_lesion_type_at_diagnosis', 't', 'n', 'm', 'b')
for col in df_clin_updated_lean.columns:
    if col in categorical_clinical_columns:
        fill_value = 'unknown'
    elif col == 'sample_id':
        continue
    else:
        fill_value = df_all_updated[col].mean()
    df_all_updated[col] = df_all_updated[col].fillna(fill_value)

In [None]:
# One-hot encode the categorical clinical columns (each is replaced by one
# indicator column per distinct value).
one_hot_columns = [
    'race', 'gender', 'country_of_residence', 'ethnicity',
    'predominant_lesion_type_at_diagnosis', 't', 'n', 'm', 'b',
]
df_all_updated = pd.get_dummies(df_all_updated, columns=one_hot_columns)


In [None]:
# List the distinct sample ids present in the merged table.
df_all_updated['sample_id'].unique()

In [None]:
# (rows, columns) of the genetic table, for comparison with the merged data.
df_lean.shape

# Defining Features and Labels - For updated data

## Yale

In [None]:
# Hold-out split for this section: rows whose sample_id contains "CTCL" form
# the test set, all remaining rows form the training set.
has_ctcl_id = df_all_updated['sample_id'].str.contains("CTCL")
X_train_y = df_all_updated[~has_ctcl_id]
X_test_y = df_all_updated[has_ctcl_id]

In [None]:
# Define the StandardScaler used in the sections below.
# NOTE: this single instance is re-fitted (fit_transform) in every site
# section, so at any point it holds the statistics of the most recent fit;
# the matching transform calls therefore rely on the cells running in order.
std_scl = StandardScaler()

In [None]:
# Training features and labels for this split. Features are every column
# except 'outcome' and 'sample_id'; a standardised and a row-normalised
# variant are also computed.
train_feature_mask_y = ~X_train_y.columns.isin(['outcome', 'sample_id'])
X_new_y = X_train_y.loc[:, train_feature_mask_y]
X_new_scaled_y = std_scl.fit_transform(X_new_y)
X_new_norm_y = normalize(X_new_y)

# Labels are kept as a one-column DataFrame holding 'outcome'.
y_new_y = X_train_y.loc[:, X_train_y.columns == 'outcome']

In [None]:
# Test features and labels for this split, separated the same way as the
# training rows. std_scl is only applied here (transform), so it uses
# whatever it was last fitted on.
test_feature_mask_y = ~X_test_y.columns.isin(['outcome', 'sample_id'])
X_test_yale = X_test_y.loc[:, test_feature_mask_y]
X_test_scaled_y = std_scl.transform(X_test_yale)
X_test_norm_y = normalize(X_test_yale)

y_test_yale = X_test_y.loc[:, X_test_y.columns == 'outcome']

In [None]:
# Relative frequency of each outcome value in this split's training rows.
X_train_y['outcome'].value_counts(normalize = True)

In [None]:
# Relative frequency of each outcome value in this split's test rows.
X_test_y['outcome'].value_counts(normalize = True)

## IMM - Barcelona

In [None]:
# Hold-out split for this section: rows whose sample_id contains "prasad"
# form the test set, all remaining rows form the training set.
has_prasad_id = df_all_updated['sample_id'].str.contains("prasad")
X_train_b = df_all_updated[~has_prasad_id]
X_test_b = df_all_updated[has_prasad_id]

# Features are every column except 'outcome' and 'sample_id'; labels are a
# one-column DataFrame holding 'outcome'.
feature_mask_b = ~df_all_updated.columns.isin(['outcome', 'sample_id'])
label_mask_b = df_all_updated.columns == 'outcome'

# Training rows: the scaler is fitted here.
X_new_b = X_train_b.loc[:, feature_mask_b]
X_new_scaled_b = std_scl.fit_transform(X_new_b)
X_new_norm_b = normalize(X_new_b)
y_new_b = X_train_b.loc[:, label_mask_b]

# Test rows: the scaler fitted just above is only applied.
X_test_bar = X_test_b.loc[:, feature_mask_b]
X_test_scaled_bar = std_scl.transform(X_test_bar)
X_test_norm_bar = normalize(X_test_bar)
y_test_bar = X_test_b.loc[:, label_mask_b]

In [None]:
# Relative frequency of each outcome value in this split's training rows.
X_train_b['outcome'].value_counts(normalize = True)

In [None]:
# Relative frequency of each outcome value in this split's test rows.
X_test_b['outcome'].value_counts(normalize = True)

## Kings College

In [None]:
# Hold-out split for this section: rows whose sample_id contains "WES" form
# the test set, all remaining rows form the training set.
has_wes_id = df_all_updated['sample_id'].str.contains("WES")
X_train_k = df_all_updated[~has_wes_id]
X_test_k = df_all_updated[has_wes_id]

# Features are every column except 'outcome' and 'sample_id'; labels are a
# one-column DataFrame holding 'outcome'.
feature_mask_k = ~df_all_updated.columns.isin(['outcome', 'sample_id'])
label_mask_k = df_all_updated.columns == 'outcome'

# Training rows: the scaler is fitted here.
X_new_k = X_train_k.loc[:, feature_mask_k]
X_new_scaled_k = std_scl.fit_transform(X_new_k)
X_new_norm_k = normalize(X_new_k)
y_new_k = X_train_k.loc[:, label_mask_k]

# Test rows: the scaler fitted just above is only applied.
X_test_kings = X_test_k.loc[:, feature_mask_k]
X_test_scaled_kings = std_scl.transform(X_test_kings)
X_test_norm_kings = normalize(X_test_kings)
y_test_kings = X_test_k.loc[:, label_mask_k]

In [None]:
# Relative frequency of each outcome value in this split's training rows.
X_train_k['outcome'].value_counts(normalize = True)

In [None]:
# Relative frequency of each outcome value in this split's test rows.
X_test_k['outcome'].value_counts(normalize = True)

In [None]:
# Number of held-out rows per site, printed in the order: Kings, Barcelona, Yale.
for held_out_labels in (y_test_kings, y_test_bar, y_test_yale):
    print(len(held_out_labels))

# Evaluate

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

### AdaBoost

#### Yale

In [None]:
# AdaBoost with n_estimators = 400, fitted on X_new_y and evaluated on X_test_yale.
from sklearn.ensemble import AdaBoostClassifier

# random_state is fixed so a re-run of the notebook reproduces the same model.
# NOTE(review): algorithm="SAMME.R" is deprecated or removed in newer
# scikit-learn releases. Confirm against the installed version.
model_ada = AdaBoostClassifier(n_estimators=400, learning_rate = 1, algorithm = "SAMME.R",
                               random_state=42)

# The labels are one-column DataFrames; scikit-learn expects 1-D targets, so
# flatten them instead of relying on an implicit (warned-about) conversion.
y_train_1d = np.ravel(y_new_y)
y_true_1d = np.ravel(y_test_yale)

model_ada.fit(X_new_y, y_train_1d)
y_pred = model_ada.predict(X_test_yale)

ada_acc_scores = accuracy_score(y_true_1d, y_pred)
ada_prec_scores = precision_score(y_true_1d, y_pred)

print("AdaBoost without PCA")
print('Accuracy: ', ada_acc_scores)
print('Precision: ', ada_prec_scores)


In [None]:
# Predicted labels from the most recently fitted model, for visual comparison.
y_pred

In [None]:
# True test labels flattened to 1-D, to compare against y_pred above.
np.ravel(y_test_yale)

In [None]:
# Number of rows in this split's test labels.
print(len(y_test_yale))

#### Barcelona

In [None]:
# AdaBoost with n_estimators = 400, fitted on X_new_b and evaluated on X_test_bar.
from sklearn.ensemble import AdaBoostClassifier

# random_state is fixed so a re-run of the notebook reproduces the same model.
# NOTE(review): algorithm="SAMME.R" is deprecated or removed in newer
# scikit-learn releases. Confirm against the installed version.
model_ada = AdaBoostClassifier(n_estimators=400, learning_rate = 1, algorithm = "SAMME.R",
                               random_state=42)

# The labels are one-column DataFrames; scikit-learn expects 1-D targets, so
# flatten them instead of relying on an implicit (warned-about) conversion.
y_train_1d = np.ravel(y_new_b)
y_true_1d = np.ravel(y_test_bar)

model_ada.fit(X_new_b, y_train_1d)
y_pred = model_ada.predict(X_test_bar)

ada_acc_scores = accuracy_score(y_true_1d, y_pred)
ada_prec_scores = precision_score(y_true_1d, y_pred)

print("AdaBoost without PCA")
print('Accuracy: ', ada_acc_scores)
print('Precision: ', ada_prec_scores)

#### Kings College

In [None]:
# AdaBoost with n_estimators = 400, fitted on X_new_k and evaluated on X_test_kings.
from sklearn.ensemble import AdaBoostClassifier

# random_state is fixed so a re-run of the notebook reproduces the same model.
# NOTE(review): algorithm="SAMME.R" is deprecated or removed in newer
# scikit-learn releases. Confirm against the installed version.
model_ada = AdaBoostClassifier(n_estimators=400, learning_rate = 1, algorithm = "SAMME.R",
                               random_state=42)

# The labels are one-column DataFrames; scikit-learn expects 1-D targets, so
# flatten them instead of relying on an implicit (warned-about) conversion.
y_train_1d = np.ravel(y_new_k)
y_true_1d = np.ravel(y_test_kings)

model_ada.fit(X_new_k, y_train_1d)
y_pred = model_ada.predict(X_test_kings)

ada_acc_scores = accuracy_score(y_true_1d, y_pred)
ada_prec_scores = precision_score(y_true_1d, y_pred)

print("AdaBoost without PCA")
print('Accuracy: ', ada_acc_scores)
print('Precision: ', ada_prec_scores)

### Random Forest

#### Yale

In [None]:
# PCA (12 components) followed by a 500-tree random forest, fitted on X_new_y
# and evaluated on X_test_yale.
# random_state is fixed on both steps: the forest is stochastic, so without a
# seed the reported scores change from run to run.
# NOTE(review): PCA is fitted on the unscaled feature matrix; the standardised
# variant X_new_scaled_y is computed earlier but not used here. Confirm this
# is intended.
steps = [('pca', PCA(n_components=12, random_state=42)),
         ('m', ens.RandomForestClassifier(n_estimators = 500, criterion = "entropy",
                                          random_state=42))]
model_rf = Pipeline(steps = steps)

# The labels are one-column DataFrames; scikit-learn expects 1-D targets.
y_train_1d = np.ravel(y_new_y)
y_true_1d = np.ravel(y_test_yale)

model_rf.fit(X_new_y, y_train_1d)
y_pred = model_rf.predict(X_test_yale)

rf_acc_scores = accuracy_score(y_true_1d, y_pred)
rf_prec_scores = precision_score(y_true_1d, y_pred)

print("Random Forest")
print('n_components: 12, accuracy: ', rf_acc_scores)
print('n_components: 12, precision: ', rf_prec_scores)

In [None]:
# Predicted labels from the most recently fitted model, for visual comparison.
y_pred

In [None]:
# True test labels flattened to 1-D, to compare against y_pred above.
np.ravel(y_test_yale)

#### Barcelona

In [None]:
# RF
# PCA (12 components) followed by a 500-tree random forest, fitted on X_new_b
# and evaluated on X_test_bar.
# random_state is fixed on both steps: the forest is stochastic, so without a
# seed the reported scores change from run to run.
# NOTE(review): PCA is fitted on the unscaled feature matrix; the standardised
# variant X_new_scaled_b is computed earlier but not used here. Confirm this
# is intended.
steps = [('pca', PCA(n_components=12, random_state=42)),
         ('m', ens.RandomForestClassifier(n_estimators = 500, criterion = "entropy",
                                          random_state=42))]
model_rf = Pipeline(steps = steps)

# The labels are one-column DataFrames; scikit-learn expects 1-D targets.
y_train_1d = np.ravel(y_new_b)
y_true_1d = np.ravel(y_test_bar)

model_rf.fit(X_new_b, y_train_1d)
y_pred = model_rf.predict(X_test_bar)

rf_acc_scores = accuracy_score(y_true_1d, y_pred)
rf_prec_scores = precision_score(y_true_1d, y_pred)

print("Random Forest")
print('n_components: 12, accuracy: ', rf_acc_scores)
print('n_components: 12, precision: ', rf_prec_scores)

#### Kings College

In [None]:
# RF
# PCA (12 components) followed by a 500-tree random forest, fitted on X_new_k
# and evaluated on X_test_kings.
# random_state is fixed on both steps: the forest is stochastic, so without a
# seed the reported scores change from run to run.
# NOTE(review): PCA is fitted on the unscaled feature matrix; the standardised
# variant X_new_scaled_k is computed earlier but not used here. Confirm this
# is intended.
steps = [('pca', PCA(n_components=12, random_state=42)),
         ('m', ens.RandomForestClassifier(n_estimators = 500, criterion = "entropy",
                                          random_state=42))]
model_rf = Pipeline(steps = steps)

# The labels are one-column DataFrames; scikit-learn expects 1-D targets.
y_train_1d = np.ravel(y_new_k)
y_true_1d = np.ravel(y_test_kings)

model_rf.fit(X_new_k, y_train_1d)
y_pred = model_rf.predict(X_test_kings)

rf_acc_scores = accuracy_score(y_true_1d, y_pred)
rf_prec_scores = precision_score(y_true_1d, y_pred)

print("Random Forest")
print('n_components: 12, accuracy: ', rf_acc_scores)
print('n_components: 12, precision: ', rf_prec_scores)

### XGBoost

#### Yale

In [None]:
# XGBoost classifier fitted on X_new_y and evaluated on X_test_yale (no PCA).
from numpy import loadtxt
from xgboost import XGBClassifier
warnings.filterwarnings("ignore")

# Hyper-parameters gathered in one place so they are easy to read and tweak.
xgb_settings = dict(
    eta=0.1,
    max_depth=4,
    scale_pos_weight=0.25,
    eval_metric="error",
    use_label_encoder=False,
)
model_xg = XGBClassifier(**xgb_settings)

model_xg.fit(X_new_y, y_new_y)
y_pred = model_xg.predict(X_test_yale)

# Score the predictions against the held-out labels.
xg_acc_scores, xg_prec_scores = (
    metric(y_test_yale, y_pred) for metric in (accuracy_score, precision_score)
)

print("XGBoost without PCA")
print('Accuracy: ', xg_acc_scores)
print('Precision: ', xg_prec_scores)

In [None]:
# Predicted labels from the most recently fitted model, for visual comparison.
y_pred

In [None]:
# True test labels flattened to 1-D, to compare against y_pred above.
np.ravel(y_test_yale)

#### Barcelona

In [None]:
# XGBoost classifier fitted on X_new_b and evaluated on X_test_bar (no PCA).
warnings.filterwarnings("ignore")

# Hyper-parameters gathered in one place so they are easy to read and tweak.
xgb_settings = dict(
    eta=0.1,
    max_depth=4,
    scale_pos_weight=0.25,
    eval_metric="error",
    use_label_encoder=False,
)
model_xg = XGBClassifier(**xgb_settings)

model_xg.fit(X_new_b, y_new_b)
y_pred = model_xg.predict(X_test_bar)

# Score the predictions against the held-out labels.
xg_acc_scores, xg_prec_scores = (
    metric(y_test_bar, y_pred) for metric in (accuracy_score, precision_score)
)

print("XGBoost without PCA")
print('Accuracy: ', xg_acc_scores)
print('Precision: ', xg_prec_scores)

#### Kings College

In [None]:
# XGBoost classifier fitted on X_new_k and evaluated on X_test_kings (no PCA).
warnings.filterwarnings("ignore")

# Hyper-parameters gathered in one place so they are easy to read and tweak.
xgb_settings = dict(
    eta=0.1,
    max_depth=4,
    scale_pos_weight=0.25,
    eval_metric="error",
    use_label_encoder=False,
)
model_xg = XGBClassifier(**xgb_settings)

model_xg.fit(X_new_k, y_new_k)
y_pred = model_xg.predict(X_test_kings)

# Score the predictions against the held-out labels.
xg_acc_scores, xg_prec_scores = (
    metric(y_test_kings, y_pred) for metric in (accuracy_score, precision_score)
)

print("XGBoost without PCA")
print('Accuracy: ', xg_acc_scores)
print('Precision: ', xg_prec_scores)