# ML-diagnosis-of-esophageal-cancer
## Supervised Machine Learning Model Selection
Authors: Robert Franklin

Date: 2023-03-13

In [11]:
# Dependencies & Installs
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import numpy as np
import warnings
import joblib

In [12]:
# read CB_df_cleaned.csv file
CB_df = pd.read_csv('../Data_Cleaned/CB_df_cleaned.csv')
# read PN_df_cleaned.csv file
PN_df = pd.read_csv('../Data_Cleaned/PN_df_cleaned.csv')
# read Joined_df_cleaned.csv file
joined_df = pd.read_csv('../Data_Cleaned/Joined_df_cleaned.csv')
joined_df

Unnamed: 0.1,Unnamed: 0,Patient Group,O95445_AAL,P00450_AAL,P00734_AAL,P00736_AAL,P00738_AAL,P00747_AAL,P00748_AAL,P00751_AAL,...,P02748_LSPIYNLVPVK_Ratio_NPL,P04114_SPAFTDLHLR_Ratio_NPL,P06396_AVEVLPK_Ratio_NPL,P0C0L5_GSFEFPVGDAVSK_Ratio_NPL,P10643_LTPLYELVK_Ratio_NPL,Protein (μg/μL),Age at Collection,BMI (kg/m2),Gender_F,Gender_M
0,0,EAC,231901.565681,531675.915634,130229.593177,4.574278e+07,7.427876e+07,1.399114e+06,62766.439036,8.785951e+06,...,3.138630,6.545236,2.455497,1.362097,3.987969,65.497680,63.0,31.46,0,1
1,1,BE-ID,225768.925899,392680.876091,147433.340040,4.629934e+07,3.240003e+07,1.193564e+06,56995.901230,8.329383e+06,...,3.089863,6.585498,4.543369,0.778761,2.623578,68.052035,69.0,41.40,1,0
2,2,NSE,317408.869810,624661.826663,166048.185854,5.879864e+07,3.151861e+07,2.006994e+06,66677.865179,8.918962e+06,...,1.858580,11.655492,4.471990,0.821500,2.668353,72.455779,54.0,30.04,1,0
3,3,BE,196448.722270,383654.430349,81930.599409,4.418331e+07,1.274380e+07,1.121787e+06,47417.392161,6.726070e+06,...,1.179382,6.028578,3.230195,0.731139,1.341246,68.811932,55.0,30.26,0,1
4,4,BE,513410.263339,665023.074428,168785.312885,8.292568e+07,3.314564e+07,2.811941e+06,96794.597616,1.307532e+07,...,2.745901,10.891096,9.083470,1.226545,4.634702,54.474587,68.0,30.40,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,262,BE-HGD,212909.415400,409214.656100,46791.487300,1.044341e+05,1.897595e+07,1.341625e+06,109005.583000,2.811791e+05,...,1.105802,4.535125,1.143341,0.667606,1.862710,84.361363,71.0,30.60,0,1
263,263,BE-LGD,264552.337400,443446.696400,53288.416120,1.013776e+05,1.906240e+07,1.276686e+06,161909.319100,4.063519e+05,...,0.826197,2.938648,1.569241,0.477460,2.483334,61.603283,76.0,27.20,0,1
264,264,BE,311888.143400,779222.794900,59896.271210,1.419347e+05,3.572879e+07,1.880594e+06,167653.314800,6.303756e+05,...,1.085453,3.612338,1.061732,0.564546,1.251242,79.354873,49.0,35.60,0,1
265,265,BE-LGD,262882.689100,317187.906000,38295.881410,1.148620e+05,8.537090e+06,1.429449e+06,134570.142600,3.732240e+05,...,0.618220,7.036068,1.250521,0.403141,1.231198,67.244436,49.0,0.00,0,1


In [13]:
# Drop the Unnamed: 0 column from joined_df using iloc
joined_df = joined_df.iloc[:, 1:]
joined_df

Unnamed: 0,Patient Group,O95445_AAL,P00450_AAL,P00734_AAL,P00736_AAL,P00738_AAL,P00747_AAL,P00748_AAL,P00751_AAL,P01008_AAL,...,P02748_LSPIYNLVPVK_Ratio_NPL,P04114_SPAFTDLHLR_Ratio_NPL,P06396_AVEVLPK_Ratio_NPL,P0C0L5_GSFEFPVGDAVSK_Ratio_NPL,P10643_LTPLYELVK_Ratio_NPL,Protein (μg/μL),Age at Collection,BMI (kg/m2),Gender_F,Gender_M
0,EAC,231901.565681,531675.915634,130229.593177,4.574278e+07,7.427876e+07,1.399114e+06,62766.439036,8.785951e+06,1.865353e+06,...,3.138630,6.545236,2.455497,1.362097,3.987969,65.497680,63.0,31.46,0,1
1,BE-ID,225768.925899,392680.876091,147433.340040,4.629934e+07,3.240003e+07,1.193564e+06,56995.901230,8.329383e+06,2.990847e+06,...,3.089863,6.585498,4.543369,0.778761,2.623578,68.052035,69.0,41.40,1,0
2,NSE,317408.869810,624661.826663,166048.185854,5.879864e+07,3.151861e+07,2.006994e+06,66677.865179,8.918962e+06,3.406724e+06,...,1.858580,11.655492,4.471990,0.821500,2.668353,72.455779,54.0,30.04,1,0
3,BE,196448.722270,383654.430349,81930.599409,4.418331e+07,1.274380e+07,1.121787e+06,47417.392161,6.726070e+06,1.723110e+06,...,1.179382,6.028578,3.230195,0.731139,1.341246,68.811932,55.0,30.26,0,1
4,BE,513410.263339,665023.074428,168785.312885,8.292568e+07,3.314564e+07,2.811941e+06,96794.597616,1.307532e+07,4.179410e+06,...,2.745901,10.891096,9.083470,1.226545,4.634702,54.474587,68.0,30.40,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,BE-HGD,212909.415400,409214.656100,46791.487300,1.044341e+05,1.897595e+07,1.341625e+06,109005.583000,2.811791e+05,1.907612e+05,...,1.105802,4.535125,1.143341,0.667606,1.862710,84.361363,71.0,30.60,0,1
263,BE-LGD,264552.337400,443446.696400,53288.416120,1.013776e+05,1.906240e+07,1.276686e+06,161909.319100,4.063519e+05,2.734662e+05,...,0.826197,2.938648,1.569241,0.477460,2.483334,61.603283,76.0,27.20,0,1
264,BE,311888.143400,779222.794900,59896.271210,1.419347e+05,3.572879e+07,1.880594e+06,167653.314800,6.303756e+05,1.635644e+05,...,1.085453,3.612338,1.061732,0.564546,1.251242,79.354873,49.0,35.60,0,1
265,BE-LGD,262882.689100,317187.906000,38295.881410,1.148620e+05,8.537090e+06,1.429449e+06,134570.142600,3.732240e+05,1.652906e+05,...,0.618220,7.036068,1.250521,0.403141,1.231198,67.244436,49.0,0.00,0,1


In [14]:
# Create a StandardScaler object
scaler = StandardScaler()

# Select only the columns that need to be scaled
columns_to_scale = joined_df.columns[1:]

# Scale the selected columns
scaled_columns = scaler.fit_transform(joined_df[columns_to_scale])

# Create a new dataframe with the scaled columns
scaled_df = pd.concat([joined_df['Patient Group'], pd.DataFrame(scaled_columns, columns=columns_to_scale)], axis=1)

# Create a new column that maps the Group column values to either BE-HGD or EAC
scaled_df['target'] = joined_df['Patient Group'].map({'BE-HGD': 0, 'EAC': 1, 'BE': 0, 'BE-ID': 0, 'BE-LGD': 0, 'NSE': 0})

# Drop the Group column as it is no longer needed
scaled_df.drop('Patient Group', axis=1, inplace=True)

scaled_df

Unnamed: 0,O95445_AAL,P00450_AAL,P00734_AAL,P00736_AAL,P00738_AAL,P00747_AAL,P00748_AAL,P00751_AAL,P01008_AAL,P01009_AAL,...,P04114_SPAFTDLHLR_Ratio_NPL,P06396_AVEVLPK_Ratio_NPL,P0C0L5_GSFEFPVGDAVSK_Ratio_NPL,P10643_LTPLYELVK_Ratio_NPL,Protein (μg/μL),Age at Collection,BMI (kg/m2),Gender_F,Gender_M,target
0,-0.024573,0.079047,2.938970,2.790700,5.116219,0.017924,-1.115308,3.617295,2.253224,0.620840,...,1.879006,1.183214,3.632176,3.142403,-1.074068,0.071812,0.466777,-0.567258,0.567258,1
1,-0.106926,-0.689741,3.664605,2.830143,1.096909,-0.522890,-1.207908,3.395444,4.061588,-0.279418,...,1.908998,3.587443,0.875417,1.131047,-0.787588,0.563451,1.953061,1.762865,-1.762865,0
2,1.123684,0.593356,4.449759,3.715939,1.012314,1.617289,-1.052541,3.681927,4.729788,-1.022580,...,5.685709,3.505248,1.077397,1.197052,-0.293693,-0.665645,0.254451,1.762865,-1.762865,0
3,-0.500660,-0.739666,0.901771,2.680184,-0.789597,-0.711740,-1.361616,2.616377,2.024679,-1.043253,...,1.494141,2.075296,0.650366,-0.759341,-0.702363,-0.583706,0.287347,-0.567258,0.567258,0
4,3.755740,0.816596,4.565208,5.425767,1.168469,3.735150,-0.569255,5.701546,5.971285,-0.167516,...,5.116300,8.815465,2.991576,4.095803,-2.310346,0.481511,0.308280,-0.567258,0.567258,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,-0.279614,-0.598292,-0.580358,-0.443584,-0.191466,-0.133333,-0.373304,-0.515265,-0.437390,-0.140240,...,0.381646,-0.327761,0.350114,0.009391,1.041559,0.727330,0.338185,-0.567258,0.567258,0
263,0.413887,-0.408953,-0.306325,-0.443801,-0.183170,-0.304192,0.475648,-0.454442,-0.304505,0.289013,...,-0.807592,0.162672,-0.548486,0.924302,-1.510838,1.137029,-0.170201,-0.567258,0.567258,0
264,1.049548,1.448240,-0.027613,-0.440927,1.416387,1.284723,0.567823,-0.345587,-0.481088,-0.291257,...,-0.305751,-0.421736,-0.136928,-0.892022,0.480064,-1.075344,1.085813,-0.567258,0.567258,0
265,0.391465,-1.107296,-0.938694,-0.442845,-1.193336,0.097735,0.036934,-0.470540,-0.478314,-0.879517,...,2.244634,-0.204341,-0.899704,-0.921570,-0.878163,-1.075344,-4.237295,-0.567258,0.567258,0


## Logistic Regression Model

In [15]:
warnings.filterwarnings('ignore')

# Split the dataset into training and testing sets
X = scaled_df.drop(columns=['target'])
y = scaled_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid to search over
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 500, 1000]
}

# Create a Logistic Regression model
model = LogisticRegression()

# Use GridSearchCV to search for the best hyperparameters
grid = GridSearchCV(model, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

# Print the best hyperparameters
print('Best Hyperparameters:', grid.best_params_)

# Train the model on the training data with the best hyperparameters
best_model = grid.best_estimator_
best_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Save the model to the "Model_Saved" folder
joblib.dump(best_model, "Model_Saved/model_rf_LogisticRegression.pkl")

Best Hyperparameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: 79.63%


['Model_Saved/model_rf_LogisticRegression.pkl']

## Decision Tree Classifier Model

In [None]:
# Split the dataset into training and testing sets
X = scaled_df.drop(columns=['target'])
y = scaled_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Decision Tree Classifier model
model = DecisionTreeClassifier()

# Set up a parameter grid to search over
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(3, 15),
    'min_samples_split': np.arange(2, 10),
    'min_samples_leaf': np.arange(1, 10),
    'max_features': ['auto', 'sqrt', 'log2']
}

# Perform a Randomized Search over the parameter grid
search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=100, cv=5, random_state=42)
search.fit(X_train, y_train)

# Get the best parameters and model
best_params = search.best_params_
best_model = search.best_estimator_

# Train the model on the training data
best_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Print out the best parameters
print("Best parameters:", search.best_params_)

## Random Forrest Classifier Model

In [None]:
# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Create a random forest model
model = RandomForestClassifier()

# Create a GridSearchCV object
search = GridSearchCV(model, param_grid=param_grid, cv=5)

# Fit the GridSearchCV object to the data
search.fit(X_train, y_train)

# Get the best parameters and model
best_params = search.best_params_
best_model = search.best_estimator_

# Train the model on the training data
best_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Print out the best parameters
print("Best parameters:", search.best_params_)

## Support Vector Machine (SVM) Model

In [None]:
# Split the dataset into training and testing sets
X = scaled_df.drop(columns=['target'])
y = scaled_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an SVM model with a linear kernel
model = SVC(kernel='linear')

# Set up a parameter grid to search over
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'gamma': [0.1, 1, 10, 100],
}

# Perform a Grid Search over the parameter grid
search = GridSearchCV(model, param_grid=param_grid, cv=5, n_jobs=-1)
search.fit(X_train, y_train)

# Get the best parameters and model
best_params = search.best_params_
best_model = search.best_estimator_

# Train the model on the training data
best_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Print the best parameters
print('Best Parameters:', best_params)