# ML-diagnosis-of-esophageal-cancer
## Supervised Machine Learning Model Selection
Authors: 

Date: 2023-03-18

In [1]:
# Dependencies & Installs
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import numpy as np
import warnings
import joblib

In [2]:
# Input 1
data = pd.read_csv('../Data_Cleaned/train_test_set.csv')

# Reclassified: 1 vs 2,3,4 healthy vs all
df1= data
df1['target'] = df1['Patient Group'].map({'BE-HGD': 1, 'EAC': 1, 'BE': 1, 'BE-ID': 1, 'BE-LGD': 1, 'NSE': 0})
df1 = df1[df1.target<2]

# Reclassified: 2 vs 3&4 BE low vs BE-HGD & EAC
df2 = data
df2['target'] = df2['Patient Group'].map({'BE-HGD': 1, 'EAC': 1, 'BE': 0, 'BE-ID': 0, 'BE-LGD': 0, 'NSE': 2})
df2 = df2[df2.target<2]

# Reclassified: 2 vs 3 BE low vs BE-HGD
df3 = data
df3['target'] = df3['Patient Group'].map({'BE-HGD': 1, 'EAC': 2, 'BE': 0, 'BE-ID': 0, 'BE-LGD': 0, 'NSE': 2})
df3 = df3[df3.target<2]

# Reclassified: 3 vs 4 BE-HGD vs AEC
df4 = data
df4['target'] = df4['Patient Group'].map({'BE-HGD': 0, 'EAC': 1, 'BE': 2, 'BE-ID': 2, 'BE-LGD': 2, 'NSE': 2})
df4 = df4[df4.target<2]

# Reclassified: 1&2 vs 3&4 
df5 = data
df5['target'] = df5['Patient Group'].map({'BE-HGD': 1, 'EAC': 1, 'BE': 0, 'BE-ID': 0, 'BE-LGD': 0, 'NSE': 0})
df5 = df5[df5.target<2]

In [3]:
# Create X and y

X1 = df1.drop(['Patient Group', 'target'], axis=1)
y1 = df1['target']

X2 = df2.drop(['Patient Group', 'target'], axis=1)
y2 = df2['target']

X3 = df3.drop(['Patient Group', 'target'], axis=1)
y3 = df3['target']

X4 = df4.drop(['Patient Group', 'target'], axis=1)
y4 = df4['target']

X5 = df5.drop(['Patient Group', 'target'], axis=1)
y5 = df5['target']

print("Data 1 Shape: ", X1.shape, y1.shape)
print("Data 2 Shape: ", X2.shape, y2.shape)
print("Data 3 Shape: ", X3.shape, y3.shape)
print("Data 4 Shape: ", X4.shape, y4.shape)
print("Data 5 Shape: ", X5.shape, y5.shape)

Data 1 Shape:  (257, 190) (257,)
Data 2 Shape:  (204, 190) (204,)
Data 3 Shape:  (149, 190) (149,)
Data 4 Shape:  (93, 190) (93,)
Data 5 Shape:  (257, 190) (257,)


## Logistic Regression Model

In [6]:
warnings.filterwarnings('ignore')

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

# Scale the data
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Define the parameter grid to search over
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 500, 1000]
}

# Create a Logistic Regression model
model = LogisticRegression()

# Use GridSearchCV to search for the best hyperparameters
grid = GridSearchCV(model, param_grid=param_grid, cv=5)
grid.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print('Best Hyperparameters:', grid.best_params_)

# Train the model on the training data with the best hyperparameters
best_model = grid.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = best_model.predict(X_test_scaled)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Save the scaler to the "Model_Saved" folder
joblib.dump(X_scaler, f"Model_Saved/{name}_X_scaler.joblib")

# Save the model to the "Model_Saved" folder
joblib.dump(best_model, f"Model_Saved/{name}.joblib")

Best Hyperparameters: {'C': 0.001, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Accuracy: 82.69%


In [8]:
# Create function to optimise and run Logistic Regression model, save model and scaler.
def createLogReg(X, y, name):
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

    # Scale the data
    X_scaler = StandardScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Define the parameter grid to search over
    param_grid = {
        'penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'max_iter': [100, 500, 1000]
    }

    # Create a Logistic Regression model
    model = LogisticRegression()

    # Use GridSearchCV to search for the best hyperparameters
    grid = GridSearchCV(model, param_grid=param_grid, cv=5)
    grid.fit(X_train_scaled, y_train)

    # Print the best hyperparameters
    print('Best Hyperparameters:', grid.best_params_)

    # Train the model on the training data with the best hyperparameters
    best_model = grid.best_estimator_
    best_model.fit(X_train_scaled, y_train)

    # Make predictions on the test data
    y_pred = best_model.predict(X_test_scaled)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: {:.2f}%'.format(accuracy * 100))

    # Save the scaler to the "Model_Saved" folder
    joblib.dump(X_scaler, f"Model_Saved/{name}_X_scaler.joblib")

    # Save the model to the "Model_Saved" folder
    joblib.dump(best_model, f"Model_Saved/{name}.joblib")

In [9]:
createLogReg(X1, y1, 'Input_1_Target_1')

Best Hyperparameters: {'C': 0.001, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Accuracy: 82.69%


In [10]:
loaded_model = joblib.load("Model_Saved/Input_1_Target_1.joblib")
result = loaded_model.score(X_test_scaled, y_test)
print(result)

0.8269230769230769


In [11]:
createLogReg(X2, y2, 'Input_1_Target_2')

Best Hyperparameters: {'C': 0.001, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Accuracy: 82.69%


In [12]:
createLogReg(X3, y3, 'Input_1_Target_3')

Best Hyperparameters: {'C': 0.001, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Accuracy: 82.69%


In [13]:
createLogReg(X4, y4, 'Input_1_Target_4')

Best Hyperparameters: {'C': 0.001, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Accuracy: 82.69%


In [14]:
createLogReg(X5, y5, 'Input_1_Target_5')

Best Hyperparameters: {'C': 0.001, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Accuracy: 82.69%


#### Logistic Regression model achieved same results for different target groups using input 1 ####

## Decision Tree Classifier Model

In [19]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

# Scale the data
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create a Decision Tree Classifier model
model = DecisionTreeClassifier()

# Set up a parameter grid to search over
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(3, 15),
    'min_samples_split': np.arange(2, 10),
    'min_samples_leaf': np.arange(1, 10),
    'max_features': ['auto', 'sqrt', 'log2']
}

# Perform a Randomized Search over the parameter grid
search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=100, cv=5, random_state=42)
search.fit(X_train_scaled, y_train)

# Get the best parameters and model
best_params = search.best_params_
best_model = search.best_estimator_

# Train the model on the training data
best_model.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = best_model.predict(X_test_scaled)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Print out the best parameters
print("Best parameters:", search.best_params_)

Accuracy: 71.15%
Best parameters: {'min_samples_split': 6, 'min_samples_leaf': 9, 'max_features': 'sqrt', 'max_depth': 14, 'criterion': 'gini'}


In [17]:
def createDecTree(X, y):
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale the data
    X_scaler = StandardScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Create a Decision Tree Classifier model
    model = DecisionTreeClassifier()

    # Set up a parameter grid to search over
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': np.arange(3, 15),
        'min_samples_split': np.arange(2, 10),
        'min_samples_leaf': np.arange(1, 10),
        'max_features': ['auto', 'sqrt', 'log2']
    }

    # Perform a Randomized Search over the parameter grid
    search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=100, cv=5, random_state=42)
    search.fit(X_train_scaled, y_train)

    # Get the best parameters and model
    best_params = search.best_params_
    best_model = search.best_estimator_

    # Train the model on the training data
    best_model.fit(X_train_scaled, y_train)

    # Make predictions on the test data
    y_pred = best_model.predict(X_test_scaled)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: {:.2f}%'.format(accuracy * 100))

    # Print out the best parameters
    print("Best parameters:", search.best_params_)

In [18]:
createDecTree(X1, y1)

Accuracy: 69.23%
Best parameters: {'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_depth': 14, 'criterion': 'entropy'}


In [20]:
createDecTree(X2, y2)

Accuracy: 51.22%
Best parameters: {'min_samples_split': 6, 'min_samples_leaf': 9, 'max_features': 'log2', 'max_depth': 13, 'criterion': 'gini'}


In [21]:
createDecTree(X3, y3)

Accuracy: 66.67%
Best parameters: {'min_samples_split': 7, 'min_samples_leaf': 9, 'max_features': 'auto', 'max_depth': 8, 'criterion': 'gini'}


In [22]:
createDecTree(X4, y4)

Accuracy: 52.63%
Best parameters: {'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'log2', 'max_depth': 12, 'criterion': 'gini'}


In [23]:
createDecTree(X5, y5)

Accuracy: 59.62%
Best parameters: {'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 10, 'criterion': 'entropy'}


#### Decision Tree models did not perform as well as Logistic Regression ####

## Random Forrest Classifier Model

In [24]:
def createRandomForest(X, y): 
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale the data
    X_scaler = StandardScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Define the parameter grid
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    }

    # Create a random forest model
    model = RandomForestClassifier()

    # Create a GridSearchCV object
    search = GridSearchCV(model, param_grid=param_grid, cv=5)

    # Fit the GridSearchCV object to the data
    search.fit(X_train_scaled, y_train)

    # Get the best parameters and model
    best_params = search.best_params_
    best_model = search.best_estimator_

    # Train the model on the training data
    best_model.fit(X_train_scaled, y_train)

    # Make predictions on the test data
    y_pred = best_model.predict(X_test_scaled)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: {:.2f}%'.format(accuracy * 100))

    # Print out the best parameters
    print("Best parameters:", search.best_params_)

In [25]:
createRandomForest(X1, y1)

Accuracy: 80.77%
Best parameters: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}


#### Random Forest model performed almost as well as Logistic Regression. Processing time was just under 3 minutes. ####

## Support Vector Machine (SVM) Model

In [27]:
def createSVM(X, y):
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale the data
    X_scaler = StandardScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Create an SVM model with a linear kernel
    model = SVC(kernel='linear')

    # Set up a parameter grid to search over
    param_grid = {
        'C': [0.01, 0.1, 1, 10],
        'gamma': [0.1, 1, 10, 100],
    }

    # Perform a Grid Search over the parameter grid
    search = GridSearchCV(model, param_grid=param_grid, cv=5, n_jobs=-1)
    search.fit(X_train_scaled, y_train)

    # Get the best parameters and model
    best_params = search.best_params_
    best_model = search.best_estimator_

    # Train the model on the training data
    best_model.fit(X_train_scaled, y_train)

    # Make predictions on the test data
    y_pred = best_model.predict(X_test_scaled)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: {:.2f}%'.format(accuracy * 100))

    # Print the best parameters
    print('Best Parameters:', best_params)

In [28]:
createSVM(X1, y1)

Accuracy: 84.62%
Best Parameters: {'C': 0.01, 'gamma': 0.1}


In [29]:
createSVM(X2, y2)

Accuracy: 60.98%
Best Parameters: {'C': 0.01, 'gamma': 0.1}


In [30]:
createSVM(X3, y3)

Accuracy: 76.67%
Best Parameters: {'C': 0.01, 'gamma': 0.1}


In [31]:
createSVM(X4, y4)

Accuracy: 36.84%
Best Parameters: {'C': 0.01, 'gamma': 0.1}


In [32]:
createSVM(X5, y5)

Accuracy: 59.62%
Best Parameters: {'C': 0.01, 'gamma': 0.1}


#### SVM outperformed Logistic Regression for target 1 but performed poorly for the other target groups. There was little to no processing time (1-4 seconds). ####