# ML-diagnosis-of-esophageal-cancer
## Supervised Machine Learning Model Selection
## Input 4: Clinical data and protein ratios ### 
Authors: 

Date: 2023-03-18

In [1]:
# Dependencies & Installs
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import numpy as np
import warnings
import joblib

In [2]:
# Input 1
data = pd.read_csv('../Data_Cleaned/clinic_ratios.csv')

# Reclassified: 1 vs 2,3,4 healthy vs all
df1= data
df1['target'] = df1['Patient Group'].map({'BE-HGD': 1, 'EAC': 1, 'BE': 1, 'BE-ID': 1, 'BE-LGD': 1, 'NSE': 0})
df1 = df1[df1.target<2]

# Reclassified: 2 vs 3&4 BE low vs BE-HGD & EAC
df2 = data
df2['target'] = df2['Patient Group'].map({'BE-HGD': 1, 'EAC': 1, 'BE': 0, 'BE-ID': 0, 'BE-LGD': 0, 'NSE': 2})
df2 = df2[df2.target<2]

# Reclassified: 2 vs 3 BE low vs BE-HGD
df3 = data
df3['target'] = df3['Patient Group'].map({'BE-HGD': 1, 'EAC': 2, 'BE': 0, 'BE-ID': 0, 'BE-LGD': 0, 'NSE': 2})
df3 = df3[df3.target<2]

In [3]:
# Create X and y

X1 = df1.drop(['Patient Group', 'target'], axis=1)
y1 = df1['target']

X2 = df2.drop(['Patient Group', 'target'], axis=1)
y2 = df2['target']

X3 = df3.drop(['Patient Group', 'target'], axis=1)
y3 = df3['target']

print("Data 1 Shape: ", X1.shape, y1.shape)
print("Data 2 Shape: ", X2.shape, y2.shape)
print("Data 3 Shape: ", X3.shape, y3.shape)

Data 1 Shape:  (257, 28) (257,)
Data 2 Shape:  (204, 28) (204,)
Data 3 Shape:  (149, 28) (149,)


## Logistic Regression Model

In [4]:
warnings.filterwarnings('ignore')

In [5]:
# Create function to optimise and run Logistic Regression model, save model and scaler.
def createLogReg(X, y, name):
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

    # Scale the data
    X_scaler = StandardScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Define the parameter grid to search over
    param_grid = {
        'penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'max_iter': [100, 500, 1000]
    }

    # Create a Logistic Regression model
    model = LogisticRegression()

    # Use GridSearchCV to search for the best hyperparameters
    grid = GridSearchCV(model, param_grid=param_grid, cv=5)
    grid.fit(X_train_scaled, y_train)

    # Print the best hyperparameters
    print('Best Hyperparameters:', grid.best_params_)

    # Train the model on the training data with the best hyperparameters
    best_model = grid.best_estimator_
    best_model.fit(X_train_scaled, y_train)

    # Make predictions on the test data
    y_pred = best_model.predict(X_test_scaled)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: {:.2f}%'.format(accuracy * 100))

    # Save the scaler to the "Model_Saved" folder
    joblib.dump(X_scaler, f"Model_Saved/{name}_X_scaler.joblib")

    # Save the model to the "Model_Saved" folder
    joblib.dump(best_model, f"Model_Saved/{name}.joblib")

In [6]:
createLogReg(X1, y1, 'Input_4_Target_1')

Best Hyperparameters: {'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'saga'}
Accuracy: 78.85%


In [7]:
createLogReg(X2, y2, 'Input_4_Target_2')

Best Hyperparameters: {'C': 0.001, 'max_iter': 100, 'penalty': 'none', 'solver': 'saga'}
Accuracy: 78.85%


In [8]:
createLogReg(X3, y3, 'Input_4_Target_3')

Best Hyperparameters: {'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'saga'}
Accuracy: 78.85%


#### Logistic Regression model achieved same results for different target groups using input 4. Accuracy score 78.85% ####

## Decision Tree Classifier Model

In [9]:
def createDecTree(X, y):
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale the data
    X_scaler = StandardScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Create a Decision Tree Classifier model
    model = DecisionTreeClassifier()

    # Set up a parameter grid to search over
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': np.arange(3, 15),
        'min_samples_split': np.arange(2, 10),
        'min_samples_leaf': np.arange(1, 10),
        'max_features': ['auto', 'sqrt', 'log2']
    }

    # Perform a Randomized Search over the parameter grid
    search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=100, cv=5, random_state=42)
    search.fit(X_train_scaled, y_train)

    # Get the best parameters and model
    best_params = search.best_params_
    best_model = search.best_estimator_

    # Train the model on the training data
    best_model.fit(X_train_scaled, y_train)

    # Make predictions on the test data
    y_pred = best_model.predict(X_test_scaled)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: {:.2f}%'.format(accuracy * 100))

    # Print out the best parameters
    print("Best parameters:", search.best_params_)

In [10]:
createDecTree(X1, y1)

Accuracy: 76.92%
Best parameters: {'min_samples_split': 9, 'min_samples_leaf': 9, 'max_features': 'sqrt', 'max_depth': 5, 'criterion': 'entropy'}


In [11]:
createDecTree(X2, y2)

Accuracy: 51.22%
Best parameters: {'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 14, 'criterion': 'gini'}


In [12]:
createDecTree(X3, y3)

Accuracy: 66.67%
Best parameters: {'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 14, 'criterion': 'gini'}


#### Decision Tree models achieved performance standard  target 1 only.####
Target 1: 76.92%
Target 2: 51.22%
Target 3: 66.67% 

## Random Forrest Classifier Model

In [13]:
def createRandomForest(X, y): 
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale the data
    X_scaler = StandardScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Define the parameter grid
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    }

    # Create a random forest model
    model = RandomForestClassifier()

    # Create a GridSearchCV object
    search = GridSearchCV(model, param_grid=param_grid, cv=5)

    # Fit the GridSearchCV object to the data
    search.fit(X_train_scaled, y_train)

    # Get the best parameters and model
    best_params = search.best_params_
    best_model = search.best_estimator_

    # Train the model on the training data
    best_model.fit(X_train_scaled, y_train)

    # Make predictions on the test data
    y_pred = best_model.predict(X_test_scaled)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: {:.2f}%'.format(accuracy * 100))

    # Print out the best parameters
    print("Best parameters:", search.best_params_)

In [14]:
createRandomForest(X1, y1)

Accuracy: 80.77%
Best parameters: {'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


In [20]:
createRandomForest(X2, y2)

Accuracy: 51.22%
Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}


In [15]:
createRandomForest(X3, y3)

Accuracy: 76.67%
Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}


#### Random Forest model preformed slightly worse than Decision Tree model overall. It achieved performance standard with target 1 and 3. Processing time was nearly 5 minutes ####
Target 1: 80.77%
Target 2: 51.22%
Target 3: 76.67%

## Support Vector Machine (SVM) Model

In [14]:
def createSVM(X, y, name):
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale the data
    X_scaler = StandardScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Create an SVM model with a linear kernel
    model = SVC(kernel='linear')

    # Set up a parameter grid to search over
    param_grid = {
        'C': [0.01, 0.1, 1, 10],
        'gamma': [0.1, 1, 10, 100],
    }

    # Perform a Grid Search over the parameter grid
    search = GridSearchCV(model, param_grid=param_grid, cv=5, n_jobs=-1)
    search.fit(X_train_scaled, y_train)

    # Get the best parameters and model
    best_params = search.best_params_
    best_model = search.best_estimator_

    # Train the model on the training data
    best_model.fit(X_train_scaled, y_train)

    # Make predictions on the test data
    y_pred = best_model.predict(X_test_scaled)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: {:.2f}%'.format(accuracy * 100))

    # Print the best parameters
    print('Best Parameters:', best_params)
    
    # Save the scaler to the "Model_Saved" folder
    joblib.dump(X_scaler, f"Model_Saved/{name}_X_scaler.joblib")

    # Save the model to the "Model_Saved" folder
    joblib.dump(best_model, f"Model_Saved/{name}.joblib")

In [15]:
createSVM(X1, y1, 'SVM_input_4_target_1')

Accuracy: 84.62%
Best Parameters: {'C': 0.1, 'gamma': 0.1}


In [16]:
createSVM(X2, y2, 'SVM_input_4_target_2')

Accuracy: 65.85%
Best Parameters: {'C': 0.01, 'gamma': 0.1}


In [17]:
createSVM(X3, y3, 'SVM_input_4_target_3')

Accuracy: 76.67%
Best Parameters: {'C': 0.01, 'gamma': 0.1}


#### SVM outperformed Logistic Regression for target 1, performed poorly for target 2, and achieved performance standard for target 3. There was little to no processing time (1-4 seconds). ####
Target 1: 84.62%
Target 2: 65.85%
Target 3: 76.67%