In [1]:
# Dependencies & Installs
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import csv
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import numpy as np
import warnings
import joblib

In [2]:
# Load the cleaned train/test table. It is only read from below, never modified.
data = pd.read_csv('../Data_Cleaned/train_test_set.csv')


def _relabel_groups(frame, group_to_target):
    """Return a labelled copy of `frame` restricted to one binary comparison.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing a 'Patient Group' column.
    group_to_target : dict
        Maps each patient group to 0 or 1 (the two classes being compared)
        or to 2 (group excluded from this comparison).

    Returns
    -------
    pandas.DataFrame
        A new frame with an added 'target' column, keeping only rows whose
        target is below 2. Groups missing from the mapping become NaN and are
        dropped by the same filter.
    """
    # `assign` returns a new frame, so the caller's frame never gains (or has
    # overwritten) a 'target' column. Plain `dfN = data` would only alias it.
    labelled = frame.assign(target=frame['Patient Group'].map(group_to_target))
    return labelled[labelled['target'] < 2]


# Reclassified: 1 vs 2,3,4 healthy vs all
df1 = _relabel_groups(
    data, {'BE-HGD': 1, 'EAC': 1, 'BE': 1, 'BE-ID': 1, 'BE-LGD': 1, 'NSE': 0}
)

# Reclassified: 2 vs 3&4 BE low vs BE-HGD & EAC
df2 = _relabel_groups(
    data, {'BE-HGD': 1, 'EAC': 1, 'BE': 0, 'BE-ID': 0, 'BE-LGD': 0, 'NSE': 2}
)

# Reclassified: 2 vs 3 BE low vs BE-HGD
df3 = _relabel_groups(
    data, {'BE-HGD': 1, 'EAC': 2, 'BE': 0, 'BE-ID': 0, 'BE-LGD': 0, 'NSE': 2}
)

# Reclassified: 3 vs 4 BE-HGD vs EAC
df4 = _relabel_groups(
    data, {'BE-HGD': 0, 'EAC': 1, 'BE': 2, 'BE-ID': 2, 'BE-LGD': 2, 'NSE': 2}
)

# Reclassified: 1&2 vs 3&4
df5 = _relabel_groups(
    data, {'BE-HGD': 1, 'EAC': 1, 'BE': 0, 'BE-ID': 0, 'BE-LGD': 0, 'NSE': 0}
)

In [3]:
# Build the feature matrix (X) and label vector (y) for each comparison.
# Features are every column except the raw group label and the derived target.
non_feature_cols = ['Patient Group', 'target']

X1, y1 = df1.drop(columns=non_feature_cols), df1['target']
X2, y2 = df2.drop(columns=non_feature_cols), df2['target']
X3, y3 = df3.drop(columns=non_feature_cols), df3['target']
X4, y4 = df4.drop(columns=non_feature_cols), df4['target']
X5, y5 = df5.drop(columns=non_feature_cols), df5['target']

# Report (rows, features) and (rows,) for every dataset as a quick sanity check.
for idx, (features, labels) in enumerate(
    [(X1, y1), (X2, y2), (X3, y3), (X4, y4), (X5, y5)], start=1
):
    print(f"Data {idx} Shape: ", features.shape, labels.shape)

Data 1 Shape:  (257, 190) (257,)
Data 2 Shape:  (204, 190) (204,)
Data 3 Shape:  (149, 190) (149,)
Data 4 Shape:  (93, 190) (93,)
Data 5 Shape:  (257, 190) (257,)


In [4]:
# Model 1: healthy vs every other patient group.
# Logistic regression with a raised iteration cap (max_iter=1000).
classifier = LogisticRegression(max_iter=1000)

# Hold out 20% of the rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X1, y1, test_size=0.2, random_state=0
)

# Standardise the features using statistics learned from the training rows only,
# then apply the same transform to both partitions.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)

# Persist the fitted scaler next to the model so inference can reuse it.
joblib.dump(X_scaler, "Model_Saved/model1_X_scaler.joblib")

# Fit on the scaled training data; `model1` refers to the fitted estimator.
classifier.fit(X_train_scaled, y_train)
model1 = classifier

print('Group 1: Healthy vs the rest')
train_score = model1.score(X_train_scaled, y_train)
test_score = model1.score(X_test_scaled, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

# Predict the held-out rows and report accuracy as a percentage.
y_pred = model1.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Persist the fitted model (last expression, so the written path is displayed).
joblib.dump(model1, "Model_Saved/model1_LogisticRegression.joblib")

Group 1: Healthy vs the rest
Training Data Score: 0.9658536585365853
Testing Data Score: 0.7884615384615384
Accuracy: 78.85%


['Model_Saved/model1_LogisticRegression.joblib']

In [5]:
# Model 2: BE (low dysplasia) vs BE-HGD & EAC.
# Use a fresh estimator for this comparison. `fit` returns the estimator itself,
# so refitting an estimator shared with another cell would make `model2` the
# very same object as that cell's model and silently overwrite it in memory.
classifier = LogisticRegression(max_iter=1000)

# Split the data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2, random_state=0)

# Scale with statistics learned from the training rows only
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Save the scaler to the "Model_Saved" folder
joblib.dump(X_scaler, "Model_Saved/model2_X_scaler.joblib")

# Fit, train and test the model
model2 = classifier.fit(X_train_scaled, y_train)
# NOTE: label text is kept verbatim; "BE-HDG" / "AEC" refer to the
# 'BE-HGD' / 'EAC' patient groups.
print('Group 2: BE (low dysplasia) vs BE-HDG & AEC')
print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")

# Make predictions on the test data
y_pred = model2.predict(X_test_scaled)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Save the model to the "Model_Saved" folder
joblib.dump(model2, "Model_Saved/model2_LogisticRegression.joblib")

Group 2: BE (low dysplasia) vs BE-HDG & AEC
Training Data Score: 0.9815950920245399
Testing Data Score: 0.5609756097560976
Accuracy: 56.10%


['Model_Saved/model2_LogisticRegression.joblib']

In [7]:
# Model 3: BE low vs BE-HGD.
# Use a fresh estimator for this comparison. `fit` returns the estimator itself,
# so refitting an estimator shared with another cell would make `model3` the
# very same object as that cell's model and silently overwrite it in memory.
classifier = LogisticRegression(max_iter=1000)

# Split the data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X3, y3, test_size=0.2, random_state=0)

# Scale with statistics learned from the training rows only
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Save the scaler to the "Model_Saved" folder
joblib.dump(X_scaler, "Model_Saved/model3_X_scaler.joblib")

# Fit, train and test the model
model3 = classifier.fit(X_train_scaled, y_train)
# NOTE: label text is kept verbatim; "BE-HDG" refers to the 'BE-HGD' group.
print('Group 3: BE low vs BE-HDG')
print(f"Training Data Score: {model3.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model3.score(X_test_scaled, y_test)}")

# Make predictions on the test data
y_pred = model3.predict(X_test_scaled)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Save the model to the "Model_Saved" folder
joblib.dump(model3, "Model_Saved/model3_LogisticRegression.joblib")

Group 3: BE low vs BE-HDG
Training Data Score: 0.9915966386554622
Testing Data Score: 0.6333333333333333
Accuracy: 63.33%


['Model_Saved/model3_LogisticRegression.joblib']

In [8]:
# Model 4: BE-HGD vs EAC.
# Use a fresh estimator for this comparison. `fit` returns the estimator itself,
# so refitting an estimator shared with another cell would make `model4` the
# very same object as that cell's model and silently overwrite it in memory.
classifier = LogisticRegression(max_iter=1000)

# Split the data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X4, y4, test_size=0.2, random_state=10)

# Scale with statistics learned from the training rows only
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Save the scaler to the "Model_Saved" folder
joblib.dump(X_scaler, "Model_Saved/model4_X_scaler.joblib")

# Fit, train and test the model
model4 = classifier.fit(X_train_scaled, y_train)
# NOTE: label text is kept verbatim; "BE-HDG" / "AEC" refer to the
# 'BE-HGD' / 'EAC' patient groups.
print('Group 4: BE-HDG vs AEC')
print(f"Training Data Score: {model4.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model4.score(X_test_scaled, y_test)}")

# Make predictions on the test data
y_pred = model4.predict(X_test_scaled)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Save the model to the "Model_Saved" folder
joblib.dump(model4, "Model_Saved/model4_LogisticRegression.joblib")

Group 4: BE-HDG vs AEC
Training Data Score: 1.0
Testing Data Score: 0.42105263157894735
Accuracy: 42.11%


['Model_Saved/model4_LogisticRegression.joblib']

In [9]:
# Model 5: healthy & BE low vs BE-HGD & EAC.
# Use a fresh estimator for this comparison. `fit` returns the estimator itself,
# so refitting an estimator shared with another cell would make `model5` the
# very same object as that cell's model and silently overwrite it in memory.
classifier = LogisticRegression(max_iter=1000)

# Split the data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X5, y5, test_size=0.2, random_state=10)

# Scale with statistics learned from the training rows only
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Save the scaler to the "Model_Saved" folder
joblib.dump(X_scaler, "Model_Saved/model5_X_scaler.joblib")

# Fit, train and test the model
model5 = classifier.fit(X_train_scaled, y_train)
# NOTE: label text is kept verbatim; "BE-HDG" / "AEC" refer to the
# 'BE-HGD' / 'EAC' patient groups.
print('Group 5: Healthy & BE low vs BE-HDG & AEC')
print(f"Training Data Score: {model5.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model5.score(X_test_scaled, y_test)}")

# Make predictions on the test data
y_pred = model5.predict(X_test_scaled)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Save the model to the "Model_Saved" folder
joblib.dump(model5, "Model_Saved/model5_LogisticRegression.joblib")

Group 5: Healthy & BE low vs BE-HDG & AEC
Training Data Score: 0.9463414634146341
Testing Data Score: 0.6538461538461539
Accuracy: 65.38%


['Model_Saved/model5_LogisticRegression.joblib']