In [2]:
# Dependencies & Installs
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import csv
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import numpy as np
import warnings
import joblib

In [3]:
data = pd.read_csv('../Data_Cleaned/clinic_ratios.csv')


def reclassify(frame, group_to_target):
    """Return a labelled copy of ``frame`` restricted to one binary comparison.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source table; must contain a 'Patient Group' column.
    group_to_target : dict
        Maps each 'Patient Group' value to 0 or 1 (the two classes being
        compared) or to 2 (group left out of this comparison).

    Returns
    -------
    pandas.DataFrame
        A new frame with an added 'target' column, keeping only the rows
        whose target is below 2. Groups missing from the mapping become NaN
        and are dropped as well, because ``NaN < 2`` evaluates to False.
    """
    # assign() builds a new frame, so the raw `data` table is never modified.
    # (Writing `df = data; df['target'] = ...` would alias `data` and overwrite
    # its 'target' column on every reclassification.)
    labelled = frame.assign(target=frame['Patient Group'].map(group_to_target))
    return labelled[labelled['target'] < 2]


# Reclassified: 1 vs 2,3,4 -- healthy (NSE) vs all other groups
df1 = reclassify(data, {'BE-HGD': 1, 'EAC': 1, 'BE': 1, 'BE-ID': 1, 'BE-LGD': 1, 'NSE': 0})

# Reclassified: 2 vs 3&4 -- BE low vs BE-HGD & EAC (NSE excluded)
df2 = reclassify(data, {'BE-HGD': 1, 'EAC': 1, 'BE': 0, 'BE-ID': 0, 'BE-LGD': 0, 'NSE': 2})

# Reclassified: 2 vs 3 -- BE low vs BE-HGD (EAC and NSE excluded)
df3 = reclassify(data, {'BE-HGD': 1, 'EAC': 2, 'BE': 0, 'BE-ID': 0, 'BE-LGD': 0, 'NSE': 2})

# Reclassified: 3 vs 4 -- BE-HGD vs EAC (all other groups excluded)
df4 = reclassify(data, {'BE-HGD': 0, 'EAC': 1, 'BE': 2, 'BE-ID': 2, 'BE-LGD': 2, 'NSE': 2})

# Reclassified: 1&2 vs 3&4 -- NSE & BE low vs BE-HGD & EAC
df5 = reclassify(data, {'BE-HGD': 1, 'EAC': 1, 'BE': 0, 'BE-ID': 0, 'BE-LGD': 0, 'NSE': 0})

In [4]:
# Build the feature matrix (X) and label vector (y) for each reclassified dataset.
# Features are every column except the group label and the derived target.

X1, y1 = df1.drop(columns=['Patient Group', 'target']), df1['target']
X2, y2 = df2.drop(columns=['Patient Group', 'target']), df2['target']
X3, y3 = df3.drop(columns=['Patient Group', 'target']), df3['target']
X4, y4 = df4.drop(columns=['Patient Group', 'target']), df4['target']
X5, y5 = df5.drop(columns=['Patient Group', 'target']), df5['target']

# Report the shape of every feature/label pair, one line per dataset.
for shape_label, feature_frame, target_series in (
    ("Data 1 Shape: ", X1, y1),
    ("Data 2 Shape: ", X2, y2),
    ("Data 3 Shape: ", X3, y3),
    ("Data 4 Shape: ", X4, y4),
    ("Data 5 Shape: ", X5, y5),
):
    print(shape_label, feature_frame.shape, target_series.shape)

Data 1 Shape:  (257, 28) (257,)
Data 2 Shape:  (204, 28) (204,)
Data 3 Shape:  (149, 28) (149,)
Data 4 Shape:  (93, 28) (93,)
Data 5 Shape:  (257, 28) (257,)


In [5]:
# Logistic-regression estimator referenced by the training cells below.
# The solver's iteration cap is set explicitly through max_iter.
logreg_max_iter = 1000
classifier = LogisticRegression(max_iter=logreg_max_iter)

In [6]:
# Group 1: healthy (target 0) vs every other patient group (target 1).
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=0)

# Fit the scaler on the training split only, then apply it to both splits,
# so no statistics from the test rows are used during training.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fit a fresh estimator that copies the hyper-parameters of `classifier`.
# `classifier.fit(...)` returns `classifier` itself, so assigning its result
# would make every modelNN name refer to one shared object that each later
# training cell silently refits.
model16 = LogisticRegression(**classifier.get_params()).fit(X_train_scaled, y_train)
print('Group 1: Healthy vs the rest')
print(f"Training Data Score: {model16.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model16.score(X_test_scaled, y_test)}")

# Make predictions on the test data
y_pred = model16.predict(X_test_scaled)

# Calculate the accuracy of the model (same metric as the testing score above)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Save the model to the "Model_Saved" folder.
# NOTE(review): the model was trained on scaled features, but X_scaler is not
# saved and is overwritten by the next training cell -- anyone reloading this
# file needs the matching scaler to prepare inputs. TODO persist it as well.
joblib.dump(model16, "Model_Saved/model16_LogisticRegression.joblib")

Group 1: Healthy vs the rest
Training Data Score: 0.8439024390243902
Testing Data Score: 0.7884615384615384
Accuracy: 78.85%


['Model_Saved/model16_LogisticRegression.joblib']

In [7]:
# Round-trip check: reload the model saved by the previous cell and score it on
# the test split that is still in memory from that cell.
saved_model_path = "Model_Saved/model16_LogisticRegression.joblib"
loaded_model = joblib.load(saved_model_path)
result = loaded_model.score(X_test_scaled, y_test)
print(result)

0.7884615384615384


In [8]:
# Group 2: BE low (target 0) vs BE-HGD & EAC (target 1).
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2, random_state=0)

# Fit the scaler on the training split only, then apply it to both splits,
# so no statistics from the test rows are used during training.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fit a fresh estimator that copies the hyper-parameters of `classifier`.
# `classifier.fit(...)` returns `classifier` itself, so assigning its result
# would make every modelNN name refer to one shared object that each later
# training cell silently refits.
model17 = LogisticRegression(**classifier.get_params()).fit(X_train_scaled, y_train)
# NOTE(review): the label below spells the groups 'BE-HDG'/'AEC'; the data uses
# 'BE-HGD'/'EAC'. Left unchanged so previously recorded output still matches.
print('Group 2: BE (low dysplasia) vs BE-HDG & AEC')
print(f"Training Data Score: {model17.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model17.score(X_test_scaled, y_test)}")

# Make predictions on the test data
y_pred = model17.predict(X_test_scaled)

# Calculate the accuracy of the model (same metric as the testing score above)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Save the model to the "Model_Saved" folder.
# NOTE(review): the model was trained on scaled features, but X_scaler is not
# saved and is overwritten by the next training cell -- anyone reloading this
# file needs the matching scaler to prepare inputs. TODO persist it as well.
joblib.dump(model17, "Model_Saved/model17_LogisticRegression.joblib")

Group 2: BE (low dysplasia) vs BE-HDG & AEC
Training Data Score: 0.7055214723926381
Testing Data Score: 0.5609756097560976
Accuracy: 56.10%


['Model_Saved/model17_LogisticRegression.joblib']

In [9]:
# Group 3: BE low (target 0) vs BE-HGD (target 1).
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X3, y3, test_size=0.2, random_state=0)

# Fit the scaler on the training split only, then apply it to both splits,
# so no statistics from the test rows are used during training.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fit a fresh estimator that copies the hyper-parameters of `classifier`.
# `classifier.fit(...)` returns `classifier` itself, so assigning its result
# would make every modelNN name refer to one shared object that each later
# training cell silently refits.
model18 = LogisticRegression(**classifier.get_params()).fit(X_train_scaled, y_train)
# NOTE(review): the label below spells the group 'BE-HDG'; the data uses
# 'BE-HGD'. Left unchanged so previously recorded output still matches.
print('Group 3: BE low vs BE-HDG')
print(f"Training Data Score: {model18.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model18.score(X_test_scaled, y_test)}")

# Make predictions on the test data
y_pred = model18.predict(X_test_scaled)

# Calculate the accuracy of the model (same metric as the testing score above)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Save the model to the "Model_Saved" folder.
# NOTE(review): the model was trained on scaled features, but X_scaler is not
# saved and is overwritten by the next training cell -- anyone reloading this
# file needs the matching scaler to prepare inputs. TODO persist it as well.
joblib.dump(model18, "Model_Saved/model18_LogisticRegression.joblib")

Group 3: BE low vs BE-HDG
Training Data Score: 0.7815126050420168
Testing Data Score: 0.6
Accuracy: 60.00%


['Model_Saved/model18_LogisticRegression.joblib']

In [10]:
# Group 4: BE-HGD (target 0) vs EAC (target 1).
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
# NOTE(review): this cell uses random_state=10 while groups 1-3 use 0 -- confirm
# the difference is intentional.
X_train, X_test, y_train, y_test = train_test_split(X4, y4, test_size=0.2, random_state=10)

# Fit the scaler on the training split only, then apply it to both splits,
# so no statistics from the test rows are used during training.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fit a fresh estimator that copies the hyper-parameters of `classifier`.
# `classifier.fit(...)` returns `classifier` itself, so assigning its result
# would make every modelNN name refer to one shared object that each later
# training cell silently refits.
model19 = LogisticRegression(**classifier.get_params()).fit(X_train_scaled, y_train)
# NOTE(review): the label below spells the groups 'BE-HDG'/'AEC'; the data uses
# 'BE-HGD'/'EAC'. Left unchanged so previously recorded output still matches.
print('Group 4: BE-HDG vs AEC')
print(f"Training Data Score: {model19.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model19.score(X_test_scaled, y_test)}")

# Make predictions on the test data
y_pred = model19.predict(X_test_scaled)

# Calculate the accuracy of the model (same metric as the testing score above)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Save the model to the "Model_Saved" folder.
# NOTE(review): the model was trained on scaled features, but X_scaler is not
# saved and is overwritten by the next training cell -- anyone reloading this
# file needs the matching scaler to prepare inputs. TODO persist it as well.
joblib.dump(model19, "Model_Saved/model19_LogisticRegression.joblib")

Group 4: BE-HDG vs AEC
Training Data Score: 0.6891891891891891
Testing Data Score: 0.47368421052631576
Accuracy: 47.37%


['Model_Saved/model19_LogisticRegression.joblib']

In [11]:
# Group 5: NSE & BE low (target 0) vs BE-HGD & EAC (target 1).
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
# NOTE(review): this cell uses random_state=10 while groups 1-3 use 0 -- confirm
# the difference is intentional.
X_train, X_test, y_train, y_test = train_test_split(X5, y5, test_size=0.2, random_state=10)

# Fit the scaler on the training split only, then apply it to both splits,
# so no statistics from the test rows are used during training.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fit a fresh estimator that copies the hyper-parameters of `classifier`.
# `classifier.fit(...)` returns `classifier` itself, so assigning its result
# would make every modelNN name refer to one shared object that is refit each
# time `classifier.fit` is called.
model20 = LogisticRegression(**classifier.get_params()).fit(X_train_scaled, y_train)
# NOTE(review): the label below spells the groups 'BE-HDG'/'AEC'; the data uses
# 'BE-HGD'/'EAC'. Left unchanged so previously recorded output still matches.
print('Group 5: Healthy & BE low vs BE-HDG & AEC')
print(f"Training Data Score: {model20.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model20.score(X_test_scaled, y_test)}")

# Make predictions on the test data
y_pred = model20.predict(X_test_scaled)

# Calculate the accuracy of the model (same metric as the testing score above)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Save the model to the "Model_Saved" folder.
# NOTE(review): the model was trained on scaled features, but X_scaler is not
# saved -- anyone reloading this file needs the matching scaler to prepare
# inputs. TODO persist it as well.
joblib.dump(model20, "Model_Saved/model20_LogisticRegression.joblib")

Group 5: Healthy & BE low vs BE-HDG & AEC
Training Data Score: 0.697560975609756
Testing Data Score: 0.6923076923076923
Accuracy: 69.23%


['Model_Saved/model20_LogisticRegression.joblib']

In [12]:
# NOTE(review): disabled draft of a CSV export for the results. The names
# model_tested, train_score and test_score are not assigned in any visible
# cell, so this would raise NameError if simply uncommented.
# TODO: define those values (or collect the scores printed above) before
# enabling this cell, or remove it.
# with open('results.csv', 'w', newline='') as file:
#      writer = csv.writer(file)
     
#      writer.writerow([model_tested])
#      writer.writerow([train_score])
#      writer.writerow([test_score])