In [1]:
# Dependencies & Installs
%matplotlib inline
import csv
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the cleaned clinical/protein table. Every comparison below is derived
# from this frame without modifying it.
data = pd.read_csv('../Data_Cleaned/clinic_proteins.csv')


def _label_subset(frame, group_codes):
    """Return a copy of `frame` with a binary `target` column for one comparison.

    `group_codes` maps each 'Patient Group' value to 0 or 1 (the two classes
    being compared) or to 2 (group excluded from this comparison). Rows whose
    code is not below 2 are dropped; that includes groups missing from the
    mapping, because `Series.map` yields NaN for them and `NaN < 2` is False.

    `DataFrame.assign` builds a new frame, so the input is never mutated.
    Writing `target` through an alias such as `df1 = data` would instead
    overwrite the column on the shared frame for every comparison.
    """
    labelled = frame.assign(target=frame['Patient Group'].map(group_codes))
    return labelled[labelled['target'] < 2]


# Reclassified: 1 vs 2,3,4 -- NSE (healthy) vs all other groups
df1 = _label_subset(
    data, {'BE-HGD': 1, 'EAC': 1, 'BE': 1, 'BE-ID': 1, 'BE-LGD': 1, 'NSE': 0})

# Reclassified: 2 vs 3&4 -- BE low (BE, BE-ID, BE-LGD) vs BE-HGD & EAC; NSE excluded
df2 = _label_subset(
    data, {'BE-HGD': 1, 'EAC': 1, 'BE': 0, 'BE-ID': 0, 'BE-LGD': 0, 'NSE': 2})

# Reclassified: 2 vs 3 -- BE low vs BE-HGD; EAC and NSE excluded
df3 = _label_subset(
    data, {'BE-HGD': 1, 'EAC': 2, 'BE': 0, 'BE-ID': 0, 'BE-LGD': 0, 'NSE': 2})

# Reclassified: 3 vs 4 -- BE-HGD vs EAC; all other groups excluded
df4 = _label_subset(
    data, {'BE-HGD': 0, 'EAC': 1, 'BE': 2, 'BE-ID': 2, 'BE-LGD': 2, 'NSE': 2})

# Reclassified: 1&2 vs 3&4 -- NSE & BE low vs BE-HGD & EAC
df5 = _label_subset(
    data, {'BE-HGD': 1, 'EAC': 1, 'BE': 0, 'BE-ID': 0, 'BE-LGD': 0, 'NSE': 0})

In [3]:
# Build the feature matrix (X) and label vector (y) for each comparison.
# 'Patient Group' is the raw label that 'target' was derived from, so both
# columns are removed from the features.
label_columns = ['Patient Group', 'target']

X1, y1 = df1.drop(columns=label_columns), df1['target']
X2, y2 = df2.drop(columns=label_columns), df2['target']
X3, y3 = df3.drop(columns=label_columns), df3['target']
X4, y4 = df4.drop(columns=label_columns), df4['target']
X5, y5 = df5.drop(columns=label_columns), df5['target']

# Report the (rows, columns) of every X next to the length of its y.
for caption, features, labels in (
    ("Data 1 Shape: ", X1, y1),
    ("Data 2 Shape: ", X2, y2),
    ("Data 3 Shape: ", X3, y3),
    ("Data 4 Shape: ", X4, y4),
    ("Data 5 Shape: ", X5, y5),
):
    print(caption, features.shape, labels.shape)

Data 1 Shape:  (257, 165) (257,)
Data 2 Shape:  (204, 165) (204,)
Data 3 Shape:  (149, 165) (149,)
Data 4 Shape:  (93, 165) (93,)
Data 5 Shape:  (257, 165) (257,)


In [4]:
# Initiate the classifier model.
# Logistic regression with the solver's iteration cap raised to 1000
# (scikit-learn's documented default is 100). This one estimator object is
# shared by all five training cells below.
classifier = LogisticRegression(max_iter=1000)

In [5]:
# Split the data into training and testing sets (20% held out, fixed seed so
# the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=0)

# Fit the scaler on the training split only, then apply it to both splits,
# so the test rows do not influence the scaling statistics.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fit, train and test the model.
# `fit` returns the estimator it was called on, so fitting the shared
# `classifier` directly would leave every `modelNN` name pointing at one
# object that the next training cell overwrites. Fitting an unfitted clone
# keeps this model independent in memory as well as on disk.
model11 = clone(classifier).fit(X_train_scaled, y_train)
print('Group 1: Healthy vs the rest')
print(f"Training Data Score: {model11.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model11.score(X_test_scaled, y_test)}")

# Make predictions on the test data
y_pred = model11.predict(X_test_scaled)

# Calculate the accuracy of the model (the testing score again, as a percentage)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Save the model to the "Model_Saved" folder
joblib.dump(model11, "Model_Saved/model11_LogisticRegression.joblib")

# Save the scaler to the "Model_Saved" folder; the model expects inputs
# transformed with this exact scaler.
joblib.dump(X_scaler, "Model_Saved/model11_X_scaler.joblib")

Group 1: Healthy vs the rest
Training Data Score: 0.9560975609756097
Testing Data Score: 0.7692307692307693
Accuracy: 76.92%


['Model_Saved/model11_X_scaler.joblib']

In [10]:
# Round-trip check: reload the saved group-1 model and recompute its test
# accuracy. This reads X_test_scaled / y_test from the notebook namespace, so
# it is only meaningful while those still hold the group-1 split produced by
# the cell directly above.
loaded_model = joblib.load("Model_Saved/model11_LogisticRegression.joblib")
result = accuracy_score(y_test, loaded_model.predict(X_test_scaled))
print(result)

0.7692307692307693


In [6]:
# Split the data into training and testing sets (20% held out, fixed seed so
# the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2, random_state=0)

# Fit the scaler on the training split only, then apply it to both splits,
# so the test rows do not influence the scaling statistics.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fit, train and test the model.
# `fit` returns the estimator it was called on, so fitting the shared
# `classifier` directly would leave every `modelNN` name pointing at one
# object that the next training cell overwrites. Fitting an unfitted clone
# keeps this model independent in memory as well as on disk.
model12 = clone(classifier).fit(X_train_scaled, y_train)
# The label text is kept verbatim; it spells the 'BE-HGD' and 'EAC' groups
# as 'BE-HDG' and 'AEC'.
print('Group 2: BE (low dysplasia) vs BE-HDG & AEC')
print(f"Training Data Score: {model12.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model12.score(X_test_scaled, y_test)}")

# Make predictions on the test data
y_pred = model12.predict(X_test_scaled)

# Calculate the accuracy of the model (the testing score again, as a percentage)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Save the model to the "Model_Saved" folder
joblib.dump(model12, "Model_Saved/model12_LogisticRegression.joblib")

# Save the scaler to the "Model_Saved" folder; the model expects inputs
# transformed with this exact scaler.
joblib.dump(X_scaler, "Model_Saved/model12_X_scaler.joblib")

Group 2: BE (low dysplasia) vs BE-HDG & AEC
Training Data Score: 0.9815950920245399
Testing Data Score: 0.5121951219512195
Accuracy: 51.22%


['Model_Saved/model12_X_scaler.joblib']

In [7]:
# Split the data into training and testing sets (20% held out, fixed seed so
# the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X3, y3, test_size=0.2, random_state=0)

# Fit the scaler on the training split only, then apply it to both splits,
# so the test rows do not influence the scaling statistics.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fit, train and test the model.
# `fit` returns the estimator it was called on, so fitting the shared
# `classifier` directly would leave every `modelNN` name pointing at one
# object that the next training cell overwrites. Fitting an unfitted clone
# keeps this model independent in memory as well as on disk.
model13 = clone(classifier).fit(X_train_scaled, y_train)
# The label text is kept verbatim; it spells the 'BE-HGD' group as 'BE-HDG'.
print('Group 3: BE low vs BE-HDG')
print(f"Training Data Score: {model13.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model13.score(X_test_scaled, y_test)}")

# Make predictions on the test data
y_pred = model13.predict(X_test_scaled)

# Calculate the accuracy of the model (the testing score again, as a percentage)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Save the model to the "Model_Saved" folder
joblib.dump(model13, "Model_Saved/model13_LogisticRegression.joblib")

# Save the scaler to the "Model_Saved" folder; the model expects inputs
# transformed with this exact scaler.
joblib.dump(X_scaler, "Model_Saved/model13_X_scaler.joblib")

Group 3: BE low vs BE-HDG
Training Data Score: 0.9915966386554622
Testing Data Score: 0.6333333333333333
Accuracy: 63.33%


['Model_Saved/model13_X_scaler.joblib']

In [12]:
# Split the data into training and testing sets (20% held out, fixed seed so
# the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X4, y4, test_size=0.2, random_state=0)

# Fit the scaler on the training split only, then apply it to both splits,
# so the test rows do not influence the scaling statistics.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fit, train and test the model.
# `fit` returns the estimator it was called on, so fitting the shared
# `classifier` directly would leave every `modelNN` name pointing at one
# object that the next training cell overwrites. Fitting an unfitted clone
# keeps this model independent in memory as well as on disk.
model14 = clone(classifier).fit(X_train_scaled, y_train)
# The label text is kept verbatim; it spells the 'BE-HGD' and 'EAC' groups
# as 'BE-HDG' and 'AEC'.
print('Group 4: BE-HDG vs AEC')
print(f"Training Data Score: {model14.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model14.score(X_test_scaled, y_test)}")

# Make predictions on the test data
y_pred = model14.predict(X_test_scaled)

# Calculate the accuracy of the model (the testing score again, as a percentage)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Save the model to the "Model_Saved" folder
joblib.dump(model14, "Model_Saved/model14_LogisticRegression.joblib")

# Save the scaler to the "Model_Saved" folder; the model expects inputs
# transformed with this exact scaler.
joblib.dump(X_scaler, "Model_Saved/model14_X_scaler.joblib")

Group 4: BE-HDG vs AEC
Training Data Score: 0.9864864864864865
Testing Data Score: 0.47368421052631576
Accuracy: 47.37%


['Model_Saved/model14_X_scaler.joblib']

In [13]:
# Split the data into training and testing sets (20% held out, fixed seed so
# the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X5, y5, test_size=0.2, random_state=0)

# Fit the scaler on the training split only, then apply it to both splits,
# so the test rows do not influence the scaling statistics.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fit, train and test the model.
# `fit` returns the estimator it was called on, so fitting the shared
# `classifier` directly would leave every `modelNN` name pointing at one
# object that is refitted by each training cell. Fitting an unfitted clone
# keeps this model independent in memory as well as on disk.
model15 = clone(classifier).fit(X_train_scaled, y_train)
# The label text is kept verbatim; it spells the 'BE-HGD' and 'EAC' groups
# as 'BE-HDG' and 'AEC'.
print('Group 5: Healthy & BE low vs BE-HDG & AEC')
print(f"Training Data Score: {model15.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model15.score(X_test_scaled, y_test)}")

# Make predictions on the test data
y_pred = model15.predict(X_test_scaled)

# Calculate the accuracy of the model (the testing score again, as a percentage)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Save the model to the "Model_Saved" folder
joblib.dump(model15, "Model_Saved/model15_LogisticRegression.joblib")

# Save the scaler to the "Model_Saved" folder; the model expects inputs
# transformed with this exact scaler.
joblib.dump(X_scaler, "Model_Saved/model15_X_scaler.joblib")

Group 5: Healthy & BE low vs BE-HDG & AEC
Training Data Score: 0.9512195121951219
Testing Data Score: 0.6153846153846154
Accuracy: 61.54%


['Model_Saved/model15_X_scaler.joblib']

In [None]:
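# TODO: draft for exporting scores to results.csv. Kept commented out:
# model_tested, train_score and test_score are not defined anywhere above.
# with open('results.csv', 'w', newline='') as file: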
#      writer = csv.writer(file)
     
#      writer.writerow([model_tested])
#      writer.writerow([train_score])
#      writer.writerow([test_score])