In [59]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

data = pd.read_csv("../data/orig_cancer_data.csv")
data = pd.read_csv("../data/cancer_data.csv")

data = data.drop(["Age", "Gender", "Snoring", "Swallowing Difficulty"], axis=1)
data["Level"] = data["Level"].map({"Normal": 0, "Benign": 1, "Malignant": 2})

sampler = RandomOverSampler(random_state=4)
X, y = sampler.fit_resample(data.drop("Level", axis=1), data["Level"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4, stratify=y)

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from joblib import load, dump

In [60]:
from sklearn.neighbors import KNeighborsClassifier

# param_vals = {
#     "n_neighbors": list(range(1, 51)),
# }
# knn = GridSearchCV(KNeighborsClassifier(), param_vals, verbose=1)
# knn.fit(X_train, y_train)
# knn.best_params_

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
print(classification_report(y_test, knn.predict(X_test)))

# knn = load("../models/knn.pkl")

              precision    recall  f1-score   support

           0       0.81      0.87      0.84      7427
           1       0.79      0.72      0.75      7427
           2       0.89      0.91      0.90      7427

    accuracy                           0.83     22281
   macro avg       0.83      0.83      0.83     22281
weighted avg       0.83      0.83      0.83     22281



In [65]:
dump(knn, "../models/knn_83.pkl", 3)

['../models/knn_83.pkl']

In [61]:
from sklearn.svm import SVC

# param_vals = {
#     "C": [0.1, 0.5, 1, 5, 10, 50, 100, 500],
#     "gamma": [0.001,0.01,0.1,1,10,100]
# }
# svc = GridSearchCV(SVC(), param_vals, verbose=1)
# svc.fit(X_train, y_train)
# svc.best_params_

svc = SVC(C=50, gamma=0.001)
svc.fit(X_train, y_train)
print(classification_report(y_test, svc.predict(X_test)))

# svc = load("../models/svc.pkl")

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      7427
           1       0.79      0.79      0.79      7427
           2       0.90      0.92      0.91      7427

    accuracy                           0.85     22281
   macro avg       0.85      0.85      0.85     22281
weighted avg       0.85      0.85      0.85     22281



In [67]:
dump(svc, "../models/svc_85.pkl", 3)

['../models/svc_85.pkl']

In [62]:
from sklearn.linear_model import LogisticRegression

# param_vals = {
#     "C": [0.1, 0.5, 1, 5, 10, 50, 100, 500],
#     "max_iter": [1000, 1500, 2000, 2500]
# }
# lr = GridSearchCV(LogisticRegression(), param_vals, verbose=1)
# lr.fit(X_train, y_train)
# lr.best_params_

lr = LogisticRegression(C=50, max_iter=1000)
lr.fit(X_train, y_train)
print(classification_report(y_test, lr.predict(X_test)))

# lr = load("../models/lr.pkl")

              precision    recall  f1-score   support

           0       0.82      0.84      0.83      7427
           1       0.75      0.70      0.73      7427
           2       0.87      0.91      0.89      7427

    accuracy                           0.82     22281
   macro avg       0.81      0.82      0.81     22281
weighted avg       0.81      0.82      0.81     22281



In [68]:
dump(lr, "../models/lr_82.pkl", 3)

['../models/lr_82.pkl']

In [63]:
from sklearn.ensemble import RandomForestClassifier

# param_vals = {
#     "n_estimators": [50, 100, 150, 200, 250],
#     "max_depth": [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# }

# rfc = GridSearchCV(RandomForestClassifier(), param_vals, verbose=1)
# rfc.fit(X_train, y_train)
# rfc.best_params_

rfc = RandomForestClassifier(n_estimators=100, max_depth=50)
rfc.fit(X_train, y_train)
print(classification_report(y_test, rfc.predict(X_test)))

# rfc = load("../models/rfc.pkl")

              precision    recall  f1-score   support

           0       0.86      0.91      0.89      7427
           1       0.86      0.79      0.82      7427
           2       0.92      0.94      0.93      7427

    accuracy                           0.88     22281
   macro avg       0.88      0.88      0.88     22281
weighted avg       0.88      0.88      0.88     22281



In [69]:
dump(rfc, "../models/rfc_88.pkl", 3)

['../models/rfc_88.pkl']

In [64]:
from sklearn.ensemble import GradientBoostingClassifier

# param_vals = {
#     "learning_rate": [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
#     "n_estimators": [50, 100, 150, 200, 250],
#     "max_depth": [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# }
# gbc = GridSearchCV(GradientBoostingClassifier(), param_vals, verbose=1)
# gbc.fit(X_train, y_train)
# gbc.best_params_

gbc = GradientBoostingClassifier(learning_rate=0.15, n_estimators=100, max_depth=5)
gbc.fit(X_train, y_train)
print(classification_report(y_test, gbc.predict(X_test)))

# gbc = load("../models/gbc.pkl")

              precision    recall  f1-score   support

           0       0.87      0.87      0.87      7427
           1       0.83      0.81      0.82      7427
           2       0.91      0.94      0.93      7427

    accuracy                           0.87     22281
   macro avg       0.87      0.87      0.87     22281
weighted avg       0.87      0.87      0.87     22281



In [70]:
dump(gbc, "../models/gbc_87.pkl", 3)

['../models/gbc_87.pkl']

In [71]:
from sklearn.ensemble import StackingClassifier

sc = StackingClassifier(
    estimators=[
        ("knn", knn),
        ("svc", svc),
        ("lr", lr),
        ("rfc", rfc),
        ("gbc", gbc),
    ],
    final_estimator=rfc,
)
sc.fit(X_train, y_train)
print(classification_report(y_test, sc.predict(X_test)))

# sc = load("../models/sc.pkl")

              precision    recall  f1-score   support

           0       0.86      0.87      0.87      7427
           1       0.83      0.81      0.82      7427
           2       0.92      0.94      0.93      7427

    accuracy                           0.87     22281
   macro avg       0.87      0.87      0.87     22281
weighted avg       0.87      0.87      0.87     22281



In [74]:
dump(sc, "../models/sc.pkl", 3)

['../models/sc_87.pkl']