In [73]:
import pandas as pd

# Numeric codes assigned to each diagnosis label in the "Level" column.
LEVEL_CODES = {"Normal": 0, "Benign": 1, "Malignant": 2}

# Load the working dataset and drop the columns that are not used as features.
# (An earlier read of "data/orig_cancer_data.csv" was removed: its result was
# overwritten on the very next line and never used.)
data = pd.read_csv("data/cancer_data.csv").drop(
    columns=["Age", "Gender", "Snoring", "Swallowing Difficulty"]
)

# Series.map silently turns any label missing from the mapping into NaN, so
# fail loudly here instead of passing corrupted targets to later cells.
unknown_levels = set(data["Level"].dropna()) - LEVEL_CODES.keys()
if unknown_levels:
    raise ValueError(
        f"Unexpected 'Level' values with no numeric code: {sorted(map(str, unknown_levels))}"
    )

data = data.assign(Level=data["Level"].map(LEVEL_CODES))

In [74]:
from imblearn.over_sampling import RandomOverSampler

sampler = RandomOverSampler(random_state=4)
X, y = sampler.fit_resample(data.drop("Level", axis=1), data["Level"])

In [75]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4, stratify=y)

In [20]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report

In [72]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbour counts k = 1..50; n_iter below equals the number of
# candidates (50).
param_vals = {"n_neighbors": [k for k in range(1, 51)]}

knn = RandomizedSearchCV(
    estimator=KNeighborsClassifier(),
    param_distributions=param_vals,
    n_iter=50,
)
knn.fit(X_train, y_train)

# Displayed as the cell output: the best-scoring parameter combination.
knn.best_params_

{'n_neighbors': 1}

In [76]:
# Fit a 1-nearest-neighbour classifier (the n_neighbors value reported by the
# search cell) on the training split, then print per-class precision, recall
# and F1 on the test split.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

print(classification_report(y_true=y_test, y_pred=knn.predict(X_test)))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84      7427
           1       0.79      0.72      0.75      7427
           2       0.89      0.91      0.90      7427

    accuracy                           0.83     22281
   macro avg       0.83      0.83      0.83     22281
weighted avg       0.83      0.83      0.83     22281



In [46]:
from sklearn.svm import SVC

# 8 values of C x 6 values of gamma = 48 combinations; n_iter below equals
# that count.
param_vals = {
    "C": [0.1, 0.5, 1, 5, 10, 50, 100, 500],
    "gamma": [0.001, 0.01, 0.1, 1, 10, 100],
}

svc = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=param_vals,
    n_iter=48,
)
svc.fit(X_train, y_train)

# Displayed as the cell output: the best-scoring parameter combination.
svc.best_params_

{'gamma': 0.001, 'C': 50}

In [77]:
# Fit an SVC with the C / gamma values reported by the search cell, then print
# per-class precision, recall and F1 on the test split.
svc = SVC(C=50, gamma=0.001).fit(X_train, y_train)

print(classification_report(y_true=y_test, y_pred=svc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      7427
           1       0.79      0.79      0.79      7427
           2       0.90      0.92      0.91      7427

    accuracy                           0.85     22281
   macro avg       0.85      0.85      0.85     22281
weighted avg       0.85      0.85      0.85     22281



In [48]:
from sklearn.linear_model import LogisticRegression

# 8 values of C x 4 iteration caps = 32 combinations; n_iter below equals
# that count.
param_vals = {
    "C": [0.1, 0.5, 1, 5, 10, 50, 100, 500],
    "max_iter": [1000, 1500, 2000, 2500],
}

lr = RandomizedSearchCV(
    estimator=LogisticRegression(),
    param_distributions=param_vals,
    n_iter=32,
)
lr.fit(X_train, y_train)

# Displayed as the cell output: the best-scoring parameter combination.
lr.best_params_

{'max_iter': 1000, 'C': 50}

In [78]:
# Fit logistic regression with the C / max_iter values reported by the search
# cell, then print per-class precision, recall and F1 on the test split.
lr = LogisticRegression(C=50, max_iter=1000).fit(X_train, y_train)

print(classification_report(y_true=y_test, y_pred=lr.predict(X_test)))

              precision    recall  f1-score   support

           0       0.82      0.84      0.83      7427
           1       0.75      0.70      0.73      7427
           2       0.87      0.91      0.89      7427

    accuracy                           0.82     22281
   macro avg       0.81      0.82      0.81     22281
weighted avg       0.81      0.82      0.81     22281



In [55]:
from sklearn.ensemble import RandomForestClassifier

# 5 forest sizes x 10 depth limits = 50 combinations; n_iter below equals
# that count.
param_vals = {
    "n_estimators": [50, 100, 150, 200, 250],
    "max_depth": [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
}

# Random forests are stochastic: seed both the estimator and the search (same
# seed as the sampler and the train/test split) so that re-running the cell
# selects the same parameters.
rfc = RandomizedSearchCV(
    RandomForestClassifier(random_state=4),
    param_vals,
    n_iter=50,
    random_state=4,
)
rfc.fit(X_train, y_train)

# Displayed as the cell output: the best-scoring parameter combination.
rfc.best_params_

{'n_estimators': 100, 'max_depth': 50}

In [79]:
# Fit the forest with the n_estimators / max_depth values reported by the
# search cell. random_state is fixed (same seed as the sampler and the split)
# so the printed report and the model saved later are reproducible.
rfc = RandomForestClassifier(n_estimators=100, max_depth=50, random_state=4)
rfc.fit(X_train, y_train)

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, rfc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.86      0.91      0.88      7427
           1       0.86      0.79      0.82      7427
           2       0.91      0.94      0.93      7427

    accuracy                           0.88     22281
   macro avg       0.88      0.88      0.88     22281
weighted avg       0.88      0.88      0.88     22281



In [57]:
from sklearn.ensemble import GradientBoostingClassifier

# 6 learning rates x 5 ensemble sizes x 10 depth limits = 300 combinations;
# n_iter below equals that count.
param_vals = {
    "learning_rate": [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
    "n_estimators": [50, 100, 150, 200, 250],
    "max_depth": [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
}

# Seed both the estimator and the search (same seed as the sampler and the
# train/test split) so that re-running the cell selects the same parameters.
gbc = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=4),
    param_vals,
    n_iter=300,
    random_state=4,
)
gbc.fit(X_train, y_train)

# Displayed as the cell output: the best-scoring parameter combination.
gbc.best_params_

{'n_estimators': 50, 'max_depth': 5, 'learning_rate': 0.05}

In [80]:
# Fit gradient boosting with the values reported by the search cell.
# random_state is fixed (same seed as the sampler and the split) so the
# printed report and the model saved later are reproducible.
gbc = GradientBoostingClassifier(
    learning_rate=0.05,
    n_estimators=50,
    max_depth=5,
    random_state=4,
)
gbc.fit(X_train, y_train)

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, gbc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.85      0.86      0.86      7427
           1       0.82      0.79      0.80      7427
           2       0.90      0.93      0.91      7427

    accuracy                           0.86     22281
   macro avg       0.86      0.86      0.86     22281
weighted avg       0.86      0.86      0.86     22281



In [82]:
from joblib import dump

# Persist every fitted classifier to the current working directory with
# joblib compression level 3.
dump(knn, "knn.pkl", compress=3)
dump(svc, "svc.pkl", compress=3)
dump(lr, "lr.pkl", compress=3)
dump(rfc, "rfc.pkl", compress=3)
# Left as the final bare expression so the cell echoes dump's return value.
dump(gbc, "gbc.pkl", compress=3)

['gbc.pkl']