In [1]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split


def load_data(path: str):
    """Load the cancer dataset and return a leakage-free train/test split.

    The CSV at ``path`` is read, the 'Age', 'Gender', 'Snoring' and
    'Swallowing Difficulty' columns are dropped, and the 'Level' target is
    encoded as Normal=0, Benign=1, Malignant=2.

    The data is split (80/20, stratified on the target) *before* random
    oversampling, and only the training portion is oversampled. Oversampling
    first would place copies of the same row in both partitions, which leaks
    test samples into training and inflates evaluation scores.

    Args:
        path: Path to the CSV file.

    Returns:
        ``[X_train, X_test, y_train, y_test]`` where the training pair has
        been class-balanced by random oversampling and the test pair keeps
        the original class distribution.

    Raises:
        ValueError: If 'Level' contains a value other than 'Normal',
            'Benign' or 'Malignant' (including missing values).
    """
    data = pd.read_csv(path).drop(['Age', 'Gender', 'Snoring', 'Swallowing Difficulty'], axis=1)

    levels = data['Level'].map({'Normal': 0, 'Benign': 1, 'Malignant': 2})
    # .map() silently yields NaN for labels missing from the mapping; fail loudly instead.
    unmapped = data.loc[levels.isna(), 'Level'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected values in 'Level' column: {list(unmapped)}")
    data['Level'] = levels

    X_train, X_test, y_train, y_test = train_test_split(
        data.drop('Level', axis=1),
        data['Level'],
        test_size=0.2,
        random_state=7,
        stratify=data['Level'],
    )

    # Balance classes on the training portion only so no test row is duplicated into it.
    sampler = RandomOverSampler(random_state=3)
    X_train, y_train = sampler.fit_resample(X_train, y_train)

    return [X_train, X_test, y_train, y_test]

In [6]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier


import json
from sklearn.model_selection import GridSearchCV
from joblib import load, dump

In [None]:
# Only the training portion is needed for hyperparameter search.
X_train, _, y_train, _ = load_data('../data/small_cancer_data.csv')

# Values shared by both halves of the logistic-regression grid below.
log_common_grid = {
    'C': [10 ** i for i in range(-3, 4)],
    'max_iter': [i * 500 for i in range(1, 6)]
}

# Each entry is (enabled, name, cls, param_grid):
#   enabled    - truthy to include the model in the search, falsy to skip it
#   name       - short key under which the best parameters are stored
#   cls        - estimator class to instantiate
#   param_grid - grid (dict, or list of dicts) handed to GridSearchCV
tuning_specs = [
    (1, 'ada', AdaBoostClassifier, {
        'estimator': [DecisionTreeClassifier(), ExtraTreeClassifier()],
        'n_estimators': [i * 50 for i in range(1, 5)],
        'learning_rate': [i * 0.05 for i in range(1, 11)]
    }),
    (1, 'gbc', GradientBoostingClassifier, {
        'learning_rate': [i * 0.05 for i in range(1, 11)],
        'n_estimators': [i * 50 for i in range(1, 5)],
        'max_depth': [i * 5 for i in range(1, 4)]
    }),
    (1, 'rfc', RandomForestClassifier, {
        'n_estimators': [i * 100 for i in range(1, 11)],
        'max_depth': [i * 5 for i in range(1, 4)],
        'min_samples_split': [i * 2 for i in range(1, 6)],
        'min_samples_leaf': [i * 2 for i in range(1, 6)],
        'max_features': ['sqrt', 'log2']
    }),
    # The default 'lbfgs' solver does not support the l1 penalty, so every l1
    # candidate would fail to fit. The l1 half of the grid therefore pins the
    # 'saga' solver; the l2 half keeps the default solver unchanged.
    (1, 'log', LogisticRegression, [
        {'penalty': ['l2'], **log_common_grid},
        {'penalty': ['l1'], 'solver': ['saga'], **log_common_grid}
    ]),
    # GaussianNB is fitted with its defaults (empty grid -> single candidate).
    (1, 'gnb', GaussianNB, {}),
    (1, 'knn', KNeighborsClassifier, {
        'n_neighbors': [i * 2 for i in range(2, 11)],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree'],
        'leaf_size': [i * 10 for i in range(1, 11)]
    }),
    (1, 'mlp', MLPClassifier, {
        'hidden_layer_sizes': [(i * 10,) for i in range(1, 5)],
        'activation': ['tanh', 'relu'],
        'solver': ['lbfgs', 'sgd', 'adam'],
        'alpha': [10 ** i for i in range(-4, 0)],
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
        'batch_size': [i * 100 for i in range(1, 11)]
    }),
    (1, 'svc', SVC, {
        'C': [10 ** i for i in range(-3, 4)],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': [10 ** i for i in range(-3, 4)]
    })
]
# Start from previously saved results so that disabled models keep their
# earlier best parameters. On a first run there is no results file yet, so
# fall back to an empty mapping instead of failing with FileNotFoundError.
try:
    with open('../models/tuned_hyperparams.json', 'r') as f:
        tuned_hyperparams = json.load(f)
except FileNotFoundError:
    tuned_hyperparams = {}

def _save_tuned_hyperparams():
    """Write ``tuned_hyperparams`` to the results JSON file.

    The payload is serialized to a string *before* the file is opened for
    writing, so a serialization error cannot leave a truncated file behind.
    Values that JSON cannot encode natively (e.g. the estimator instances
    selected for AdaBoost's 'estimator' parameter) are stored as their repr.
    """
    payload = json.dumps(tuned_hyperparams, default=repr)
    with open('../models/tuned_hyperparams.json', 'w') as f:
        f.write(payload)


# Run a grid search for every enabled spec and record its best parameters.
for enabled, name, cls, param_grid in tuning_specs:
    if not enabled:
        continue

    print(name.upper().center(35, '-'))

    # Not every estimator accepts random_state (GaussianNB and
    # KNeighborsClassifier do not), so only pass the seed where it is a
    # declared constructor parameter.
    seed_kwargs = {'random_state': 3} if 'random_state' in cls().get_params() else {}

    model = GridSearchCV(cls(**seed_kwargs), param_grid, n_jobs=2, verbose=2)
    model.fit(X_train, y_train)
    tuned_hyperparams[name] = model.best_params_

    # Checkpoint after each model so a later failure does not discard
    # the searches that already finished.
    _save_tuned_hyperparams()

# Final write: guarantees the file reflects the in-memory results even when
# every spec was disabled.
_save_tuned_hyperparams()