In [1]:
# set up working catalog
import sys
from pathlib import Path
project_path = str(Path().cwd().parent.parent.resolve())
if project_path not in sys.path:
    sys.path.append(project_path)
    
import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
    

# imports
from common.utils import get_data, get_preprocessor
from common.custom_logistic_regression import CustomLogisticRegressionMulticlass

import pandas as pd
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
data = get_data()

X = data.drop(columns=["Target"])
y = data["Target"]

numerical_column_names = X.select_dtypes(include=["number"]).columns.tolist()
categorical_column_names = X.select_dtypes(include=["object"]).columns.tolist()

preprocessor = get_preprocessor(numerical_column_names, categorical_column_names)

In [3]:
kfold = KFold(n_splits=5, shuffle=True, random_state=6)

datasets = []

for train_indices, test_indices in kfold.split(X, y):
    datasets.append((X.iloc[train_indices], X.iloc[test_indices], y.iloc[train_indices], y.iloc[test_indices]))    

In [4]:
def run_grid_search(model, candidate_params, datasets, cv = 5):
    
    pipeline = Pipeline([
        ("preprocessing", preprocessor),
        ("classifier", model)
    ])
    
    best_params = []
    best_results = []
    
    for dataset in datasets:
    
        grid_search = GridSearchCV(pipeline, candidate_params, cv=cv, n_jobs=-1)
        grid_search.fit(dataset[0], dataset[2])
        
        best_params.append(grid_search.best_params_)

        y_pred = grid_search.predict(dataset[1])
        best_results.append({
            "Accuracy": accuracy_score(y_pred, dataset[3]),
            "Precision": precision_score(y_pred, dataset[3], average="weighted"),
            "Recall": recall_score(y_pred, dataset[3], average="weighted"),
            "F1": f1_score(y_pred, dataset[3], average="weighted"),
        })
        
        
    return pd.DataFrame(best_params).T, pd.DataFrame(best_results).T

In [5]:
model = RandomForestClassifier()

candidate_params = {
    "classifier__n_estimators": [100, 200, 300, 500],
    "classifier__max_depth": [30, None],
    "classifier__min_samples_split": [2, 3],
    # "classifier__min_samples_split": [2, 5, 10],
    # "classifier__min_samples_leaf": [1, 2, 4]
}

params, results = run_grid_search(model, candidate_params, datasets)
display(params)
display(results)

Unnamed: 0,0,1,2,3,4
classifier__max_depth,30.0,30.0,,30.0,30.0
classifier__min_samples_split,2.0,3.0,3.0,2.0,2.0
classifier__n_estimators,100.0,500.0,200.0,500.0,500.0


Unnamed: 0,0,1,2,3,4
Accuracy,0.763842,0.770621,0.776271,0.776271,0.79638
Precision,0.810269,0.842958,0.82829,0.851834,0.851387
Recall,0.763842,0.770621,0.776271,0.776271,0.79638
F1,0.780112,0.794709,0.795232,0.800258,0.814733


In [6]:
model = KNeighborsClassifier()

candidate_params = {
    "classifier__n_neighbors": [3, 5, 7, 9, 11, 15, 31, 45, 61, 75, 101, 257],
    "classifier__p": [1, 2]          # 1 = manhattan, 2 = euclidean
}

params, results = run_grid_search(model, candidate_params, datasets)
display(params)
display(results)

Unnamed: 0,0,1,2,3,4
classifier__n_neighbors,7,7,9,9,15
classifier__p,2,2,2,2,2


Unnamed: 0,0,1,2,3,4
Accuracy,0.694915,0.716384,0.714124,0.708475,0.7319
Precision,0.7369,0.784324,0.781271,0.781232,0.816449
Recall,0.694915,0.716384,0.714124,0.708475,0.7319
F1,0.709736,0.739304,0.736947,0.730165,0.756394


In [12]:
model = LogisticRegression()

candidate_params = [
    {
        "classifier__max_iter": [200, 300],
        "classifier__penalty": [None],
        "classifier__C": [0.1, 1.0],
        "classifier__solver": ["lbfgs"]
    },
    {
        "classifier__max_iter": [200, 300],
        "classifier__penalty": ["l1", "l2"],
        "classifier__C": [0.1, 1.0],
        "classifier__solver": ["liblinear"]
    }
    
]

params, results = run_grid_search(model, candidate_params, datasets)
display(params)
display(results)

Unnamed: 0,0,1,2,3,4
classifier__C,1.0,1.0,1.0,0.1,0.1
classifier__max_iter,200,200,200,200,200
classifier__penalty,l2,l1,l1,l2,l2
classifier__solver,liblinear,liblinear,liblinear,liblinear,liblinear


Unnamed: 0,0,1,2,3,4
Accuracy,0.774011,0.760452,0.770621,0.762712,0.790724
Precision,0.817423,0.822228,0.81978,0.836119,0.847124
Recall,0.774011,0.760452,0.770621,0.762712,0.790724
F1,0.789304,0.782633,0.788959,0.786666,0.81015


In [13]:
model = CustomLogisticRegressionMulticlass(epochs=700)

candidate_params = {
    "classifier__batch_size": [128, 256, 512, None],
    "classifier__learning_rate": [0.01]
}

params, results = run_grid_search(model, candidate_params, datasets ,cv=3)
display(params)
display(results)

Unnamed: 0,0,1,2,3,4
classifier__batch_size,128.0,128.0,128.0,128.0,128.0
classifier__learning_rate,0.01,0.01,0.01,0.01,0.01


Unnamed: 0,0,1,2,3,4
Accuracy,0.780791,0.764972,0.769492,0.759322,0.791855
Precision,0.82942,0.837702,0.825094,0.830252,0.843355
Recall,0.780791,0.764972,0.769492,0.759322,0.791855
F1,0.797578,0.790825,0.790128,0.782883,0.809517
