In [36]:
# set up working catalog
import sys
from pathlib import Path
project_path = str(Path().cwd().parent.parent.resolve())
if project_path not in sys.path:
    sys.path.append(project_path)
    
import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
    

# imports
from common.utils import get_data, get_preprocessor
from common.custom_logistic_regression import CustomLogisticRegressionMulticlass

import pandas as pd
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [37]:
data = get_data()

X = data.drop(columns=["Target"])
y = data["Target"]

numerical_column_names = X.select_dtypes(include=["number"]).columns.tolist()
categorical_column_names = X.select_dtypes(include=["object"]).columns.tolist()

preprocessor = get_preprocessor(numerical_column_names, categorical_column_names)

In [38]:
kfold = KFold(n_splits=5, shuffle=True, random_state=6)

datasets = []

for train_indices, test_indices in kfold.split(X, y):
    datasets.append((X.iloc[train_indices], X.iloc[test_indices], y.iloc[train_indices], y.iloc[test_indices]))    

In [39]:
def run_grid_search(model, candidate_params, datasets, cv = 5):
    
    pipeline = Pipeline([
        ("preprocessing", preprocessor),
        ("classifier", model)
    ])
    
    best_params = []
    best_results = []
    
    for dataset in datasets:
    
        grid_search = GridSearchCV(pipeline, candidate_params, cv=cv, n_jobs=-1)
        grid_search.fit(dataset[0], dataset[2])
        
        best_params.append(grid_search.best_params_)

        y_pred = grid_search.predict(dataset[1])
        best_results.append({
            "Accuracy": accuracy_score(y_pred, dataset[3]),
            "Precision": precision_score(y_pred, dataset[3], average="weighted"),
            "Recall": recall_score(y_pred, dataset[3], average="weighted"),
            "F1": f1_score(y_pred, dataset[3], average="weighted"),
        })
        
        
    return pd.DataFrame(best_params).T, pd.DataFrame(best_results).T

In [40]:
model = RandomForestClassifier()

candidate_params = {
    "classifier__n_estimators": [100, 200, 300, 500],
    "classifier__max_depth": [30, None],
    "classifier__min_samples_split": [2, 3],
    # "classifier__min_samples_split": [2, 5, 10],
    # "classifier__min_samples_leaf": [1, 2, 4]
}

params, results = run_grid_search(model, candidate_params, datasets)
display(params)
display(results)

Unnamed: 0,0,1,2,3,4
classifier__max_depth,30.0,,,,
classifier__min_samples_split,3.0,2.0,2.0,2.0,3.0
classifier__n_estimators,300.0,300.0,500.0,200.0,200.0


Unnamed: 0,0,1,2,3,4
Accuracy,0.764972,0.781921,0.767232,0.775141,0.781674
Precision,0.814324,0.850223,0.832801,0.837867,0.844773
Recall,0.764972,0.781921,0.767232,0.775141,0.781674
F1,0.782217,0.803954,0.791855,0.795001,0.803699


In [41]:
model = KNeighborsClassifier()

candidate_params = {
    "classifier__n_neighbors": [3, 5, 7, 9, 11, 15, 31, 45, 61, 75, 101, 257],
    "classifier__p": [1, 2]          # 1 = manhattan, 2 = euclidean
}

params, results = run_grid_search(model, candidate_params, datasets)
display(params)
display(results)

Unnamed: 0,0,1,2,3,4
classifier__n_neighbors,7,7,9,9,15
classifier__p,2,2,2,2,2


Unnamed: 0,0,1,2,3,4
Accuracy,0.694915,0.716384,0.714124,0.708475,0.7319
Precision,0.7369,0.784324,0.781271,0.781232,0.816449
Recall,0.694915,0.716384,0.714124,0.708475,0.7319
F1,0.709736,0.739304,0.736947,0.730165,0.756394


In [42]:
model = LogisticRegression(solver="lbfgs")

candidate_params = {
    "classifier__max_iter": [200, 300],
    "classifier__penalty": [None, "l2"],
    "classifier__C": [0.1, 1.0]
}

params, results = run_grid_search(model, candidate_params, datasets)
display(params)
display(results)

Unnamed: 0,0,1,2,3,4
classifier__C,0.1,1.0,1.0,0.1,0.1
classifier__max_iter,200,200,200,200,200
classifier__penalty,l2,l2,l2,l2,l2


Unnamed: 0,0,1,2,3,4
Accuracy,0.784181,0.768362,0.777401,0.764972,0.799774
Precision,0.820283,0.8176,0.809031,0.823896,0.837643
Recall,0.784181,0.768362,0.777401,0.764972,0.799774
F1,0.796959,0.785436,0.789047,0.784403,0.81287


In [43]:
model = LogisticRegression(solver="lbfgs")

candidate_params = {
    "classifier__max_iter": [200, 300],
    "classifier__penalty": [None, "l2"],
    "classifier__C": [0.1, 1.0]
}

params, results = run_grid_search(model, candidate_params, datasets)
display(params)
display(results)

Unnamed: 0,0,1,2,3,4
classifier__C,0.1,1.0,1.0,0.1,0.1
classifier__max_iter,200,200,200,200,200
classifier__penalty,l2,l2,l2,l2,l2


Unnamed: 0,0,1,2,3,4
Accuracy,0.784181,0.768362,0.777401,0.764972,0.799774
Precision,0.820283,0.8176,0.809031,0.823896,0.837643
Recall,0.784181,0.768362,0.777401,0.764972,0.799774
F1,0.796959,0.785436,0.789047,0.784403,0.81287


In [44]:
# model = CustomLogisticRegressionMulticlass(epochs=700)

# candidate_params = {
#     "classifier__batch_size": [256, 128, 512, None],
#     "classifier__learning_rate": [0.01]
# }

# params, results = run_grid_search(model, candidate_params, datasets ,cv=3)
# display(params)
# display(results)