In [6]:
# add the project directory to sys.path so that project-local packages can be imported
import sys
from pathlib import Path
# Resolve the directory two levels above the current working directory and
# register it on the import path once, so re-running this cell does not add
# duplicate entries. Presumably this is what makes the `common` package
# imported below resolvable; verify against the repository layout.
project_path = str(Path.cwd().parent.parent.resolve())
if sys.path.count(project_path) == 0:
    sys.path.append(project_path)
    

# imports
from common.utils import get_data, get_preprocessor

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.calibration import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [7]:
# Load the dataset and separate the features from the "Target" label column.
data = get_data()

X = data.drop(columns=["Target"])
y = data["Target"]

# Encode the class labels as integers. The fitted encoder is kept around because
# labelEncoder.classes_ supplies the class names for the classification reports;
# encoded label i corresponds to labelEncoder.classes_[i].
# NOTE(review): LabelEncoder is imported from sklearn.calibration in the import
# cell; its documented public location is sklearn.preprocessing.
labelEncoder = LabelEncoder()
y_encoded = labelEncoder.fit_transform(y)

# Column names grouped by dtype, handed to the project's preprocessor factory.
numerical_column_names = X.select_dtypes(include=["number"]).columns.tolist()
categorical_column_names = X.select_dtypes(include=["object"]).columns.tolist()

preprocessor = get_preprocessor(numerical_column_names, categorical_column_names)

# Split on the encoded labels. Previously y_encoded was computed but the raw
# `y` was split, leaving the encoding unused and the report's target_names
# aligned with the labels only through their implicit sort order. The rows
# selected by the split are unchanged (same test_size and random_state).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=6
)

In [8]:
def run_grid_search(model, candidate_params, cv = 5, error_score=float("nan")):
    """Tune `model` inside the preprocessing pipeline and evaluate it on the test split.

    The function reads these notebook-level variables, which must already be
    defined: `preprocessor`, `X_train`, `y_train`, `X_test`, `y_test` and
    `labelEncoder`.

    Parameters
    ----------
    model : estimator
        Classifier placed in the pipeline step named "classifier".
    candidate_params : dict
        Parameter grid for GridSearchCV. Keys address pipeline steps,
        e.g. "classifier__n_estimators".
    cv : int, default 5
        Cross-validation setting forwarded to GridSearchCV.
    error_score : "raise" or number, default NaN
        Forwarded to GridSearchCV. The default matches GridSearchCV's own
        default: a candidate whose fit fails is scored NaN and the search
        continues. Pass "raise" to surface the underlying exception instead,
        which is useful when the search output contains NaN scores.

    Returns
    -------
    tuple
        `(best_params, report)`: the best parameter combination found by the
        search, and the classification report for the refit best estimator's
        predictions on `X_test`.
    """
    pipeline = Pipeline(steps=[
        ("preprocessing", preprocessor),
        ("classifier", model),
    ])

    grid_search = GridSearchCV(
        pipeline,
        candidate_params,
        cv=cv,
        n_jobs=-1,
        error_score=error_score,
    )
    grid_search.fit(X_train, y_train)

    # predict() delegates to the best estimator refit by the search.
    y_pred = grid_search.predict(X_test)
    report = classification_report(
        y_test, y_pred, target_names=labelEncoder.classes_
    )

    return grid_search.best_params_, report

In [9]:
# Random forest: grid over the number of trees and the maximum tree depth.
model = RandomForestClassifier(random_state=6)

# Keys use the "<pipeline step>__<parameter>" form expected by the pipeline.
# Further candidates, currently not searched:
#   classifier__min_samples_split in [2, 5, 10]
#   classifier__min_samples_leaf in [1, 2, 4]
candidate_params = dict(
    classifier__n_estimators=[50, 100, 200, 300, 500],
    classifier__max_depth=[None, 10, 20, 30],
)

best_params, classification = run_grid_search(model, candidate_params)

# Report the winning parameter combination, then the test-set metrics.
print("Best params:")
for param in best_params:
    value = best_params[param]
    print(f"\t{param}: {value}")
print("\nClassification:", classification, sep="\n")

Best params:
	classifier__max_depth: None
	classifier__n_estimators: 300

Classification:
              precision    recall  f1-score   support

     Dropout       0.81      0.75      0.78       288
    Enrolled       0.57      0.32      0.41       154
    Graduate       0.78      0.93      0.85       443

    accuracy                           0.76       885
   macro avg       0.72      0.67      0.68       885
weighted avg       0.75      0.76      0.75       885



In [10]:
# k-nearest neighbours: grid over the neighbourhood size and the Minkowski power.
model = KNeighborsClassifier()

# NOTE(review): the saved output of this cell lists nan for every other
# candidate score, which looks like all fits for one of the two `p` values
# failing. The cause is not visible in this notebook; confirm before relying
# on the selected `p`.
candidate_params = dict(
    classifier__n_neighbors=[3, 5, 7, 9, 11, 15, 31, 45, 61, 75, 101, 257],
    classifier__p=[1, 2],  # 1 = manhattan, 2 = euclidean
)

best_params, classification = run_grid_search(model, candidate_params)

# Report the winning parameter combination, then the test-set metrics.
print("Best params:")
for param in best_params:
    value = best_params[param]
    print(f"\t{param}: {value}")
print("\nClassification:", classification, sep="\n")

        nan 0.71460816        nan 0.71461015        nan 0.71545881
        nan 0.70924652        nan 0.70387689        nan 0.69850766
        nan 0.69285475        nan 0.68239717        nan 0.65922654]


Best params:
	classifier__n_neighbors: 15
	classifier__p: 2

Classification:
              precision    recall  f1-score   support

     Dropout       0.82      0.65      0.73       288
    Enrolled       0.40      0.21      0.27       154
    Graduate       0.72      0.93      0.81       443

    accuracy                           0.72       885
   macro avg       0.65      0.60      0.60       885
weighted avg       0.70      0.72      0.69       885



# Classification metrics
* accuracy - (TP+TN) / (TP+TN+FP+FN)
* precision - TP / (TP+FP)
* recall - TP / (TP+FN)