In [67]:
# make the directory two levels above the working directory importable
import sys
from pathlib import Path
# Absolute path of the directory two levels above the current working directory.
project_path = str(Path.cwd().parent.parent.resolve())

# Register it on the import path only once, so re-running the cell does not
# keep appending duplicates.
if project_path not in sys.path:
    sys.path.append(project_path)
    
# imports
from common.utils import get_data, get_preprocessor

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [68]:
# Load the dataset and separate the label column from the feature columns.
data = get_data()
y = data["Target"]
X = data.drop(columns=["Target"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on `y` — confirm that is intended
# given that the notebook studies class imbalance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=6,
)

# Feature names grouped by dtype; both lists are handed to the preprocessor.
numerical_column_names = X.select_dtypes(include=["number"]).columns.tolist()
categorical_column_names = X.select_dtypes(include=["object"]).columns.tolist()

# Random forest with 300 trees and no depth limit, seeded for reproducibility.
model = RandomForestClassifier(n_estimators=300, max_depth=None, random_state=6)

# Preprocessing (built by the project helper from the two column lists)
# followed by the classifier.
pipeline = Pipeline(
    steps=[
        (
            "preprocessing",
            get_preprocessor(numerical_column_names, categorical_column_names),
        ),
        ("classifier", model),
    ]
)

In [69]:
def train_and_pred(pipeline, X_train, y_train, X_eval=None, y_eval=None):
    """Fit ``pipeline`` on the given training data and print evaluation metrics.

    Prints, in order: the class counts of ``y_train``, the per-class
    classification report, and the overall accuracy plus weighted-average
    precision, recall and F1-score on the evaluation data.

    Parameters
    ----------
    pipeline
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so after the call it holds the model trained on ``X_train``.
    X_train, y_train
        Training features and labels. ``y_train`` must provide
        ``value_counts()``.
    X_eval, y_eval
        Optional evaluation features and labels. When omitted, the
        notebook-level hold-out split (``X_test`` / ``y_test``) is used, which
        matches the behaviour of calling the function with three arguments.

    Returns
    -------
    None
        All results are written to standard output.
    """
    # Fall back to the notebook's hold-out split so existing three-argument
    # calls keep evaluating on exactly the same data as before.
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    # Show the class distribution the model is about to be trained on.
    print(y_train.value_counts())
    print()

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_eval)

    print("Classification report")
    print(classification_report(y_eval, y_pred))

    print("\nTotal")
    print(f"\tAccuracy:  {accuracy_score(y_eval, y_pred):.3f}")
    print(f"\tPrecision: {precision_score(y_eval, y_pred, average='weighted'):.3f}")
    print(f"\tRecall:    {recall_score(y_eval, y_pred, average='weighted'):.3f}")
    print(f"\tF1-score:  {f1_score(y_eval, y_pred, average='weighted'):.3f}")

In [70]:
# Baseline: train on the training split as-is, without any resampling.
train_and_pred(pipeline=pipeline, X_train=X_train, y_train=y_train)

Target
Graduate    1766
Dropout     1133
Enrolled     640
Name: count, dtype: int64

Classification report
              precision    recall  f1-score   support

     Dropout       0.81      0.75      0.78       288
    Enrolled       0.57      0.32      0.41       154
    Graduate       0.78      0.93      0.85       443

    accuracy                           0.76       885
   macro avg       0.72      0.67      0.68       885
weighted avg       0.75      0.76      0.75       885


Total
	Accuracy:  0.765
	Precision: 0.750
	Recall:    0.765
	F1-score:  0.748


In [71]:
# Random over-sampling, applied to the training split only so the evaluation
# data is left untouched.
ros = RandomOverSampler(random_state=16)
X_oversampled, y_oversampled = ros.fit_resample(X=X_train, y=y_train)

# Re-fit the same pipeline on the over-sampled training data and report metrics.
train_and_pred(pipeline=pipeline, X_train=X_oversampled, y_train=y_oversampled)

Target
Graduate    1766
Dropout     1766
Enrolled    1766
Name: count, dtype: int64

Classification report
              precision    recall  f1-score   support

     Dropout       0.82      0.74      0.78       288
    Enrolled       0.52      0.47      0.49       154
    Graduate       0.81      0.88      0.84       443

    accuracy                           0.77       885
   macro avg       0.72      0.70      0.71       885
weighted avg       0.76      0.77      0.76       885


Total
	Accuracy:  0.766
	Precision: 0.761
	Recall:    0.766
	F1-score:  0.762


In [72]:
# Random under-sampling, applied to the training split only so the evaluation
# data is left untouched.
rus = RandomUnderSampler(random_state=16)
X_undersampled, y_undersampled = rus.fit_resample(X=X_train, y=y_train)

# Re-fit the same pipeline on the under-sampled training data and report metrics.
train_and_pred(pipeline=pipeline, X_train=X_undersampled, y_train=y_undersampled)

Target
Dropout     640
Enrolled    640
Graduate    640
Name: count, dtype: int64

Classification report
              precision    recall  f1-score   support

     Dropout       0.84      0.70      0.76       288
    Enrolled       0.43      0.71      0.53       154
    Graduate       0.86      0.76      0.81       443

    accuracy                           0.73       885
   macro avg       0.71      0.72      0.70       885
weighted avg       0.78      0.73      0.75       885


Total
	Accuracy:  0.732
	Precision: 0.780
	Recall:    0.732
	F1-score:  0.746
