In [75]:
# set up working catalog
import sys
from pathlib import Path
project_path = str(Path().cwd().parent.parent.resolve())
if project_path not in sys.path:
    sys.path.append(project_path)
    
# imports
from common.utils import get_data, get_preprocessor

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

In [76]:
data = get_data()

X = data.drop(columns=["Target"])
y = data["Target"]

y.value_counts()

Target
Graduate    2209
Dropout     1421
Enrolled     794
Name: count, dtype: int64

In [77]:
ros = RandomOverSampler(random_state=42)
X_oversampled, y_oversampled = ros.fit_resample(X, y)

y_oversampled.value_counts()

Target
Dropout     2209
Graduate    2209
Enrolled    2209
Name: count, dtype: int64

In [78]:
ros = RandomUnderSampler(random_state=42)
X_undersampled, y_undersampled = ros.fit_resample(X, y)

y_undersampled.value_counts()

Target
Dropout     794
Enrolled    794
Graduate    794
Name: count, dtype: int64

In [79]:
numerical_column_names = X.select_dtypes(include=["number"]).columns.tolist()
categorical_column_names = X.select_dtypes(include=["object"]).columns.tolist()

model = RandomForestClassifier(random_state=6, max_depth=None, n_estimators=300)

pipeline = Pipeline([
    ("preprocessing", get_preprocessor(numerical_column_names, categorical_column_names)),
    ("classifier", model)
])

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))

print("\nTotal")
print(f"\tPrecision: {precision_score(y_test, y_pred, average='weighted'):.3f}")
print(f"\tRecall:    {recall_score(y_test, y_pred, average='weighted'):.3f}")
print(f"\tF1-score:  {f1_score(y_test, y_pred, average='weighted'):.3f}")

              precision    recall  f1-score   support

     Dropout       0.81      0.75      0.78       288
    Enrolled       0.57      0.32      0.41       154
    Graduate       0.78      0.93      0.85       443

    accuracy                           0.76       885
   macro avg       0.72      0.67      0.68       885
weighted avg       0.75      0.76      0.75       885


Total
	Precision: 0.750
	Recall:    0.765
	F1-score:  0.748


In [81]:
X_oversampled_train, X_oversampled_test, y_oversampled_train, y_oversampled_test = train_test_split(
    X_oversampled, y_oversampled, test_size=0.2, random_state=6
)

pipeline.fit(X_oversampled_train, y_oversampled_train)
y__oversampled_pred = pipeline.predict(X_oversampled_test)

print(classification_report(y_oversampled_test, y__oversampled_pred))

print("\nTotal")
print(f"\tPrecision: {precision_score(y_oversampled_test, y__oversampled_pred, average='weighted'):.3f}")
print(f"\tRecall:    {recall_score(y_oversampled_test, y__oversampled_pred, average='weighted'):.3f}")
print(f"\tF1-score:  {f1_score(y_oversampled_test, y__oversampled_pred, average='weighted'):.3f}")

              precision    recall  f1-score   support

     Dropout       0.94      0.87      0.90       441
    Enrolled       0.89      0.97      0.93       433
    Graduate       0.92      0.91      0.91       452

    accuracy                           0.91      1326
   macro avg       0.92      0.92      0.91      1326
weighted avg       0.92      0.91      0.91      1326


Total
	Precision: 0.916
	Recall:    0.915
	F1-score:  0.914


In [82]:
X_undersampled_train, X_undersampled_test, y_undersampled_train, y_undersampled_test = train_test_split(
    X_undersampled, y_undersampled, test_size=0.2, random_state=6
)

pipeline.fit(X_undersampled_train, y_undersampled_train)
y_undersampled_pred = pipeline.predict(X_undersampled_test)

print(classification_report(y_undersampled_test, y_undersampled_pred))

print("\nTotal")
print(f"\tPrecision: {precision_score(y_undersampled_test, y_undersampled_pred, average='weighted'):.3f}")
print(f"\tRecall:    {recall_score(y_undersampled_test, y_undersampled_pred, average='weighted'):.3f}")
print(f"\tF1-score:  {f1_score(y_undersampled_test, y_undersampled_pred, average='weighted'):.3f}")

              precision    recall  f1-score   support

     Dropout       0.74      0.75      0.75       140
    Enrolled       0.73      0.61      0.67       178
    Graduate       0.72      0.85      0.78       159

    accuracy                           0.73       477
   macro avg       0.73      0.74      0.73       477
weighted avg       0.73      0.73      0.73       477


Total
	Precision: 0.732
	Recall:    0.732
	F1-score:  0.728
