In [184]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix

from src.model_training_utils import tune_hyperparameters
from IPython.display import display

# Data Import

In [188]:
data = pd.read_csv('../data/final_set.csv')
data.drop("borrow_duration", axis=1, inplace=True)
numerical_features = ['distance', 'not_on_time_rate']
categorical_features = ['gender', 'education', 'occupation', 'price_range', 'pages_range', 'season', 'name']

## Logistic Regression
###
### Prepare Data and Define Pipeline

In [189]:
x = data.drop("borrow_duration_label", axis=1)
y = data['borrow_duration_label']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


preprocessor = ColumnTransformer(transformers=[
    ('num', 'passthrough', numerical_features),
    ('cat', OneHotEncoder(), categorical_features),
])

pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000))
    ]
)


### Model Training and Test

In [190]:
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

display(pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)))
display(pd.DataFrame(confusion_matrix(y_test, y_pred)))

Unnamed: 0,not on time,on time,accuracy,macro avg,weighted avg
precision,0.648649,0.825243,0.806358,0.736946,0.785432
recall,0.307692,0.951493,0.806358,0.629592,0.806358
f1-score,0.417391,0.883882,0.806358,0.650637,0.778719
support,78.0,268.0,0.806358,346.0,346.0


Unnamed: 0,0,1
0,24,54
1,13,255


## RandomForest
###
### Data Preparation

In [191]:
label_encoder = LabelEncoder()

encoded_df = pd.get_dummies(data[["gender", "education", "occupation", "season"]])

for col in ['pages_range', "borrow_duration_label"]:
    encoded_df[col] = label_encoder.fit_transform(data[col])

encoded_df["distance"] = label_encoder.fit_transform(pd.qcut(data["distance"], q=[0, 0.1, 0.5, 0.75, 1]))

encoded_df["name"] = data["name"]
mapper = encoded_df.groupby('name')["borrow_duration_label"].mean()
encoded_df["name"] = encoded_df["name"].map(mapper)
    
x = encoded_df.drop("borrow_duration_label", axis=1)
y = encoded_df['borrow_duration_label']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

### Model Training and Testing

In [192]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

display(pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)))
display(pd.DataFrame(confusion_matrix(y_test, y_pred)))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.384615,0.794788,0.748555,0.589702,0.702322
recall,0.192308,0.910448,0.748555,0.551378,0.748555
f1-score,0.25641,0.848696,0.748555,0.552553,0.715175
support,78.0,268.0,0.748555,346.0,346.0


Unnamed: 0,0,1
0,15,63
1,24,244


### Tune Hyperparameters

In [193]:
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 5, 10, 15],
    'class_weight': [None, 'balanced'],
    'max_features': [6, 7, 8, 9, 10, 11, 12],
}

best_model = tune_hyperparameters(model=rf_model, parameters=param_grid, x_train=X_train, y_train=y_train)

y_pred = best_model.predict(X_test)

display(pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)))
display(pd.DataFrame(confusion_matrix(y_test, y_pred)))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters:  {'n_estimators': 600, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 9, 'max_depth': 40, 'class_weight': None}


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.777778,0.821317,0.817919,0.799547,0.811502
recall,0.269231,0.977612,0.817919,0.623421,0.817919
f1-score,0.4,0.892675,0.817919,0.646337,0.781609
support,78.0,268.0,0.817919,346.0,346.0


Unnamed: 0,0,1
0,21,57
1,6,262


## Gradient Boosting Classifier
###
### Model Training and Testing

In [194]:
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

gb_model.fit(X_train, y_train)

y_pred = gb_model.predict(X_test)

display(pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)))
display(pd.DataFrame(confusion_matrix(y_test, y_pred)))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.65625,0.818471,0.803468,0.737361,0.781901
recall,0.269231,0.958955,0.803468,0.614093,0.803468
f1-score,0.381818,0.883162,0.803468,0.63249,0.770142
support,78.0,268.0,0.803468,346.0,346.0


Unnamed: 0,0,1
0,21,57
1,11,257


### Tune Hyperparameters

In [195]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'learning_rate': [0.001, 0.01, 0.1],
    'subsample': [0.8, 0.9, 1.0],   
}

best_model = tune_hyperparameters(model=gb_model, parameters=param_grid, x_train=X_train, y_train=y_train)

y_pred = best_model.predict(X_test)

display(pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)))
display(pd.DataFrame(confusion_matrix(y_test, y_pred)))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters:  {'subsample': 1.0, 'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 5, 'learning_rate': 0.01}


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.689655,0.817035,0.806358,0.753345,0.788319
recall,0.25641,0.966418,0.806358,0.611414,0.806358
f1-score,0.373832,0.88547,0.806358,0.629651,0.77013
support,78.0,268.0,0.806358,346.0,346.0


Unnamed: 0,0,1
0,20,58
1,9,259
