In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the raw training table from the local parquet file.
train_path = './data/catB_train.parquet'
df = pd.read_parquet(train_path)

In [19]:
# Reference year used to turn a year of birth into an age.
REFERENCE_YEAR = 2024

# Drop the rows whose date of birth is the literal string 'None'. The explicit
# .copy() makes `df` an independent frame, so the column assignments below
# write straight into it rather than into a slice of the loaded frame (the
# pattern that triggers pandas' SettingWithCopyWarning).
df = df[df['cltdob_fix'] != 'None'].copy()

# Parse the date-of-birth strings; format='mixed' infers the format per value.
# NOTE(review): positional column 6 is presumably 'cltdob_fix' itself --
# confirm, then select it by name so a column reorder cannot break this.
df['cltdob_fix'] = pd.to_datetime(df.iloc[:, 6], format='mixed')

# Age counted in calendar years only (birth month and day are ignored).
df['age'] = REFERENCE_YEAR - df['cltdob_fix'].dt.year

# Missing values in the target column are filled with 0 (written back to df).
target_col = "f_purchase_lh"
df[target_col] = df[target_col].fillna(0)

# Features are every remaining column; the target is the filled column.
X = df.drop(columns=[target_col])
y = df[target_col]

# Numeric features: keep the int32/int64/float64 columns and fill every
# missing value with the sentinel -1.
numeric_cols = X.select_dtypes(include=["int32", "int64", "float64"]).columns
X_numeric = X[numeric_cols].fillna(-1)

# Text features: keep only the string/object columns that have between 2 and 5
# distinct values (as counted by .unique()), then one-hot encode them as floats.
non_numeric_cols = X.select_dtypes(include=["string", "object"]).columns
non_numeric_keep = [
    name for name in non_numeric_cols if 1 < len(X[name].unique()) <= 5
]
X_non_numeric = pd.get_dummies(X[non_numeric_keep], dtype=float)

# Final feature matrix: numeric block and one-hot block side by side.
X = pd.concat([X_numeric, X_non_numeric], axis=1)

from sklearn.feature_selection import VarianceThreshold
# Fit a variance filter with threshold 0.05 on the feature matrix and keep
# only the columns it marks as supported.
sel = VarianceThreshold(threshold=0.05).fit(X)
kept_columns = X.columns[sel.get_support()]
X = X[kept_columns]

In [20]:
# Use SMOTE / ADASYN oversampling to handle class imbalance
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN

# Hold out the validation set BEFORE any oversampling. Resampling the full
# data set and splitting afterwards leaks information: synthetic points are
# interpolated from rows that later land in the validation split, and the
# validation split itself ends up containing synthetic rows, so its scores
# are not a measure of performance on real data.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Oversample the training portion only.
smote = SMOTE(random_state=0)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

adasyn = ADASYN(random_state=0)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)

# Every variant is scored on the same untouched hold-out, so the plain, SMOTE
# and ADASYN results are directly comparable.
X_val_smote, y_val_smote = X_val, y_val
X_val_adasyn, y_val_adasyn = X_val, y_val

# Names kept for any cell that still refers to them. They now hold the
# resampled TRAINING data, not a resampled copy of the full data set.
X_resampled_smote, y_resampled_smote = X_train_smote, y_train_smote
X_resampled_adasyn, y_resampled_adasyn = X_train_adasyn, y_train_adasyn



Random Forest with grid search

In [28]:
%%time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score,precision_score, recall_score

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Hyperparameter candidates explored by the grid search.
param_grid = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Base forest: 15 trees, class-balanced weights, fixed seed.
rf_model = RandomForestClassifier(
    class_weight='balanced',
    random_state=15,
    n_estimators=15,
)

# Exhaustive search over the grid, scored by F1 with 3-fold cross-validation
# on the training split (fit() returns the fitted search object itself).
grid_search_rf = GridSearchCV(rf_model, param_grid, cv=3, scoring='f1').fit(X_train, y_train)

# Best hyperparameter combination found by the search.
best_params_rf = grid_search_rf.best_params_

# Predict the validation split with the best estimator from the search.
best_rf = grid_search_rf.best_estimator_
y_val_pred_rf = best_rf.predict(X_val)

# Validation metrics for the random forest.
conf_matrix_rf = confusion_matrix(y_val, y_val_pred_rf)
accuracy_rf, f1_rf, precision_rf, recall_rf = (
    metric(y_val, y_val_pred_rf)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)


# Assemble the report line by line and emit it with a single print call.
report_lines = [
    "\nRandom Forest Results:",
    f"Best Hyperparameters: {best_params_rf}",
    f"Accuracy: {accuracy_rf}",
    "Confusion Matrix:",
    str(conf_matrix_rf),
    f"F1 Score: {f1_rf}",
    f"Precision: {precision_rf}",
    f"Recall: {recall_rf}",
]
print("\n".join(report_lines))


Random Forest Results:
Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 2}
Accuracy: 0.9401780745687257
Confusion Matrix:
[[3339  109]
 [ 106   40]]
F1 Score: 0.2711864406779661
Precision: 0.2684563758389262
Recall: 0.273972602739726
CPU times: total: 12 s
Wall time: 12.7 s
