In [7]:
import pandas as pd
# Load the catB_train parquet file (path is relative to the notebook's
# working directory) into `df`, the frame every later cell builds on.
df = pd.read_parquet(path='./data/catB_train.parquet')

In [26]:
from sklearn.feature_selection import VarianceThreshold

# Year used as "today" when turning a birth date into an age.
REFERENCE_YEAR = 2024

# --- Clean the loaded frame --------------------------------------------
# Drop rows whose birth date is the literal string 'None'. The explicit
# .copy() makes the filtered frame independent of the one it was sliced
# from, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
df = df[df['cltdob_fix'] != 'None'].copy()

# Parse the birth date by column *name* (not by position) so the cell keeps
# working if the column order of the parquet file ever changes.
df['cltdob_fix'] = pd.to_datetime(df['cltdob_fix'], format='mixed')
df['age'] = REFERENCE_YEAR - df['cltdob_fix'].dt.year

# --- Target / feature split --------------------------------------------
# A missing target is treated as class 0.
df["f_purchase_lh"] = df["f_purchase_lh"].fillna(0)
y = df["f_purchase_lh"]
X = df.drop(columns=['f_purchase_lh'])

# --- Numeric features ----------------------------------------------------
# Missing numeric values are replaced by the sentinel -1.
numeric_cols = X.select_dtypes(include=["int32", "int64", "float64"]).columns
X_numeric = X[numeric_cols].fillna(-1)

# --- Low-cardinality categorical features --------------------------------
# Keep only string/object columns with 2 to 5 distinct values (a missing
# value counts as a distinct value here) and one-hot encode them.
non_numeric_cols = X.select_dtypes(include=["string", "object"]).columns
non_numeric_keep = [
    col
    for col in non_numeric_cols
    if 1 < X[col].nunique(dropna=False) <= 5
]
X_non_numeric = pd.get_dummies(X[non_numeric_keep], dtype=float)

X = pd.concat([X_numeric, X_non_numeric], axis=1)

# --- Variance filter ------------------------------------------------------
# Discard features whose variance does not exceed 0.05.
# NOTE(review): the selector is fitted on every row, i.e. before the
# train/validation split done in a later cell. Consider fitting it on the
# training fold only so the validation rows do not influence feature choice.
sel = VarianceThreshold(threshold=0.05)
sel.fit(X)
X = X.loc[:, sel.get_support()]

In [27]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable across runs.
# NOTE(review): the split is not stratified on y. Given the class imbalance
# visible in the confusion matrices below, consider passing stratify=y.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=10,
    test_size=0.2,
)

In [28]:
from imblearn.over_sampling import RandomOverSampler

# Oversample the training fold only; the validation fold is left untouched
# so it is scored on the original rows.
ros = RandomOverSampler(random_state=0)
(X_ros, y_ros) = ros.fit_resample(X_train, y_train)

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [42]:
# Bagging ensemble of 20 estimators (no base estimator passed, so the
# library default is used), trained on the oversampled training fold.
bbc = BalancedBaggingClassifier(random_state=20, n_estimators=20)
bbc.fit(X_ros, y_ros)

# Score on the held-out validation fold.
y_val_pred = bbc.predict(X_val)
conf_matrix = confusion_matrix(y_val, y_val_pred)

# One print call, one item per line: heading, confusion matrix, then metrics.
print(
    "\nBalanced Bagging Classifier with ROS Results:",
    conf_matrix,
    f"Accuracy: {accuracy_score(y_val, y_val_pred)}",
    f"Precision: {precision_score(y_val, y_val_pred)}",
    f"Recall: {recall_score(y_val, y_val_pred)}",
    f"F1 Score: {f1_score(y_val, y_val_pred)}",
    sep="\n",
)


Balanced Bagging Classifier with ROS Results:
[[3416   39]
 [ 111   28]]
Accuracy: 0.9582637729549248
Precision: 0.417910447761194
Recall: 0.2014388489208633
F1 Score: 0.27184466019417475


In [33]:
# Bagging ensemble of 20 random forests, trained on the oversampled
# training fold. Both the ensemble and its base estimator are seeded with 0.
bbc_rf = BalancedBaggingClassifier(
    estimator=RandomForestClassifier(random_state=0),
    random_state=0,
    n_estimators=20,
)
bbc_rf.fit(X_ros, y_ros)

# Score on the held-out validation fold.
y_val_pred_rf = bbc_rf.predict(X_val)
conf_matrix_rf = confusion_matrix(y_val, y_val_pred_rf)

# One print call, one item per line: heading, confusion matrix, then metrics.
print(
    "\nBalanced Bagging Classifier with ROS Results:",
    conf_matrix_rf,
    f"Accuracy: {accuracy_score(y_val, y_val_pred_rf)}",
    f"Precision: {precision_score(y_val, y_val_pred_rf)}",
    f"Recall: {recall_score(y_val, y_val_pred_rf)}",
    f"F1 Score: {f1_score(y_val, y_val_pred_rf)}",
    sep="\n",
)


Balanced Bagging Classifier with ROS Results:
[[3431   24]
 [ 113   26]]
Accuracy: 0.9618809126321647
Precision: 0.52
Recall: 0.18705035971223022
F1 Score: 0.2751322751322751


In [40]:
# Bagging ensemble of 20 decision trees, trained on the oversampled
# training fold. Both the ensemble and its base estimator are seeded with 0.
bbc_dt = BalancedBaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=0),
    random_state=0,
    n_estimators=20,
)
bbc_dt.fit(X_ros, y_ros)

# Score on the held-out validation fold.
y_val_pred_dt = bbc_dt.predict(X_val)
conf_matrix_dt = confusion_matrix(y_val, y_val_pred_dt)

# One print call, one item per line: heading, confusion matrix, then metrics.
print(
    "\nBalanced Bagging Classifier with ROS Results:",
    conf_matrix_dt,
    f"Accuracy: {accuracy_score(y_val, y_val_pred_dt)}",
    f"Precision: {precision_score(y_val, y_val_pred_dt)}",
    f"Recall: {recall_score(y_val, y_val_pred_dt)}",
    f"F1 Score: {f1_score(y_val, y_val_pred_dt)}",
    sep="\n",
)


Balanced Bagging Classifier with ROS Results:
[[3419   36]
 [ 118   21]]
Accuracy: 0.9571508069003896
Precision: 0.3684210526315789
Recall: 0.1510791366906475
F1 Score: 0.21428571428571427
