In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_parquet('./data/catB_train.parquet')

In [3]:
# Add new feature Age
df = df[df['cltdob_fix']!='None']
df['cltdob_fix'] = pd.to_datetime(df.iloc[:, 6], format ='mixed')
df['age'] = 2024-df['cltdob_fix'].dt.year

In [4]:
# Mapping for replacement for categorical data (not hot encoding)
mapping = {
    None: -1,
    'E.BELOW30K': 0,
    'D.30K-60K': 1,
    'C.60K-100K': 2,
    'B.100K-200K': 3,
    'A.ABOVE200K': 4,
}

# Replace values based on the mapping
df['annual_income_est'] = df['annual_income_est'].replace(mapping)

  df['annual_income_est'] = df['annual_income_est'].replace(mapping)


In [5]:
# Target Column
df["f_purchase_lh"] = df["f_purchase_lh"].fillna(0)
y = df["f_purchase_lh"]

# All features 
X = df.drop(columns=['f_purchase_lh'])

In [6]:
# Split numerical and non-numerical columns
numeric_cols = X.select_dtypes(include=["int32", "int64", "float64"]).columns
X_numeric = X[numeric_cols]

In [7]:
# Remove Low-Variance Numerical Variables
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(0.05))
sel.fit(X_numeric)
X_numeric = X_numeric[numeric_cols[sel.get_support()]]

In [8]:
# Fill null values in numeric columns with the median value
X_numeric = X_numeric.apply(lambda x: x.fillna(x.median()))

In [9]:
# Merge with selected non_categorical values
temp = pd.get_dummies(X[['cltsex_fix', 'stat_flag']], dtype=float)
X = pd.concat([X_numeric, temp, df['age']], axis=1)

In [10]:
# Test whether it's imbalanced case
total_row = len(y)
purchase = sum(y)
non_purchase = total_row - purchase
percentage_of_purchase = (purchase/total_row)*100
print(total_row)
print(purchase)
print(f"{percentage_of_purchase}%")

17970
708.0
3.9398998330550916%


In [13]:
# use SMOTE/adasyn to handle imbalance
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

smote = SMOTE(random_state=0)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X, y)

adasyn = ADASYN(random_state=0)
X_resampled_adasyn, y_resampled_adasyn = adasyn.fit_resample(X, y)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
X_train_smote, X_val_smote, y_train_smote, y_val_smote = train_test_split(X_resampled_smote, y_resampled_smote, test_size=0.2, random_state=0)
X_train_adasyn, X_val_adasyn, y_train_adasyn, y_val_adasyn = train_test_split(X_resampled_adasyn, y_resampled_adasyn, test_size=0.2, random_state=0)



# TOP 5 model

random forest with custom threshold

In [11]:
%%time
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score,recall_score

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
custom_threshold = 0.15  # Adjust as needed


# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=0,class_weight='balanced')
rf_model.fit(X_train, y_train)

y_val_prob_rf = rf_model.predict_proba(X_val)[:, 1]
y_val_pred_custom_rf = (y_val_prob_rf > custom_threshold).astype(int)

accuracy_rf = accuracy_score(y_val, y_val_pred_custom_rf)
conf_matrix_rf = confusion_matrix(y_val, y_val_pred_custom_rf)
f1_rf = f1_score(y_val, y_val_pred_custom_rf)
precision = precision_score(y_val, y_val_pred_custom_rf)
recall = recall_score(y_val, y_val_pred_custom_rf)

print("\nRandom Forest Results:")
print(f"Accuracy: {accuracy_rf}")
print("Confusion Matrix:")
print(conf_matrix_rf)
print(f"F1 Score: {f1_rf}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")


Random Forest Results:
Accuracy: 0.9404563160823595
Confusion Matrix:
[[3332  116]
 [  98   48]]
F1 Score: 0.3096774193548387
Precision: 0.2926829268292683
Recall: 0.3287671232876712
CPU times: total: 3.2 s
Wall time: 3.55 s


with oversampling

In [25]:
%%time
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score,recall_score

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
custom_threshold = 0.15  # Adjust as needed

smote = SMOTE(random_state=0)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X, y)

adasyn = ADASYN(random_state=0)
X_resampled_adasyn, y_resampled_adasyn = adasyn.fit_resample(X, y)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
X_train_smote, X_val_smote, y_train_smote, y_val_smote = train_test_split(X_resampled_smote, y_resampled_smote, test_size=0.2, random_state=0)
X_train_adasyn, X_val_adasyn, y_train_adasyn, y_val_adasyn = train_test_split(X_resampled_adasyn, y_resampled_adasyn, test_size=0.2, random_state=0)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=0,class_weight='balanced')
rf_model.fit(X_train, y_train)
rf_model.fit(X_train_smote, y_train_smote)
rf_model.fit(X_train_adasyn, y_train_adasyn)

y_val_prob_rf = rf_model.predict_proba(X_val)[:, 1]
y_val_pred_custom_rf = (y_val_prob_rf > custom_threshold).astype(int)

accuracy_rf = accuracy_score(y_val, y_val_pred_custom_rf)
conf_matrix_rf = confusion_matrix(y_val, y_val_pred_custom_rf)
f1_rf = f1_score(y_val, y_val_pred_custom_rf)
precision = precision_score(y_val, y_val_pred_custom_rf)
recall = recall_score(y_val, y_val_pred_custom_rf)



print("\nRandom Forest Results:")
print(f"Accuracy: {accuracy_rf}")
print("Confusion Matrix:")
print(conf_matrix_rf)
print(f"F1 Score: {f1_rf}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")


Random Forest Results:
Accuracy: 0.9482470784641068
Confusion Matrix:
[[3271  177]
 [   9  137]]
F1 Score: 0.5956521739130435
Precision: 0.43630573248407645
Recall: 0.9383561643835616
CPU times: total: 7.14 s
Wall time: 8.66 s


Logistic with custom threshold and cross validation

In [15]:
%%time
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

logreg_model = LogisticRegression()

# custom threshold
custom_threshold = 0.15

# cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

accuracy_scores = []
conf_matrices = []
f1_scores = []
precision = []
recall = []

# cross-validation
for train_idx, val_idx in cv.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

    logreg_model.fit(X_train_fold, y_train_fold)
    y_val_prob = logreg_model.predict_proba(X_val_fold)[:, 1]

    y_val_pred_custom = (y_val_prob > custom_threshold).astype(int)

    accuracy_fold = accuracy_score(y_val_fold, y_val_pred_custom)
    conf_matrix_fold = confusion_matrix(y_val_fold, y_val_pred_custom)
    f1_fold = f1_score(y_val_fold, y_val_pred_custom)
    precision_ = precision_score(y_val_fold,y_val_pred_custom)
    recall_ = recall_score(y_val_fold,y_val_pred_custom)

    accuracy_scores.append(accuracy_fold)
    conf_matrices.append(conf_matrix_fold)
    f1_scores.append(f1_fold)
    precision.append(precision_)
    recall.append(recall_)

print(f"Average Accuracy with Custom Threshold: {sum(accuracy_scores) / len(accuracy_scores)}")
print("Average Confusion Matrix with Custom Threshold:")
print(sum(conf_matrices) / len(conf_matrices))
print(f"Average F1 Score with Custom Threshold: {sum(f1_scores) / len(f1_scores)}")
print(f"precision = {sum(precision)/len(precision)}")
print(f"recall = {sum(recall)/len(recall)}")


Average Accuracy with Custom Threshold: 0.936839305799117
Average Confusion Matrix with Custom Threshold:
[[2665.    97.8]
 [  83.8   28.6]]
Average F1 Score with Custom Threshold: 0.2395318817738703
precision = 0.22679708157057238
recall = 0.2544563843236409
CPU times: total: 1.75 s
Wall time: 3.46 s


with smote

In [16]:
%%time
import warnings
warnings.filterwarnings('ignore')
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

logreg_model = LogisticRegression()

# custom threshold
custom_threshold = 0.15

# cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

accuracy_scores = []
conf_matrices = []
f1_scores = []
precision = []
recall = []

# cross-validation
for train_idx, val_idx in cv.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

    smote = SMOTE(random_state=0)
    adasyn = ADASYN(random_state=0)

    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)    

    logreg_model.fit(X_train_fold, y_train_fold)
    logreg_model.fit(X_train_smote, y_train_smote)
    logreg_model.fit(X_train_adasyn, y_train_adasyn)
    y_val_prob = logreg_model.predict_proba(X_val_fold)[:, 1]

    y_val_pred_custom = (y_val_prob > custom_threshold).astype(int)

    accuracy_fold = accuracy_score(y_val_fold, y_val_pred_custom)
    conf_matrix_fold = confusion_matrix(y_val_fold, y_val_pred_custom)
    f1_fold = f1_score(y_val_fold, y_val_pred_custom)
    precision_ = precision_score(y_val_fold,y_val_pred_custom)
    recall_ = recall_score(y_val_fold,y_val_pred_custom)

    accuracy_scores.append(accuracy_fold)
    conf_matrices.append(conf_matrix_fold)
    f1_scores.append(f1_fold)
    precision.append(precision_)
    recall.append(recall_)

print(f"Average Accuracy with Custom Threshold: {sum(accuracy_scores) / len(accuracy_scores)}")
print("Average Confusion Matrix with Custom Threshold:")
print(sum(conf_matrices) / len(conf_matrices))
print(f"Average F1 Score with Custom Threshold: {sum(f1_scores) / len(f1_scores)}")
print(f"precision = {sum(precision)/len(precision)}")
print(f"recall = {sum(recall)/len(recall)}")


Average Accuracy with Custom Threshold: 0.38689554332708476
Average Confusion Matrix with Custom Threshold:
[[1012.  1750.8]
 [  12.   100.4]]
Average F1 Score with Custom Threshold: 0.10229361516200805
precision = 0.05425359828072203
recall = 0.8932838179519595
CPU times: total: 11.8 s
Wall time: 14.9 s


3. Random Forest w grid search

In [19]:
%%time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

rf_model = RandomForestClassifier(n_estimators=100, random_state=0, class_weight='balanced')

param_grid = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


# Grid Search for Random Forest
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring='f1', cv=3)
grid_search_rf.fit(X_train, y_train)

# hyperparameters
best_params_rf = grid_search_rf.best_params_

y_val_pred_rf = grid_search_rf.best_estimator_.predict(X_val)

# Random Forest
accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
conf_matrix_rf = confusion_matrix(y_val, y_val_pred_rf)
f1_rf = f1_score(y_val, y_val_pred_rf)
precision_rf = precision_score(y_val, y_val_pred_rf)
recall_rf = recall_score(y_val, y_val_pred_rf)


print("\nRandom Forest Results:")
print(f"Best Hyperparameters: {best_params_rf}")
print(f"Accuracy: {accuracy_rf}")
print("Confusion Matrix:")
print(conf_matrix_rf)
print(f"F1 Score: {f1_rf}")
print(f"Precision: {precision_rf}")
print(f"Recall: {recall_rf}")


Random Forest Results:
Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Accuracy: 0.9062326099053979
Confusion Matrix:
[[3195  253]
 [  84   62]]
F1 Score: 0.26898047722342733
Precision: 0.19682539682539682
Recall: 0.4246575342465753
CPU times: total: 2min 26s
Wall time: 2min 36s


with sampling

4. Random Forest with random search

In [21]:
%%time
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score,precision_score, recall_score
from scipy.stats import randint

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

rf_model = RandomForestClassifier(n_estimators=100, random_state=0, class_weight='balanced')

param_dist = {
    'max_depth': [None, 10, 20],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}

# Random Search for Random Forest
random_search_rf = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, scoring='f1', cv=3, n_iter=10, random_state=0)
random_search_rf.fit(X_train, y_train)

# hyperparameters
best_params_rf = random_search_rf.best_params_
y_val_pred_rf = random_search_rf.best_estimator_.predict(X_val)

# Random Forest
accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
conf_matrix_rf = confusion_matrix(y_val, y_val_pred_rf)
f1_rf = f1_score(y_val, y_val_pred_rf)
precision_rf = precision_score(y_val, y_val_pred_rf)
recall_rf = recall_score(y_val, y_val_pred_rf)

print("\nRandom Forest Results:")
print(f"Best Hyperparameters: {best_params_rf}")
print(f"Accuracy: {accuracy_rf}")
print("Confusion Matrix:")
print(conf_matrix_rf)
print(f"F1 Score: {f1_rf}")
print(f"Precision: {precision_rf}")
print(f"Recall: {recall_rf}")


Random Forest Results:
Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 7}
Accuracy: 0.9012242626599889
Confusion Matrix:
[[3173  275]
 [  80   66]]
F1 Score: 0.27104722792607805
Precision: 0.1935483870967742
Recall: 0.4520547945205479
CPU times: total: 54.3 s
Wall time: 58.8 s


In [22]:
%%time
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score,precision_score, recall_score
from scipy.stats import randint

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
smote = SMOTE(random_state=0)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train, y_train)

adasyn = ADASYN(random_state=0)
X_resampled_adasyn, y_resampled_adasyn = adasyn.fit_resample(X_train, y_train)

# Combine resampled datasets
X_combined_resampled = pd.concat([X_resampled_smote, X_resampled_adasyn])
y_combined_resampled = pd.concat([y_resampled_smote, y_resampled_adasyn])

rf_model = RandomForestClassifier(n_estimators=100, random_state=0, class_weight='balanced')

param_dist = {
    'max_depth': [None, 10, 20],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}

# Random Search for Random Forest
random_search_rf = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, scoring='f1', cv=3, n_iter=10, random_state=0)
random_search_rf.fit(X_combined_resampled, y_combined_resampled)

# hyperparameters
best_params_rf = random_search_rf.best_params_
y_val_pred_rf = random_search_rf.best_estimator_.predict(X_val)

# Random Forest
accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
conf_matrix_rf = confusion_matrix(y_val, y_val_pred_rf)
f1_rf = f1_score(y_val, y_val_pred_rf)
precision_rf = precision_score(y_val, y_val_pred_rf)
recall_rf = recall_score(y_val, y_val_pred_rf)

print("\nRandom Forest Results:")
print(f"Best Hyperparameters: {best_params_rf}")
print(f"Accuracy: {accuracy_rf}")
print("Confusion Matrix:")
print(conf_matrix_rf)
print(f"F1 Score: {f1_rf}")
print(f"Precision: {precision_rf}")
print(f"Recall: {recall_rf}")


Random Forest Results:
Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 3}
Accuracy: 0.9546466332776851
Confusion Matrix:
[[3416   32]
 [ 131   15]]
F1 Score: 0.15544041450777202
Precision: 0.3191489361702128
Recall: 0.10273972602739725
CPU times: total: 3min 4s
Wall time: 3min 34s
