In [1]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_parquet('./data/catB_train.parquet')

In [2]:
# Add new feature Age
df = df[df['cltdob_fix']!='None']
df['cltdob_fix'] = pd.to_datetime(df.iloc[:, 6], format ='mixed')
df['age'] = 2024-df['cltdob_fix'].dt.year

In [3]:
# Mapping for replacement for categorical data (not hot encoding)
mapping = {
    None: -1,
    'E.BELOW30K': 0,
    'D.30K-60K': 1,
    'C.60K-100K': 2,
    'B.100K-200K': 3,
    'A.ABOVE200K': 4,
}

# Replace values based on the mapping
df['annual_income_est'] = df['annual_income_est'].replace(mapping)

  df['annual_income_est'] = df['annual_income_est'].replace(mapping)


In [4]:
# Target Column
df["f_purchase_lh"] = df["f_purchase_lh"].fillna(0)
y = df["f_purchase_lh"]

# All features 
X = df.drop(columns=['f_purchase_lh'])

In [5]:
# Split numerical and non-numerical columns
numeric_cols = X.select_dtypes(include=["int32", "int64", "float64"]).columns
X_numeric = X[numeric_cols]

In [6]:
# Remove Low-Variance Numerical Variables
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(0.05))
sel.fit(X_numeric)
X_numeric = X_numeric[numeric_cols[sel.get_support()]]

In [7]:
# Fill null values in numeric columns with the median value
X_numeric = X_numeric.apply(lambda x: x.fillna(x.median()))

In [8]:
# Merge with selected non_categorical values
temp = pd.get_dummies(X[['cltsex_fix', 'stat_flag']], dtype=float)
X = pd.concat([X_numeric, temp, df['age']], axis=1)

In [9]:
# Test whether it's imbalanced case
total_row = len(y)
purchase = sum(y)
non_purchase = total_row - purchase
percentage_of_purchase = (purchase/total_row)*100
print(total_row)
print(purchase)
print(f"{percentage_of_purchase}%")

17970
708.0
3.9398998330550916%


In [14]:
# use SMOTE/adasyn to handle imbalance
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

smote = SMOTE(random_state=0)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X, y)

adasyn = ADASYN(random_state=0)
X_resampled_adasyn, y_resampled_adasyn = adasyn.fit_resample(X, y)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
X_train_smote, X_val_smote, y_train_smote, y_val_smote = train_test_split(X_resampled_smote, y_resampled_smote, test_size=0.2, random_state=0)
X_train_adasyn, X_val_adasyn, y_train_adasyn, y_val_adasyn = train_test_split(X_resampled_adasyn, y_resampled_adasyn, test_size=0.2, random_state=0)



Before: Counter({0.0: 13814, 1.0: 562})
After: Counter({0.0: 13843, 1.0: 13776})


# Logistic regression

1. compare different feature ranking method

** this code take extremely long

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score,precision_score, recall_score
import warnings

warnings.filterwarnings('ignore')

# a. mutual info
from sklearn.feature_selection import mutual_info_classif

mutual_info_values = mutual_info_classif(X, y, discrete_features=[False]*X.shape[1])
threshold = 0.005
selected_features_mutualinfo = X.columns[mutual_info_values > threshold]

# b. RFE
from sklearn.feature_selection import RFE

classifier = LogisticRegression(max_iter=1000)

# Initialize RFE with the classifier and the number of features to retain
num_features_to_keep = 25
rfe = RFE(estimator=classifier, n_features_to_select=num_features_to_keep)

# Fit RFE to the training data
rfe.fit(X_train, y_train)

# Get the selected features
selected_features_rfe = X_train.columns[rfe.support_]


# train model using smote and adasyn
def train(selected_features):
    X_new = X[selected_features]
    
    X_train, X_val, y_train, y_val = train_test_split(X_new, y, test_size=0.2, random_state=0)
    
    smote = SMOTE(random_state=0)
    X_resampled_smote, y_resampled_smote = smote.fit_resample(X_new, y)

    adasyn = ADASYN(random_state=0)
    X_resampled_adasyn, y_resampled_adasyn = adasyn.fit_resample(X_new, y)
    
    X_train_smote, X_val_smote, y_train_smote, y_val_smote = train_test_split(X_resampled_smote, y_resampled_smote, test_size=0.2, random_state=0)
    X_train_adasyn, X_val_adasyn, y_train_adasyn, y_val_adasyn = train_test_split(X_resampled_adasyn, y_resampled_adasyn, test_size=0.2, random_state=0)

    logreg_model = LogisticRegression(max_iter=1000)

    logreg_model.fit(X_train, y_train)
    logreg_model.fit(X_train_smote, y_train_smote)
    logreg_model.fit(X_train_adasyn, y_train_adasyn)
    
    y_pred = logreg_model.predict(X_val)
    
    accuracy_original = accuracy_score(y_val, y_pred)
    conf_matrix_original = confusion_matrix(y_val, y_pred)
    f1_original = f1_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)

    print(f"Accuracy: {accuracy_original}")
    print("Confusion Matrix:")
    print(conf_matrix_original)
    print(f"F1 Score: {f1_original}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")

# compare result
print("Original data:")
train(X.columns)
print()
print("filtered_mutual_info:")
train(selected_features_mutualinfo)
print()
print("filtered_rfe:")
train(selected_features_rfe)

Original data:
Accuracy: 0.800222593210907
Confusion Matrix:
[[2811  637]
 [  81   65]]
F1 Score: 0.15330188679245282
Precision: 0.09259259259259259
Recall: 0.4452054794520548

filtered_mutual_info:
Accuracy: 0.6457985531441292
Confusion Matrix:
[[2222 1226]
 [  47   99]]
F1 Score: 0.13460231135282122
Precision: 0.07471698113207548
Recall: 0.678082191780822

filtered_rfe:
Accuracy: 0.6477462437395659
Confusion Matrix:
[[2222 1226]
 [  40  106]]
F1 Score: 0.14343707713125844
Precision: 0.07957957957957958
Recall: 0.726027397260274


2. k-fold cross validation

In [30]:
from sklearn.model_selection import KFold

num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Lists to store performance metrics for Decision Tree
accuracy_scores = []
conf_matrix_scores = []
f1_scores = []
precision_scores = []
recall_scores = []

# Perform K-fold cross-validation for both models
for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    smote = SMOTE(random_state=0)
    adasyn = ADASYN(random_state=0)

    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    X_train_adasyn, y_train_adasyn = smote.fit_resample(X_train, y_train)    

    logreg_model = LogisticRegression(max_iter=1000)

    logreg_model.fit(X_train, y_train)
    logreg_model.fit(X_train_smote, y_train_smote)
    logreg_model.fit(X_train_adasyn, y_train_adasyn)

    y_pred = logreg_model.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)
    conf_matrix = confusion_matrix(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)

    accuracy_scores.append(accuracy)
    conf_matrix_scores.append(conf_matrix)
    f1_scores.append(f1)
    precision_scores.append(precision)
    recall_scores.append(recall)

    average_accuracy = sum(accuracy_scores) / num_folds
    average_conf_matrix = sum(conf_matrix_scores) / num_folds
    average_f1 = sum(f1_scores) / num_folds
    average_precision = sum(precision_scores) / num_folds
    average_recall = sum(recall_scores) / num_folds

    print("Results:")
    print(f"Average Accuracy: {average_accuracy}")
    print("Average Confusion Matrix:")
    print(average_conf_matrix)
    print(f"Average F1 Score: {average_f1}")
    print(f"Average precision Score: {average_precision}")
    print(f"Average recall Score: {average_recall}")
    print()


Results:
Average Accuracy: 0.16160267111853088
Average Confusion Matrix:
[[568.6 121.2]
 [ 16.8  12.2]]
Average F1 Score: 0.030049261083743846
Average precision Score: 0.018290854572713643
Average recall Score: 0.08413793103448276

Results:
Average Accuracy: 0.32487479131886476
Average Confusion Matrix:
[[1140.6  239. ]
 [  31.    27. ]]
Average F1 Score: 0.06668292445008048
Average precision Score: 0.040613629836665374
Average recall Score: 0.18620689655172415

Results:
Average Accuracy: 0.4882025598219254
Average Confusion Matrix:
[[1718.8  354.4]
 [  47.4   35.8]]
Average F1 Score: 0.09024383475797874
Average precision Score: 0.05478432226822737
Average recall Score: 0.256048166392994

Results:
Average Accuracy: 0.6506956037840845
Average Confusion Matrix:
[[2290.6  472.6]
 [  64.    48. ]]
Average F1 Score: 0.12089710108963705
Average precision Score: 0.07349597870994516
Average recall Score: 0.3407703886152162

Results:
Average Accuracy: 0.8100723427935448
Average Confusion Matrix