# I. Preface

Objective: This document builds a classification model that predicts the response of a customer.

---

# II. Libraries

In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

In [2]:
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Statistical analysis
import scipy.stats as st
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from statsmodels.stats.stattools import durbin_watson

In [4]:
# Machine learning
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, precision_score, accuracy_score, recall_score, f1_score, confusion_matrix
import optuna

In [5]:
# Preprocessing
from imblearn.over_sampling import SMOTENC, SMOTE
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, OrdinalEncoder

In [6]:
# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as imbPipeline

In [7]:
# Timing
import time
import datetime as dt

In [8]:
# Serialization
import pickle
import json

In [9]:
# Disable warnings
# NOTE(review): called without a category or module filter, this silences every
# warning for the rest of the session, including any deprecation or pandas
# chained-assignment warnings that later cells might raise.
import warnings
warnings.filterwarnings('ignore') #turn off warning

---

# III. Data Loading

In [10]:
# Load the cleaned CSV dataset.
# index_col=False tells pandas not to use the first column as the index, so the
# file's "Unnamed: 0" column is kept as a regular column (it is dropped later).

df = pd.read_csv("data_cleaned.csv", index_col=False)

In [11]:
# Preview the first rows of the freshly loaded frame
df.head()

Unnamed: 0.1,Unnamed: 0,id,year_birth,education,marital_status,income,kidhome,teenhome,dt_customer,recency,...,num_catalog_purchases,num_store_purchases,num_web_visits_month,accepted_cmp3,accepted_cmp4,accepted_cmp5,accepted_cmp1,accepted_cmp2,complain,response
0,0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,...,10,4,7,0,0,0,0,0,0,1
1,1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,...,1,2,5,0,0,0,0,0,0,0
2,2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,...,2,10,4,0,0,0,0,0,0,0
3,3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,...,0,4,6,0,0,0,0,0,0,0
4,4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,...,3,6,5,0,0,0,0,0,0,0


The dataset was loaded successfully.

---

# IV. Feature Engineering

## 1. Feature Creation

In [12]:
# Total spend per customer: the sum of the six per-category amount columns
mnt_columns = [
    "mnt_wines",
    "mnt_fruits",
    "mnt_meat_products",
    "mnt_fish_products",
    "mnt_sweet_products",
    "mnt_gold_prods",
]
# skipna=False keeps the NaN propagation of a plain `+` chain
df["total_mnt"] = df[mnt_columns].sum(axis=1, skipna=False)

In [13]:
# Keep only customers born in 1928 or later (1928 is the earliest year covered
# by the generation ranges defined below).
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignments that follow write to a real frame, not to a slice.
df = df[df["year_birth"] >= 1928].copy()

# Birth-year span (inclusive on both ends) for each generation label
generations = {
    "Silent Generation": (1928, 1945),
    "Baby Boomers": (1946, 1964),
    "Generation X": (1965, 1980),
    "Millennials": (1981, 1996),
    "Generation Z": (1997, 2012),
    "Generation Alpha": (2013, 2025)
}

def assign_generation(year):
    """Return the generation label whose inclusive range contains `year`.

    Ranges are checked in the order they appear in `generations`; when none
    of them contains `year`, the string "Unknown" is returned.
    """
    matching_labels = (
        label
        for label, (first_year, last_year) in generations.items()
        if first_year <= year <= last_year
    )
    return next(matching_labels, "Unknown")

# Label every customer with the generation matching their birth year
df["generation"] = df["year_birth"].map(assign_generation)

In [14]:
# Parse the enrollment date (day-month-year strings) into datetimes
df["dt_customer"] = pd.to_datetime(df["dt_customer"], format="%d-%m-%Y")

# Derive tenure and calendar-part features from the enrollment date
reference_date = dt.datetime(2015, 1, 1)  # fixed anchor used to measure tenure in days
enrolled_on = df["dt_customer"]
df["customer_since"] = (reference_date - enrolled_on).dt.days
for part in ("year", "month", "day"):
    df[f"{part}_dt_customer"] = getattr(enrolled_on.dt, part)

In [15]:
# Remove columns that won't help the model: the leftover "Unnamed: 0" column,
# the customer id, and the raw date (already expanded into derived features)
df = df.drop(columns=["Unnamed: 0", "id", "dt_customer"])

In [16]:
# Preview the frame after feature creation and column removal
df.head()

Unnamed: 0,year_birth,education,marital_status,income,kidhome,teenhome,recency,mnt_wines,mnt_fruits,mnt_meat_products,...,accepted_cmp1,accepted_cmp2,complain,response,total_mnt,generation,customer_since,year_dt_customer,month_dt_customer,day_dt_customer
0,1957,Graduation,Single,58138.0,0,0,58,635,88,546,...,0,0,0,1,1617,Baby Boomers,849,2012,9,4
1,1954,Graduation,Single,46344.0,1,1,38,11,1,6,...,0,0,0,0,27,Baby Boomers,299,2014,3,8
2,1965,Graduation,Together,71613.0,0,0,26,426,49,127,...,0,0,0,0,776,Generation X,498,2013,8,21
3,1984,Graduation,Together,26646.0,1,0,26,11,4,20,...,0,0,0,0,53,Millennials,325,2014,2,10
4,1981,PhD,Married,58293.0,1,0,94,173,43,118,...,0,0,0,0,422,Millennials,347,2014,1,19


## 2. Feature Selection

In [17]:
# Numerical features to test against the target
num_col = ['year_birth', 'income', 'kidhome', 'teenhome','recency','mnt_wines','mnt_fruits',
        'mnt_meat_products','mnt_fish_products','mnt_sweet_products','mnt_gold_prods',
        'num_deals_purchases','num_web_purchases','num_catalog_purchases',
        'num_store_purchases','num_web_visits_month', "total_mnt", "customer_since"]

# Significance level below which a feature is flagged as related to `response`
alpha = 0.05

# Kendall's tau between each numerical feature and the target.
# NOTE(review): this runs on the full `df`, i.e. before the train/test split
# further down, so test rows take part in the feature selection.
kendall_rows = []
for col in num_col:
    corr_tau, pval_k = st.kendalltau(df[col], df['response'])
    kendall_rows.append({"corr": corr_tau, "pval": pval_k, "has_relation": pval_k < alpha})

# Build the frame in one step so the columns get numeric/boolean dtypes instead
# of the object dtype produced by filling an empty frame cell by cell
cor_num = pd.DataFrame(kendall_rows, index=num_col, columns=["corr", "pval", "has_relation"])


In [18]:
# Show Kendall's tau, its p-value and the significance flag per numerical feature
cor_num

Unnamed: 0,corr,pval,has_relation
year_birth,0.017844,0.30981,False
income,0.131849,0.0,True
kidhome,-0.07483,0.00038,True
teenhome,-0.157378,0.0,True
recency,-0.163863,0.0,True
mnt_wines,0.162214,0.0,True
mnt_fruits,0.12301,0.0,True
mnt_meat_products,0.182119,0.0,True
mnt_fish_products,0.094777,0.0,True
mnt_sweet_products,0.113119,0.0,True


In [19]:
# Categorical features to test against the target
cat_col = ['education','marital_status',
        'accepted_cmp1','accepted_cmp2','accepted_cmp3',
        'accepted_cmp4','accepted_cmp5','complain', 'generation',
        'year_dt_customer','month_dt_customer','day_dt_customer']

# Significance level below which a feature is flagged as related to `response`
alpha = 0.05

# Chi-square test of independence between each categorical feature and the target.
# NOTE(review): this runs on the full `df`, i.e. before the train/test split
# further down, so test rows take part in the feature selection.
chi2_rows = []
for col in cat_col:
    # Counts of every (category, response) combination
    contingency_table = pd.crosstab(df[col], df['response'])

    # Only the p-value is kept; the statistic is assigned but not used here
    chi2, pval, _, _ = st.chi2_contingency(contingency_table)

    chi2_rows.append({"pval": pval, "has_relation": pval < alpha})

# Build the frame in one step so the columns get numeric/boolean dtypes instead
# of the object dtype produced by filling an empty frame cell by cell
cor_cat = pd.DataFrame(chi2_rows, index=cat_col, columns=["pval", "has_relation"])


In [20]:
# Show the chi-square p-value and the significance flag per categorical feature
cor_cat

Unnamed: 0,pval,has_relation
education,0.000119,True
marital_status,0.0,True
accepted_cmp1,0.0,True
accepted_cmp2,0.0,True
accepted_cmp3,0.0,True
accepted_cmp4,0.0,True
accepted_cmp5,0.0,True
complain,1.0,False
generation,0.018396,True
year_dt_customer,0.0,True


Here are the columns that have a significant relationship with `response` (p-value below 0.05), based on the correlation analysis done in the EDA step:

In [21]:
# Row masks selecting the features flagged as related to the target
num_is_related = cor_num['has_relation'] == True
cat_is_related = cor_cat['has_relation'] == True

# Names of the flagged numerical and categorical features
true_num_columns = cor_num.loc[num_is_related].index.tolist()
true_cat_columns = cor_cat.loc[cat_is_related].index.tolist()

# Numerical features first, then the categorical ones
col_to_keep = [*true_num_columns, *true_cat_columns]
col_to_keep

['income',
 'kidhome',
 'teenhome',
 'recency',
 'mnt_wines',
 'mnt_fruits',
 'mnt_meat_products',
 'mnt_fish_products',
 'mnt_sweet_products',
 'mnt_gold_prods',
 'num_web_purchases',
 'num_catalog_purchases',
 'num_store_purchases',
 'total_mnt',
 'customer_since',
 'education',
 'marital_status',
 'accepted_cmp1',
 'accepted_cmp2',
 'accepted_cmp3',
 'accepted_cmp4',
 'accepted_cmp5',
 'generation',
 'year_dt_customer',
 'month_dt_customer']

In [22]:
# Restrict the frame to the selected features and preview the result
df_selected = df.loc[:, col_to_keep]
df_selected.head()

Unnamed: 0,income,kidhome,teenhome,recency,mnt_wines,mnt_fruits,mnt_meat_products,mnt_fish_products,mnt_sweet_products,mnt_gold_prods,...,education,marital_status,accepted_cmp1,accepted_cmp2,accepted_cmp3,accepted_cmp4,accepted_cmp5,generation,year_dt_customer,month_dt_customer
0,58138.0,0,0,58,635,88,546,172,88,88,...,Graduation,Single,0,0,0,0,0,Baby Boomers,2012,9
1,46344.0,1,1,38,11,1,6,2,1,6,...,Graduation,Single,0,0,0,0,0,Baby Boomers,2014,3
2,71613.0,0,0,26,426,49,127,111,21,42,...,Graduation,Together,0,0,0,0,0,Generation X,2013,8
3,26646.0,1,0,26,11,4,20,10,3,5,...,Graduation,Together,0,0,0,0,0,Millennials,2014,2
4,58293.0,1,0,94,173,43,118,46,27,15,...,PhD,Married,0,0,0,0,0,Millennials,2014,1


## 3. Split Data

In [23]:
# Features are the selected columns; the target is the `response` column
X, y = df_selected, df['response']

In [24]:
# Class distribution of the target
y.value_counts()

response
0    1880
1     333
Name: count, dtype: int64

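The target is very imbalanced: 1,880 customers did not respond (0) versus 333 who did (1), i.e. roughly 15% positives.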

In [25]:
# Stratified 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=2
)

# Report the shape of every split: features first, then targets
for label, part in (
    ('Train Size :', X_train),
    ('Test Size :', X_test),
    ('Train Size :', y_train),
    ('Test Size :', y_test),
):
    print(label, part.shape)

Train Size : (1549, 25)
Test Size : (664, 25)
Train Size : (1549,)
Test Size : (664,)


In [26]:
# Split the training features by data type.
cat_features = ['education',
                'marital_status',
                'accepted_cmp1',
                'accepted_cmp2',
                'accepted_cmp3',
                'accepted_cmp4',
                'accepted_cmp5',
                'generation',
                'year_dt_customer',
                'month_dt_customer']
num_features = ['income',
                'kidhome',
                'teenhome',
                'recency',
                'mnt_wines',
                'mnt_fruits',
                'mnt_meat_products',
                'mnt_fish_products',
                'mnt_sweet_products',
                'mnt_gold_prods',
                'num_web_purchases',
                'num_catalog_purchases',
                'num_store_purchases',
                'total_mnt',
                'customer_since']

# Explicit copies give each subset its own data, so later column assignments
# on them never write through to (or warn about) X_train.
X_train_cat = X_train[cat_features].copy()
X_train_num = X_train[num_features].copy()

## 4. Outlier Handling

Outliers (extreme values) can shift the mean of the data significantly, and some models are not robust to them. Before detecting outliers, we need to know the skewness of each column to decide which detection method to use, because skewed and approximately normal columns have very different distributions.

In [27]:
# Skewness of every numeric training feature
skewness = X_train_num.apply(st.skew)

# Columns with |skew| above 0.5 count as skewed, the rest as normal
abs_skew = skewness.abs()
col_s = skewness.loc[abs_skew > 0.5].index.tolist()
col_n = skewness.loc[abs_skew <= 0.5].index.tolist()

# Show both groups
print("Skewed Columns:", col_s)
print("Normal Columns:", col_n)

Skewed Columns: ['kidhome', 'mnt_wines', 'mnt_fruits', 'mnt_meat_products', 'mnt_fish_products', 'mnt_sweet_products', 'mnt_gold_prods', 'num_web_purchases', 'num_catalog_purchases', 'num_store_purchases', 'total_mnt']
Normal Columns: ['income', 'teenhome', 'recency', 'customer_since']


### 4.1. Normal Columns

In [28]:
# TODO: outlier handling for the normal columns is skipped for now

### 4.2. Skewed Columns

In [29]:
# TODO: outlier handling for the skewed columns is skipped for now

## 5. Oversampling

In [30]:
# oversampling_step = SMOTENC([15, 16, 17, 18, 19, 20, 21, 22, 23, 24], k_neighbors=10, random_state=10, n_jobs=-1)
# X_train_balanced, y_train_balanced = oversampling_step.fit_resample(X_train, y_train)
# y_train_balanced.value_counts()

## 6. Feature Scaling

In [31]:
# Fit the standardizer on the numeric training features only, so the test set
# does not influence the learned scaling statistics.
scaler_n = StandardScaler()
scaler_n.fit(X_train_num)

In [32]:
# Standardize the numeric features with the scaler fitted on the training set.
# The test columns are selected through X_train_num.columns (instead of a
# second hand-written list) so they always match the columns, and the column
# order, the scaler was fitted on.
X_train_num_scaled = scaler_n.transform(X_train_num)
X_test_num_scaled = scaler_n.transform(X_test[X_train_num.columns])

## 7. Feature Encoding

In [33]:
# List the distinct values of each categorical training feature
for name, values in X_train_cat.items():
    print(name)
    print(values.unique())

education
['Graduation' '2n Cycle' 'PhD' 'Master' 'Basic']
marital_status
['Married' 'Single' 'Together' 'Divorced' 'Widow' 'Alone' 'YOLO' 'Absurd']
accepted_cmp1
[0 1]
accepted_cmp2
[0 1]
accepted_cmp3
[0 1]
accepted_cmp4
[0 1]
accepted_cmp5
[0 1]
generation
['Generation X' 'Baby Boomers' 'Millennials' 'Silent Generation']
year_dt_customer
[2013 2014 2012]
month_dt_customer
[ 6 10  4 12  9 11  8  1  2  3  5  7]


In [34]:
# Select the categorical columns of the test set (no encoding happens here).
# Reusing X_train_cat.columns keeps the test subset aligned with the training
# subset, and .copy() makes it independent of X_test.
X_test_cat = X_test[X_train_cat.columns].copy()

In [36]:
def encoder(df):
    """Replace every categorical feature with its integer code.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'education', 'marital_status',
        'accepted_cmp1' to 'accepted_cmp5', 'generation',
        'year_dt_customer' and 'month_dt_customer'. Other columns are
        passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with the listed columns mapped to integer codes.
        The input frame itself is not modified.

    Raises
    ------
    KeyError
        If one of the listed columns is missing from `df`.
    ValueError
        If a listed column holds a non-null value that has no entry in its
        mapping (for example an unseen category, or a frame that was
        already encoded). Missing values stay missing.
    """
    # The campaign flags are already 0/1; the identity mapping only validates them
    accepted_cmp_mapping = {0: 0, 1: 1}

    # Column name -> {raw value: integer code}
    mappings = {
        'education': {'PhD': 0, 'Basic': 1, 'Graduation': 2, 'Master': 3, '2n Cycle': 4},
        'marital_status': {'Together': 0, 'Married': 1, 'Single': 2, 'Divorced': 3,
                           'Widow': 4, 'Alone': 5, 'YOLO': 6, 'Absurd': 7},
        'accepted_cmp1': accepted_cmp_mapping,
        'accepted_cmp2': accepted_cmp_mapping,
        'accepted_cmp3': accepted_cmp_mapping,
        'accepted_cmp4': accepted_cmp_mapping,
        'accepted_cmp5': accepted_cmp_mapping,
        'generation': {'Silent Generation': 0, 'Baby Boomers': 1, 'Millennials': 2, 'Generation X': 3},
        'year_dt_customer': {2012: 0, 2013: 1, 2014: 2},
        'month_dt_customer': {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5,
                              7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11},
    }

    # Work on a copy so the caller's frame is never mutated
    encoded = df.copy()
    for column, mapping in mappings.items():
        raw = df[column]
        codes = raw.map(mapping)

        # Series.map turns unmapped values into NaN; fail loudly instead of
        # letting them reach the model as missing values
        unmapped = raw[codes.isna() & raw.notna()].unique()
        if len(unmapped) > 0:
            raise ValueError(
                f"Unmapped value(s) in column '{column}': {list(unmapped)}"
            )

        encoded[column] = codes

    return encoded

# Encode the categorical columns of the training set, then of the test set
X_train_cat, X_test_cat = encoder(X_train_cat), encoder(X_test_cat)

## 8. Concat Data

In [37]:
# Join the scaled numeric block and the encoded categorical block column-wise,
# first for the training set and then for the test set
X_train_final, X_test_final = (
    np.concatenate(blocks, axis=1)
    for blocks in (
        [X_train_num_scaled, X_train_cat],
        [X_test_num_scaled, X_test_cat],
    )
)

---

# V. Model Definition and Training

## 1. KNN

In [40]:
# k-nearest-neighbours classifier created with its default hyperparameters
knn_model = KNeighborsClassifier()

# Fit on the combined (scaled numeric + encoded categorical) training matrix
knn_model.fit(X_train_final, y_train)

In [41]:
# Score the model on the same data it was fitted on
y_train_pred = knn_model.predict(X_train_final)

# Header line followed by the per-class precision / recall / F1 table
print(
    "Classification Report for Training Data:",
    classification_report(y_train, y_train_pred),
    sep="\n",
)

Classification Report for Training Data:
              precision    recall  f1-score   support

           0       0.90      0.99      0.94      1316
           1       0.83      0.37      0.51       233

    accuracy                           0.89      1549
   macro avg       0.86      0.68      0.73      1549
weighted avg       0.89      0.89      0.88      1549



In [42]:
# Score the model on the held-out test set
y_test_pred = knn_model.predict(X_test_final)

# Header line followed by the per-class precision / recall / F1 table
print(
    "Classification Report for Test Data:",
    classification_report(y_test, y_test_pred),
    sep="\n",
)

Classification Report for Test Data:
              precision    recall  f1-score   support

           0       0.87      0.96      0.92       564
           1       0.49      0.19      0.27       100

    accuracy                           0.85       664
   macro avg       0.68      0.58      0.59       664
weighted avg       0.81      0.85      0.82       664



In [43]:
# Header line followed by the test-set confusion matrix
# (rows: true class, columns: predicted class)
print(
    "Confusion Matrix for Test Data:",
    confusion_matrix(y_test, y_test_pred),
    sep="\n",
)

Confusion Matrix for Test Data:
[[544  20]
 [ 81  19]]


---

# VI. Model Evaluation

In [None]:
# KNN is the only model trained above, so it is carried forward as the model to save
best_model = knn_model

---

# VII. Model Saving

In [44]:
# Persist the trained KNN model and the fitted StandardScaler, one pickle file each
artifacts = {
    'best_model.pkl': best_model,
    'scaler.pkl': scaler_n,
}
for file_name, artifact in artifacts.items():
    with open(file_name, 'wb') as f:
        pickle.dump(artifact, f)