In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

import optuna
import optuna.visualization as vis
import time

import scipy.stats as st
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.utils import resample
pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv("../data/clean/cleaned_superstore_data.csv")  
df.head()

Unnamed: 0,id,year_birth,education,marital_status,income,kidhome,teenhome,dt_customer,recency,mntwines,mntfruits,mntmeatproducts,mntfishproducts,mntsweetproducts,mntgoldprods,numdealspurchases,numwebpurchases,numcatalogpurchases,numstorepurchases,numwebvisitsmonth,response,complain,age,Total_Spending,Age_Group,Total_Purchases,mnttotal,YearMonth
0,1826,1970,Bachelor,Divorced,84835.0,0,0,2014-06-16,0,189,104,379,111,189,218,1,4,4,6,1,1,0,55,1190,50s,14,1190,2014-06
1,1,1961,Bachelor,Single,57091.0,0,0,2014-06-15,0,464,5,64,7,0,37,1,7,3,7,5,1,0,64,577,60s,17,577,2014-06
2,10476,1958,Bachelor,Married,67267.0,0,1,2014-05-13,0,134,11,59,15,2,30,1,3,2,5,2,0,0,67,251,60s,10,251,2014-05
3,1386,1967,Bachelor,Single,32474.0,1,1,2014-11-05,0,10,0,1,0,0,0,1,1,0,2,7,0,0,58,11,50s,3,11,2014-11
4,5371,1989,Bachelor,Single,21474.0,1,0,2014-08-04,0,6,16,24,11,0,34,2,3,1,2,7,1,0,36,91,30s,6,91,2014-08


In [3]:
df.columns = df.columns.str.lower()

In [4]:
df = df.dropna()
df.isna().sum()

id                     0
year_birth             0
education              0
marital_status         0
income                 0
kidhome                0
teenhome               0
dt_customer            0
recency                0
mntwines               0
mntfruits              0
mntmeatproducts        0
mntfishproducts        0
mntsweetproducts       0
mntgoldprods           0
numdealspurchases      0
numwebpurchases        0
numcatalogpurchases    0
numstorepurchases      0
numwebvisitsmonth      0
response               0
complain               0
age                    0
total_spending         0
age_group              0
total_purchases        0
mnttotal               0
yearmonth              0
dtype: int64

In [5]:
df.drop(["total_spending","age_group","total_purchases","mnttotal",	"yearmonth"], axis=1, inplace=True)
df

Unnamed: 0,id,year_birth,education,marital_status,income,kidhome,teenhome,dt_customer,recency,mntwines,mntfruits,mntmeatproducts,mntfishproducts,mntsweetproducts,mntgoldprods,numdealspurchases,numwebpurchases,numcatalogpurchases,numstorepurchases,numwebvisitsmonth,response,complain,age
0,1826,1970,Bachelor,Divorced,84835.0,0,0,2014-06-16,0,189,104,379,111,189,218,1,4,4,6,1,1,0,55
1,1,1961,Bachelor,Single,57091.0,0,0,2014-06-15,0,464,5,64,7,0,37,1,7,3,7,5,1,0,64
2,10476,1958,Bachelor,Married,67267.0,0,1,2014-05-13,0,134,11,59,15,2,30,1,3,2,5,2,0,0,67
3,1386,1967,Bachelor,Single,32474.0,1,1,2014-11-05,0,10,0,1,0,0,0,1,1,0,2,7,0,0,58
4,5371,1989,Bachelor,Single,21474.0,1,0,2014-08-04,0,6,16,24,11,0,34,2,3,1,2,7,1,0,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2193,10142,1976,PhD,Divorced,66476.0,0,1,2013-07-03,99,372,18,126,47,48,78,2,5,2,11,4,0,0,49
2194,5263,1977,Master,Married,31056.0,1,0,2013-01-22,99,5,10,13,3,8,16,1,1,0,3,8,0,0,48
2195,22,1976,Bachelor,Divorced,46310.0,1,0,2012-03-12,99,185,2,88,15,5,14,2,6,1,5,8,0,0,49
2196,528,1978,Bachelor,Married,65819.0,0,0,2012-11-29,99,267,38,701,149,165,63,1,5,4,10,3,0,0,47


In [6]:
df["dt_customer"] = pd.to_datetime(df["dt_customer"], format="%Y-%m-%d")

In [8]:
#pairplot_columns = []
#sns.pairplot(df[pairplot_columns], hue= "response")
#plt.show()

In [9]:
features = df.drop(columns=["response","id","year_birth","dt_customer"])
target = df["response"]

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=0)

In [12]:
#Label encoding of the categorical columns

In [13]:
education_order = df.groupby("education")[["response"]].mean().sort_values(by="response", ascending=True)
education_order

Unnamed: 0_level_0,response
education,Unnamed: 1_level_1
Basic,0.037037
Bachelor,0.135669
Master,0.139535
PhD,0.209746


In [14]:
marital_status_order = df.groupby("marital_status")[["response"]].mean().sort_values(by="response", ascending=True)
marital_status_order

Unnamed: 0_level_0,response
marital_status,Unnamed: 1_level_1
Married,0.114118
Single,0.16092
Divorced,0.201754
Widow,0.243243
NI,0.5


In [15]:
numerical_columns = X_train.select_dtypes(include=['int64','float64']).columns
categorical_columns= X_train.select_dtypes(include=["object"]).columns


educational_level_order = {"Basic": 1, 
                           "Bachelor": education_order.iloc[1,0] / education_order.iloc[0,0],
                          "Master": education_order.iloc[2,0] / education_order.iloc[0,0],
                          "PhD": education_order.iloc[3,0] / education_order.iloc[0,0]}


X_train["education"] = X_train["education"].map(educational_level_order)
X_test["education"] = X_test["education"].map(educational_level_order)

marital_status_level = {"Married": 1,
                        "Single": marital_status_order.iloc[1,0] / marital_status_order.iloc[0,0],
                        "Divorced": marital_status_order.iloc[2,0] / marital_status_order.iloc[0,0],
                        "Widow": marital_status_order.iloc[3,0] / marital_status_order.iloc[0,0],
                        "NI": marital_status_order.iloc[4,0] / marital_status_order.iloc[0,0]}

X_train["marital_status"] = X_train["marital_status"].map(marital_status_level)
X_test["marital_status"] = X_test["marital_status"].map(marital_status_level)

In [16]:
X_train.isna().sum()

education              0
marital_status         0
income                 0
kidhome                0
teenhome               0
recency                0
mntwines               0
mntfruits              0
mntmeatproducts        0
mntfishproducts        0
mntsweetproducts       0
mntgoldprods           0
numdealspurchases      0
numwebpurchases        0
numcatalogpurchases    0
numstorepurchases      0
numwebvisitsmonth      0
complain               0
age                    0
dtype: int64

In [17]:
X_train_final = pd.concat([X_train[numerical_columns], X_train[["education","marital_status"]]], axis=1)
X_test_final = pd.concat([X_test[numerical_columns], X_test[["education","marital_status"]]], axis=1)

### Oversampling

In [18]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(sampling_strategy='auto', random_state=0)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_final, y_train)
#X_test_resampled, y_test_resampled = ros.fit_resample(X_test_final, y_test)

In [19]:
y_train_resampled.value_counts()

response
0    1496
1    1496
Name: count, dtype: int64

In [20]:
normalizer = MinMaxScaler()

In [21]:
normalizer.fit(X_train_resampled)

In [22]:
# save the nomralizer with pickle in the "../scalers/" folder

In [23]:
X_train_resampled.columns

Index(['income', 'kidhome', 'teenhome', 'recency', 'mntwines', 'mntfruits',
       'mntmeatproducts', 'mntfishproducts', 'mntsweetproducts',
       'mntgoldprods', 'numdealspurchases', 'numwebpurchases',
       'numcatalogpurchases', 'numstorepurchases', 'numwebvisitsmonth',
       'complain', 'age', 'education', 'marital_status'],
      dtype='object')

In [24]:
X_test = X_test[X_train_resampled.columns]

In [25]:
X_train_resampled_norm = normalizer.transform(X_train_resampled)
X_test_norm = normalizer.transform(X_test)

In [26]:
X_train_resampled_norm_df = pd.DataFrame(X_train_resampled_norm, columns=X_train_resampled.columns, index=X_train_resampled.index )
X_train_resampled_norm_df.head()

Unnamed: 0,income,kidhome,teenhome,recency,mntwines,mntfruits,mntmeatproducts,mntfishproducts,mntsweetproducts,mntgoldprods,numdealspurchases,numwebpurchases,numcatalogpurchases,numstorepurchases,numwebvisitsmonth,complain,age,education,marital_status
0,0.422806,0.0,0.0,0.808081,0.439384,0.080402,0.061449,0.123552,0.061069,0.030534,0.066667,0.259259,0.107143,0.923077,0.15,0.0,0.313725,0.593472,0.0
1,0.279336,0.0,0.0,0.040404,0.1929,0.050251,0.017391,0.015444,0.038168,0.477099,0.066667,0.222222,0.035714,0.461538,0.3,0.0,0.745098,0.571091,0.121285
2,0.263408,0.5,0.5,0.20202,0.052914,0.035176,0.033623,0.023166,0.01145,0.068702,0.266667,0.111111,0.035714,0.307692,0.3,0.0,0.45098,0.593472,0.227107
3,0.074328,0.5,0.5,0.868687,0.004019,0.01005,0.005217,0.007722,0.003817,0.019084,0.133333,0.037037,0.0,0.230769,0.4,0.0,0.705882,0.571091,0.121285
4,0.095938,0.5,0.5,0.969697,0.012056,0.01005,0.011014,0.0,0.007634,0.022901,0.333333,0.111111,0.0,0.307692,0.35,0.0,0.705882,0.593472,0.121285


In [27]:
X_test_norm_df = pd.DataFrame(X_test_norm, columns=X_test.columns, index=X_test.index)
X_test_norm_df.head()

Unnamed: 0,income,kidhome,teenhome,recency,mntwines,mntfruits,mntmeatproducts,mntfishproducts,mntsweetproducts,mntgoldprods,numdealspurchases,numwebpurchases,numcatalogpurchases,numstorepurchases,numwebvisitsmonth,complain,age,education,marital_status
646,0.254153,0.0,0.5,0.282828,0.217013,0.241206,0.107826,0.150579,0.068702,0.755725,0.4,0.222222,0.285714,0.307692,0.35,0.0,0.54902,1.0,0.227107
418,0.34485,0.0,0.0,0.181818,0.178835,0.703518,0.347246,0.131274,0.045802,0.484733,0.066667,0.259259,0.178571,0.538462,0.3,0.0,0.568627,0.571091,0.227107
609,0.368308,0.0,0.5,0.272727,0.139317,0.085427,0.044058,0.138996,0.064885,0.145038,0.133333,0.111111,0.107143,0.538462,0.1,0.0,0.901961,0.571091,0.121285
132,0.126124,0.0,0.5,0.040404,0.006028,0.0,0.003478,0.011583,0.003817,0.01145,0.066667,0.0,0.0,0.230769,0.25,0.0,0.764706,0.571091,0.121285
148,0.299887,0.0,0.5,0.050505,0.348292,0.040201,0.129275,0.123552,0.187023,0.160305,0.266667,0.37037,0.178571,0.538462,0.4,0.0,0.921569,0.593472,0.121285


In [28]:
X_train_resampled_norm_df.describe()

Unnamed: 0,income,kidhome,teenhome,recency,mntwines,mntfruits,mntmeatproducts,mntfishproducts,mntsweetproducts,mntgoldprods,numdealspurchases,numwebpurchases,numcatalogpurchases,numstorepurchases,numwebvisitsmonth,complain,age,education,marital_status
count,2992.0,2992.0,2992.0,2992.0,2992.0,2992.0,2992.0,2992.0,2992.0,2992.0,2992.0,2992.0,2992.0,2992.0,2992.0,2992.0,2992.0,2992.0,2992.0
mean,0.336257,0.204044,0.213904,0.448361,0.260838,0.154324,0.12878,0.167177,0.12181,0.192401,0.158044,0.163782,0.117683,0.457425,0.265307,0.011029,0.520086,0.671791,0.103719
std,0.141687,0.263506,0.264405,0.294037,0.263633,0.209232,0.15213,0.220823,0.167086,0.20811,0.137611,0.096564,0.112879,0.244305,0.125573,0.104458,0.23249,0.198967,0.09981
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.221327,0.0,0.0,0.181818,0.026792,0.01005,0.013913,0.011583,0.007634,0.045802,0.066667,0.074074,0.035714,0.230769,0.15,0.0,0.352941,0.571091,0.0
50%,0.331586,0.0,0.0,0.414141,0.16276,0.065327,0.05971,0.065637,0.049618,0.118321,0.133333,0.148148,0.071429,0.384615,0.3,0.0,0.490196,0.571091,0.121285
75%,0.457599,0.5,0.5,0.707071,0.45144,0.21608,0.204058,0.243243,0.171756,0.255725,0.2,0.222222,0.178571,0.615385,0.35,0.0,0.72549,0.593472,0.121285
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

### Decision Tree

In [31]:
tree = DecisionTreeClassifier(max_depth=10)

In [32]:
tree = DecisionTreeClassifier(max_depth=5)
selector = RFE(tree, n_features_to_select=10, verbose=5)
selector.fit(X_train_resampled_norm_df, y_train_resampled)
selector.transform(X_test_norm_df)
sc = selector.get_feature_names_out()

Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.


In [33]:
tree.fit(X_train_resampled_norm_df[sc], y_train_resampled)

In [34]:
y_pred_test_dt = tree.predict(X_test_norm_df[sc])

print(f"MAE, {mean_absolute_error(y_pred_test_dt, y_test): .2f}")
print(f"MSE, {mean_squared_error(y_pred_test_dt, y_test): .2f}")
print(f"RMSE, {root_mean_squared_error(y_pred_test_dt, y_test): .2f}")
print(f"R2 score, {tree.score(X_test_norm_df[sc], y_test): .2f}")

MAE,  0.32
MSE,  0.32
RMSE,  0.57
R2 score,  0.68


### Knn

In [None]:
#X_train_norm_df, y_train = X_train_norm_df.align(y_train, join="inner", axis=0)

In [36]:
knn = KNeighborsClassifier(n_neighbors=10)

In [38]:
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(score_func=f_classif, k=10)  
selector.fit_transform(X_train_resampled_norm_df, y_train_resampled)
selector.transform(X_test_norm_df)
ksc = selector.get_feature_names_out()

In [40]:
knn.fit(X_train_resampled_norm_df[ksc], y_train_resampled)

In [41]:
print(f"The accuracy of the model is {knn.score(X_test[ksc], y_test)*100: .2f}%")

The accuracy of the model is  84.55%


### Logistic Regression

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, classification_report, confusion_matrix, f1_score

In [43]:
log_reg = LogisticRegression()

In [44]:
selector = RFE(log_reg, n_features_to_select=10, verbose=5)
selector.fit(X_train_resampled_norm_df, y_train_resampled)
selector.transform(X_test_norm_df)
lrsc = selector.get_feature_names_out()

Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.


In [45]:
log_reg.fit(X_train_resampled_norm_df[lrsc], y_train_resampled)

In [46]:
log_reg.score(X_test_norm_df[lrsc], y_test)

0.7522727272727273

In [47]:
y_pred = log_reg.predict(X_test_norm_df[lrsc])
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.75      0.95      0.84       295
           1       0.76      0.36      0.49       145

    accuracy                           0.75       440
   macro avg       0.76      0.65      0.66       440
weighted avg       0.75      0.75      0.72       440



### HYPERPARAMETER

In [51]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],  
    'max_depth': [5, 10, 15, 20, None],  
    'min_samples_split': [2, 5, 10],  
    'min_samples_leaf': [1, 2, 4]  
}

# Initialize Decision Tree Classifier
dt = DecisionTreeClassifier()

# Perform GridSearchCV
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='accuracy', n_jobs=1, verbose=1)
grid_search.fit(X_train_resampled_norm_df, y_train_resampled)  

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)


Fitting 5 folds for each of 90 candidates, totalling 450 fits
Best Parameters: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Accuracy Score: 0.9391717522515229


In [50]:
# Define the parameter grid for KNN
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],  # Number of neighbors
    'weights': ['uniform', 'distance'],  # Weight function
    'metric': ['euclidean', 'manhattan', 'minkowski']  # Distance metric
}

# Initialize KNN model
knn = KNeighborsClassifier()

# Perform GridSearchCV with your training data
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train_resampled_norm_df, y_train_resampled)

# Print the best parameters and best accuracy score
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

# Get the best model
best_knn = grid_search.best_estimator_

# Making predictions using the best model
y_pred = best_knn.predict(X_test_norm_df)

print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Parameters: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Best Accuracy Score: 0.916443235939498
              precision    recall  f1-score   support

           0       0.93      0.85      0.89       372
           1       0.44      0.63      0.52        68

    accuracy                           0.82       440
   macro avg       0.69      0.74      0.71       440
weighted avg       0.85      0.82      0.83       440



#### Undersampling