In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

import optuna
import optuna.visualization as vis
import time

import scipy.stats as st
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.utils import resample
# Never truncate the column list when a wide DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [None]:
df = pd.read_csv("../data/clean/cleaned_superstore_data.csv")  
df.head()

In [None]:
df.columns = df.columns.str.lower()

In [None]:
df = df.dropna()

In [None]:
df.drop(["total_spending","age_group","total_purchases","mnttotal",	"yearmonth"], axis=1, inplace=True)
df

In [None]:
df["dt_customer"] = pd.to_datetime(df["dt_customer"], format="%Y-%m-%d")

In [None]:
features = df.drop(columns=["response","id","year_birth","dt_customer"])
target = df["response"]

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=0)

In [None]:
# Encode the categorical columns (education, marital_status) as numeric response-rate ratios

In [None]:
education_order = df.groupby("education")[["response"]].mean().sort_values(by="response", ascending=True)
education_order

In [None]:
marital_status_order = df.groupby("marital_status")[["response"]].mean().sort_values(by="response", ascending=True)
marital_status_order

In [None]:
numerical_columns = X_train.select_dtypes(include=['int64','float64']).columns
categorical_columns= X_train.select_dtypes(include=["object"]).columns


educational_level_order = {"Basic": 1, 
                           "Bachelor": education_order.iloc[1,0] / education_order.iloc[0,0],
                          "Master": education_order.iloc[2,0] / education_order.iloc[0,0],
                          "PhD": education_order.iloc[3,0] / education_order.iloc[0,0]}


X_train["education"] = X_train["education"].map(educational_level_order)
X_test["education"] = X_test["education"].map(educational_level_order)

marital_status_level = {"Married": 1,
                        "Single": marital_status_order.iloc[1,0] / marital_status_order.iloc[0,0],
                        "Divorced": marital_status_order.iloc[2,0] / marital_status_order.iloc[0,0],
                        "Widow": marital_status_order.iloc[3,0] / marital_status_order.iloc[0,0],
                        "NI": marital_status_order.iloc[4,0] / marital_status_order.iloc[0,0]}

X_train["marital_status"] = X_train["marital_status"].map(marital_status_level)
X_test["marital_status"] = X_test["marital_status"].map(marital_status_level)

In [None]:
# Final model inputs: the numeric columns followed by the two encoded
# categorical columns, built the same way for both splits.
X_train_final = pd.concat(
    [X_train.loc[:, numerical_columns], X_train.loc[:, ["education", "marital_status"]]],
    axis="columns",
)
X_test_final = pd.concat(
    [X_test.loc[:, numerical_columns], X_test.loc[:, ["education", "marital_status"]]],
    axis="columns",
)

### Oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(sampling_strategy='auto', random_state=0)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_final, y_train)
#X_test_resampled, y_test_resampled = ros.fit_resample(X_test_final, y_test)

In [None]:
y_train_resampled.value_counts()

In [None]:
normalizer = MinMaxScaler()

In [None]:
normalizer.fit(X_train_resampled)

In [None]:
# save the nomralizer with pickle in the "../scalers/" folder

In [None]:
X_train_resampled.columns

In [None]:
X_test = X_test[X_train_resampled.columns]

In [None]:
X_train_resampled_norm = normalizer.transform(X_train_resampled)
X_test_norm = normalizer.transform(X_test)

In [None]:
X_train_resampled_norm_df = pd.DataFrame(X_train_resampled_norm, columns=X_train_resampled.columns, index=X_train_resampled.index )
X_train_resampled_norm_df.head()

In [None]:
X_test_norm_df = pd.DataFrame(X_test_norm, columns=X_test.columns, index=X_test.index)
X_test_norm_df.head()

In [None]:
X_train_resampled_norm_df.describe()

In [None]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

### Decision Tree

In [None]:
tree = DecisionTreeClassifier(max_depth=10)

In [None]:
tree = DecisionTreeClassifier(max_depth=5)
selector = RFE(tree, n_features_to_select=10, verbose=5)
selector.fit(X_train_resampled_norm_df, y_train_resampled)
selector.transform(X_test_norm_df)
sc = selector.get_feature_names_out()

In [None]:
tree.fit(X_train_resampled_norm_df[sc], y_train_resampled)

In [None]:
# Local import: classification_report is only imported further down the
# notebook, so it is not yet in scope here on a top-to-bottom run.
from sklearn.metrics import classification_report

y_pred_test_dt = tree.predict(X_test_norm_df[sc])

# Error metrics are called as (y_true, y_pred), the order scikit-learn documents.
print(f"MAE, {mean_absolute_error(y_test, y_pred_test_dt): .2f}")
print(f"MSE, {mean_squared_error(y_test, y_pred_test_dt): .2f}")
print(f"RMSE, {root_mean_squared_error(y_test, y_pred_test_dt): .2f}")
# For a classifier, .score() returns mean accuracy, not R2, so label it correctly.
print(f"Accuracy, {tree.score(X_test_norm_df[sc], y_test): .2f}")
# Per-class precision/recall/F1, as reported for the other models.
print(classification_report(y_test, y_pred_test_dt))

### KNN

In [None]:
knn = KNeighborsClassifier(n_neighbors=10)

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(score_func=f_classif, k=10)  
selector.fit_transform(X_train_resampled_norm_df, y_train_resampled)
selector.transform(X_test_norm_df)
ksc = selector.get_feature_names_out()

In [None]:
knn.fit(X_train_resampled_norm_df[ksc], y_train_resampled)

In [None]:
print(f"The accuracy of the model is {knn.score(X_test[ksc], y_test)*100: .2f}%")

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, classification_report, confusion_matrix, f1_score

In [None]:
log_reg = LogisticRegression()

In [None]:
selector = RFE(log_reg, n_features_to_select=10, verbose=5)
selector.fit(X_train_resampled_norm_df, y_train_resampled)
selector.transform(X_test_norm_df)
lrsc = selector.get_feature_names_out()

In [None]:
log_reg.fit(X_train_resampled_norm_df[lrsc], y_train_resampled)

In [None]:
log_reg.score(X_test_norm_df[lrsc], y_test)

In [None]:
y_pred = log_reg.predict(X_test_norm_df[lrsc])
print(classification_report(y_pred, y_test))

### Hyperparameter tuning

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline

# Initialize Decision Tree Classifier (seeded so the search is reproducible)
dt = DecisionTreeClassifier(random_state=0)

# Oversampling and scaling run inside the pipeline, so they are re-fitted on
# the training part of every CV fold. Cross-validating data that was
# oversampled beforehand puts copies of the same row in both the training and
# validation folds and inflates the CV score. imblearn applies the sampler
# during fit only, so validation folds keep their original class balance.
dt_pipeline = Pipeline(steps=[
    ("oversample", RandomOverSampler(sampling_strategy='auto', random_state=0)),
    ("scale", MinMaxScaler()),
    ("model", dt),
])

# Define the parameter grid ("model__" routes each entry to the tree step)
param_grid = {
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [5, 10, 15, 20, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

# Perform GridSearchCV on the un-resampled training data
grid_search = GridSearchCV(dt_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=1, verbose=1)
grid_search.fit(X_train_final, y_train)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)


In [None]:
from imblearn.pipeline import Pipeline

# Initialize KNN model
knn = KNeighborsClassifier()

# Oversampling and scaling run inside the pipeline, so they are re-fitted on
# the training part of every CV fold. Cross-validating data that was
# oversampled beforehand puts copies of the same row in both the training and
# validation folds, which strongly favours small-k / distance-weighted KNN.
# imblearn applies the sampler during fit only, never when predicting.
knn_pipeline = Pipeline(steps=[
    ("oversample", RandomOverSampler(sampling_strategy='auto', random_state=0)),
    ("scale", MinMaxScaler()),
    ("model", knn),
])

# Define the parameter grid for KNN ("model__" routes each entry to the KNN step).
# 'minkowski' was removed: with the default p=2 it is the same metric as
# 'euclidean', so it only repeated a third of the fits.
param_grid = {
    'model__n_neighbors': [3, 5, 7, 9, 11],  # Number of neighbors
    'model__weights': ['uniform', 'distance'],  # Weight function
    'model__metric': ['euclidean', 'manhattan']  # Distance metric
}

# Perform GridSearchCV on the un-resampled training data
grid_search = GridSearchCV(knn_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train_final, y_train)

# Print the best parameters and best accuracy score
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

# Get the best model (the whole pipeline, refitted on all training data)
best_knn = grid_search.best_estimator_

# Making predictions using the best model; the pipeline scales the raw test
# features itself, so it is given X_test_final rather than the scaled frame.
y_pred = best_knn.predict(X_test_final)

print(classification_report(y_test, y_pred))


### SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=0)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_final, y_train)

In [None]:
y_train_resampled.value_counts()

In [None]:
normalizer = MinMaxScaler()

In [None]:
normalizer.fit(X_train_resampled)

In [None]:
X_test = X_test[X_train_resampled.columns]

In [None]:
X_train_resampled_norm = normalizer.transform(X_train_resampled)
X_test_norm = normalizer.transform(X_test)

In [None]:
X_train_resampled_norm_df = pd.DataFrame(X_train_resampled_norm, columns=X_train_resampled.columns, index=X_train_resampled.index )
X_train_resampled_norm_df.head()

In [None]:
X_test_norm_df = pd.DataFrame(X_test_norm, columns=X_test.columns, index=X_test.index)
X_test_norm_df.head()

In [None]:
X_train_resampled_norm_df.describe()

### SMOTE: decision tree

In [None]:
tree = DecisionTreeClassifier(max_depth=10)

In [None]:
tree = DecisionTreeClassifier(max_depth=5)
selector = RFE(tree, n_features_to_select=10, verbose=5)
selector.fit(X_train_resampled_norm_df, y_train_resampled)
selector.transform(X_test_norm_df)
sc = selector.get_feature_names_out()

In [None]:
tree.fit(X_train_resampled_norm_df[sc], y_train_resampled)

In [None]:
y_pred_test_dt = tree.predict(X_test_norm_df[sc])

# Error metrics are called as (y_true, y_pred), the order scikit-learn documents.
print(f"MAE, {mean_absolute_error(y_test, y_pred_test_dt): .2f}")
print(f"MSE, {mean_squared_error(y_test, y_pred_test_dt): .2f}")
print(f"RMSE, {root_mean_squared_error(y_test, y_pred_test_dt): .2f}")
# For a classifier, .score() returns mean accuracy, not R2, so label it correctly.
print(f"Accuracy, {tree.score(X_test_norm_df[sc], y_test): .2f}")
# Per-class precision/recall/F1, as reported for the other models
# (classification_report is imported in the logistic-regression section above).
print(classification_report(y_test, y_pred_test_dt))

### SMOTE: KNN

In [None]:
knn = KNeighborsClassifier(n_neighbors=10)

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(score_func=f_classif, k=10)  
selector.fit_transform(X_train_resampled_norm_df, y_train_resampled)
selector.transform(X_test_norm_df)
sksc = selector.get_feature_names_out()

In [None]:
knn.fit(X_train_resampled_norm_df[sksc], y_train_resampled)

In [None]:
print(f"The accuracy of the model is {knn.score(X_test[sksc], y_test)*100: .2f}%")

### SMOTE: logistic regression

In [None]:
log_reg = LogisticRegression()

In [None]:
selector = RFE(log_reg, n_features_to_select=10, verbose=5)
selector.fit(X_train_resampled_norm_df, y_train_resampled)
selector.transform(X_test_norm_df)
slrsc = selector.get_feature_names_out()

In [None]:
log_reg.fit(X_train_resampled_norm_df[slrsc], y_train_resampled)

In [None]:
log_reg.score(X_test_norm_df[slrsc], y_test)

In [None]:
y_pred = log_reg.predict(X_test_norm_df[slrsc])
print(classification_report(y_pred, y_test))

### SMOTE: hyperparameter tuning

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline

# Initialize Decision Tree Classifier (seeded so the search is reproducible)
dt = DecisionTreeClassifier(random_state=0)

# SMOTE and scaling run inside the pipeline, so they are re-fitted on the
# training part of every CV fold. Cross-validating data that was resampled
# beforehand lets synthetic points derived from validation rows into the
# training folds and inflates the CV score. imblearn applies the sampler
# during fit only, so validation folds keep their original class balance.
dt_pipeline = Pipeline(steps=[
    ("oversample", SMOTE(sampling_strategy='auto', random_state=0)),
    ("scale", MinMaxScaler()),
    ("model", dt),
])

# Define the parameter grid ("model__" routes each entry to the tree step)
param_grid = {
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [5, 10, 15, 20, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

# Perform GridSearchCV on the un-resampled training data
grid_search = GridSearchCV(dt_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=1, verbose=1)
grid_search.fit(X_train_final, y_train)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

In [None]:
from imblearn.pipeline import Pipeline

# Initialize KNN model
knn = KNeighborsClassifier()

# SMOTE and scaling run inside the pipeline, so they are re-fitted on the
# training part of every CV fold. Cross-validating data that was resampled
# beforehand lets synthetic points derived from validation rows into the
# training folds and inflates the CV score. imblearn applies the sampler
# during fit only, never when predicting.
knn_pipeline = Pipeline(steps=[
    ("oversample", SMOTE(sampling_strategy='auto', random_state=0)),
    ("scale", MinMaxScaler()),
    ("model", knn),
])

# Define the parameter grid for KNN ("model__" routes each entry to the KNN step).
# 'minkowski' was removed: with the default p=2 it is the same metric as
# 'euclidean', so it only repeated a third of the fits.
param_grid = {
    'model__n_neighbors': [3, 5, 7, 9, 11],  # Number of neighbors
    'model__weights': ['uniform', 'distance'],  # Weight function
    'model__metric': ['euclidean', 'manhattan']  # Distance metric
}

# Perform GridSearchCV on the un-resampled training data
grid_search = GridSearchCV(knn_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train_final, y_train)

# Print the best parameters and best accuracy score
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

# Get the best model (the whole pipeline, refitted on all training data)
best_knn = grid_search.best_estimator_

# Making predictions using the best model; the pipeline scales the raw test
# features itself, so it is given X_test_final rather than the scaled frame.
y_pred = best_knn.predict(X_test_final)

print(classification_report(y_test, y_pred))