## Import Library

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

## Load Cluster Dataset

In [11]:
df = pd.read_csv('../data/Wholesale_customer_cluster_data.csv')
df.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Cluster
0,2,3,12669,9656,7561,214,2674,1338,2
1,2,3,7057,9810,9568,1762,3293,1776,0
2,2,3,6353,8808,7684,2405,3516,7844,0
3,1,3,13265,1196,4221,6404,507,1788,1
4,2,3,22615,5410,7198,3915,1777,5185,1


## Data Scaling & Splitting

In [12]:
numeric_features = df.drop(columns=['Channel','Region','Cluster'])
scaler = StandardScaler()

X_scaling = scaler.fit_transform(numeric_features)

X = pd.DataFrame(X_scaling, columns=numeric_features.columns, index=numeric_features.index)
y = df['Cluster']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Modeling & Evaluation

In [13]:
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
lr = LogisticRegression()
nb = GaussianNB()
svm = SVC()

cv_results = []
models = {
    'K-Nearest Neighbors': knn,
    'Decision Tree': dt,
    'Random Forest': rf,
    'Logistic Regression': lr,
    'Naive Bayes': nb,
    'Support Vector Machine': svm
}

for name, model in models.items():
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
    cv_mean = cv_scores.mean()
    cv_results.append({
        'Model': name,
        'Mean': cv_mean
    })

df_cv = pd.DataFrame(cv_results)

df_cv

Unnamed: 0,Model,Mean
0,K-Nearest Neighbors,0.920443
1,Decision Tree,0.914688
2,Random Forest,0.937425
3,Logistic Regression,0.963018
4,Naive Bayes,0.912072
5,Support Vector Machine,0.931871


In [14]:
best_model = LogisticRegression()
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

model_eval = accuracy_score(y_pred, y_test)

print(f"Logistic Regression Accuracy: {model_eval}")

Logistic Regression Accuracy: 0.9886363636363636


## Export Model

In [15]:
joblib.dump(best_model, '../models/wholesale_customer_classification.pkl')

['../models/wholesale_customer_classification.pkl']