# Performing Classification after K-means Clustering 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from warnings import filterwarnings 
filterwarnings('ignore')

In [2]:
df=pd.read_csv(r'C:\Users\Anurag\Desktop\Machine learning Project\src\Notebook\data\clustered_data.csv')

In [3]:
df.head()

Unnamed: 0,Age,Education,Marital Status,Parental Status,Children,Income,Total_Spending,Days_as_Customer,Recency,Wines,...,Fish,Sweets,Gold,Web,Catalog,Store,Discount Purchases,Total Promo,NumWebVisitsMonth,cluster
0,18,2,0,0,0,-13587.75,-1396.375,3534.375,-51,-697,...,-67.5,-47,-61.5,8,10,4,3,0,7,1
1,18,2,0,1,2,-13587.75,-1396.375,3534.375,-51,-697,...,-67.5,-47,-61.5,1,1,2,2,0,5,0
2,18,2,1,0,0,-13587.75,-1396.375,3534.375,-51,-697,...,-67.5,-47,-61.5,8,2,10,1,0,4,1
3,18,2,1,1,1,-13587.75,-1396.375,3534.375,-51,-697,...,-67.5,-47,-61.5,2,0,4,2,0,6,0
4,18,4,1,1,1,-13587.75,-1396.375,3534.375,-51,-697,...,-67.5,-47,-61.5,5,3,6,5,0,5,1


In [4]:
from sklearn.model_selection import train_test_split 
x=df.drop('cluster',axis=1)
y=df['cluster']

## selecting the best model 

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve,confusion_matrix
from xgboost import XGBClassifier
from sklearn import metrics 



models={
    "Logistic Regression":LogisticRegression(),
    "Random Forest":RandomForestClassifier(),
    "AdaBoost":AdaBoostClassifier(),
    "Gradient Boosting":GradientBoostingClassifier(),
    "KNN":KNeighborsClassifier(),
    "Decision Tree":DecisionTreeClassifier(),
    "SVM":SVC(),
    "XGBoost":XGBClassifier(),

}

creating general function that will evaluate each model 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def evaluate(x, y, models):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    model_list = []
    model_scores = []  

    for model_name, model in models.items():
        model.fit(x_train, y_train)  

        
        y_pred = model.predict(x_test)

        score = accuracy_score(y_test, y_pred)  

        print(f'---- Score for {model_name} ----')
        print(f"{score}\n")

        model_list.append(model_name)
        model_scores.append(score)  

    
    report = pd.DataFrame({'Model_name': model_list, 'Score': model_scores})
    return report


In [26]:
report=evaluate(x,y,models)

---- Score for Logistic Regression ----
0.9866071428571429

---- Score for Random Forest ----
0.9709821428571429

---- Score for AdaBoost ----
0.9776785714285714

---- Score for Gradient Boosting ----
0.9709821428571429

---- Score for KNN ----
0.9709821428571429

---- Score for Decision Tree ----
0.9665178571428571

---- Score for SVM ----
0.49776785714285715

---- Score for XGBoost ----
0.9776785714285714



logistic regression is having the highest score so we will do logistic regression 

In [27]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

Lets do hyperparameter tuning 

In [29]:
param_dist = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': np.linspace(100, 500, 5).astype(int).tolist(),  # [100, 200, 300, 400, 500]
    'multi_class': ['ovr', 'multinomial'],
    'C': np.logspace(-3, 3, 7)  
}

logreg = LogisticRegression()

from sklearn.model_selection import RandomizedSearchCV
logreg_random = RandomizedSearchCV(
    estimator=logreg, 
    param_distributions=param_dist, 
    n_iter=20, 
    cv=10, 
    scoring='accuracy',  
    random_state=42, 
    n_jobs=1
)

logreg_random.fit(x_train, y_train)

print("Best Hyperparameters:", logreg_random.best_params_)

Best Hyperparameters: {'solver': 'newton-cg', 'multi_class': 'multinomial', 'max_iter': 200, 'C': np.float64(1000.0)}


In [30]:
best_logreg = LogisticRegression(
    solver='newton-cg', 
    multi_class='multinomial', 
    max_iter=200, 
    C=1000.0
)

best_logreg.fit(x_train, y_train)
y_pred = best_logreg.predict(x_test)


from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Final Model Accuracy: {accuracy:.4f}")

Final Model Accuracy: 0.9933


In [32]:

report = classification_report(y_test, y_pred, digits=4)

# Print Report
print("Logistic Regression")
print(f"Accuracy Score value: {accuracy:.4f}")
print(report)


Logistic Regression
Accuracy Score value: 0.9933
              precision    recall  f1-score   support

           0     1.0000    0.9865    0.9932       223
           1     0.9868    1.0000    0.9934       225

    accuracy                         0.9933       448
   macro avg     0.9934    0.9933    0.9933       448
weighted avg     0.9934    0.9933    0.9933       448

