# Multiclass Classification

Connect to Google Drive

In [None]:
#Mount the google drive connection to our dataset
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


Load the dataset

In [None]:
import pandas as pd
df_train = pd.read_csv('/content/drive/My Drive/AI/datasets/customer_segmentation_train.csv')

In [None]:
df_train.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


In [None]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               8068 non-null   int64  
 1   Gender           8068 non-null   object 
 2   Ever_Married     7928 non-null   object 
 3   Age              8068 non-null   int64  
 4   Graduated        7990 non-null   object 
 5   Profession       7944 non-null   object 
 6   Work_Experience  7239 non-null   float64
 7   Spending_Score   8068 non-null   object 
 8   Family_Size      7733 non-null   float64
 9   Var_1            7992 non-null   object 
 10  Segmentation     8068 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 693.5+ KB


In [None]:
df_train["Profession"].unique()

array(['Healthcare', 'Engineer', 'Lawyer', 'Entertainment', 'Artist',
       'Executive', 'Doctor', 'Homemaker', 'Marketing', nan], dtype=object)

In [None]:
df_train.isnull().sum()

ID                   0
Gender               0
Ever_Married       140
Age                  0
Graduated           78
Profession         124
Work_Experience    829
Spending_Score       0
Family_Size        335
Var_1               76
Segmentation         0
dtype: int64

In [None]:
df_train.isna().sum()

ID                   0
Gender               0
Ever_Married       140
Age                  0
Graduated           78
Profession         124
Work_Experience    829
Spending_Score       0
Family_Size        335
Var_1               76
Segmentation         0
dtype: int64

DF management

In [None]:
df_train.drop(columns=['ID', 'Var_1'], inplace=True)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
numeric_features = ['Age', 'Work_Experience', 'Family_Size']
cat_features = ['Gender', 'Ever_Married', 'Graduated']
ordinal_features=['Profession', 'Spending_Score']

In [None]:
target_encoder = OrdinalEncoder()
df_train["Segmentation"] = target_encoder.fit_transform(df_train[["Segmentation"]])

In [None]:
X = df_train.drop(columns=['Segmentation'])

In [None]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

In [None]:
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder())
])

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', cat_transformer, cat_features),
        ('ordinal', ordinal_transformer, ordinal_features)
    ])

# One vs One strategy

In [None]:
ovo_clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', OneVsOneClassifier(SGDClassifier(random_state=42)))
])

In [None]:
y = df_train['Segmentation']

In [None]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
ovo_clf_pipeline.fit(X_train, y_train)

Confusion Matrix

In [None]:
y_val_pred = ovo_clf_pipeline.predict(X_val)

conf_matrix_val = confusion_matrix(y_val, y_val_pred)
print("Confusion Matrix for Validation Set:")
print(conf_matrix_val)

Confusion Matrix for Validation Set:
[[  0 348   0  18]
 [  1 360   0  22]
 [  0 366   1  22]
 [  0 258   1 217]]


In [None]:
print("\nClassification Report for Validation Set:")
print(classification_report(y_val, y_val_pred))


Classification Report for Validation Set:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       366
         1.0       0.27      0.94      0.42       383
         2.0       0.50      0.00      0.01       389
         3.0       0.78      0.46      0.57       476

    accuracy                           0.36      1614
   macro avg       0.39      0.35      0.25      1614
weighted avg       0.41      0.36      0.27      1614



In [None]:
y_test_pred = ovo_clf_pipeline.predict(X_test)

conf_matrix_test = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix for Test Set:")
print(conf_matrix_test)

Confusion Matrix for Test Set:
[[  2 399   1  29]
 [  1 334   1  10]
 [  0 364   0  19]
 [  1 256   0 197]]


In [None]:
print("\nClassification Report for Test Set:")
print(classification_report(y_test, y_test_pred))


Classification Report for Test Set:
              precision    recall  f1-score   support

         0.0       0.50      0.00      0.01       431
         1.0       0.25      0.97      0.39       346
         2.0       0.00      0.00      0.00       383
         3.0       0.77      0.43      0.56       454

    accuracy                           0.33      1614
   macro avg       0.38      0.35      0.24      1614
weighted avg       0.40      0.33      0.24      1614



Precision/Recall and F1

In [None]:
precision_val = precision_score(y_val, y_val_pred, average='weighted')
recall_val = recall_score(y_val, y_val_pred, average='weighted')
f1_val = f1_score(y_val, y_val_pred, average='weighted')

In [None]:
print("Precision:", precision_val)
print("Recall:", recall_val)
print("F1-score:", f1_val)

Precision: 0.414024619414954
Recall: 0.35811648079306074
F1-score: 0.2703865952846368


In [None]:
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

In [None]:
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1-score:", f1_test)

Precision: 0.40374870560824166
Recall: 0.3302354399008674
F1-score: 0.24305716143307568


# One vs All strategy (One vs Rest)

In [None]:
ova_clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', OneVsRestClassifier(SGDClassifier(random_state=42)))
])

In [None]:
ova_clf_pipeline.fit(X_train, y_train)

Confusion Matrix

In [None]:
y_val_pred = ova_clf_pipeline.predict(X_val)

conf_matrix_val = confusion_matrix(y_val, y_val_pred)
print("Confusion Matrix for Validation Set:")
print(conf_matrix_val)

Confusion Matrix for Validation Set:
[[  0 319   0  47]
 [  0 345   0  38]
 [  0 349   0  40]
 [  0 187   0 289]]


In [None]:
print("\nClassification Report for Validation Set:")
print(classification_report(y_val, y_val_pred))


Classification Report for Validation Set:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       366
         1.0       0.29      0.90      0.44       383
         2.0       0.00      0.00      0.00       389
         3.0       0.70      0.61      0.65       476

    accuracy                           0.39      1614
   macro avg       0.25      0.38      0.27      1614
weighted avg       0.27      0.39      0.29      1614



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
y_test_pred = ova_clf_pipeline.predict(X_test)

conf_matrix_test = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix for Test Set:")
print(conf_matrix_test)

Confusion Matrix for Test Set:
[[  0 375   0  56]
 [  0 321   0  25]
 [  0 343   0  40]
 [  0 198   0 256]]


In [None]:
print("\nClassification Report for Test Set:")
print(classification_report(y_test, y_test_pred))


Classification Report for Test Set:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       431
         1.0       0.26      0.93      0.41       346
         2.0       0.00      0.00      0.00       383
         3.0       0.68      0.56      0.62       454

    accuracy                           0.36      1614
   macro avg       0.23      0.37      0.26      1614
weighted avg       0.25      0.36      0.26      1614



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Let's try RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

ovo_rfclf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', OneVsOneClassifier(RandomForestClassifier(random_state=42)))
])

In [None]:
ovo_rfclf_pipeline.fit(X_train, y_train)

Confusion Matrix

In [None]:
y_val_pred = ovo_rfclf_pipeline.predict(X_val)

conf_matrix_val = confusion_matrix(y_val, y_val_pred)
print("Confusion Matrix for Validation Set:")
print(conf_matrix_val)

Confusion Matrix for Validation Set:
[[151  77  55  83]
 [110 125  97  51]
 [ 35 123 179  52]
 [ 85  32  32 327]]


In [None]:
print("\nClassification Report for Validation Set:")
print(classification_report(y_val, y_val_pred))


Classification Report for Validation Set:
              precision    recall  f1-score   support

         0.0       0.40      0.41      0.40       366
         1.0       0.35      0.33      0.34       383
         2.0       0.49      0.46      0.48       389
         3.0       0.64      0.69      0.66       476

    accuracy                           0.48      1614
   macro avg       0.47      0.47      0.47      1614
weighted avg       0.48      0.48      0.48      1614



In [None]:
ova_rfclf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', OneVsRestClassifier(RandomForestClassifier(random_state=42)))
])

In [None]:
ova_rfclf_pipeline.fit(X_train, y_train)

Confusion Matrix

In [None]:
y_val_pred = ova_rfclf_pipeline.predict(X_val)

conf_matrix_val = confusion_matrix(y_val, y_val_pred)
print("Confusion Matrix for Validation Set:")
print(conf_matrix_val)

Confusion Matrix for Validation Set:
[[137  93  50  86]
 [100 131  98  54]
 [ 49 105 187  48]
 [ 79  40  37 320]]


In [None]:
print("\nClassification Report for Validation Set:")
print(classification_report(y_val, y_val_pred))


Classification Report for Validation Set:
              precision    recall  f1-score   support

         0.0       0.38      0.37      0.37       366
         1.0       0.36      0.34      0.35       383
         2.0       0.50      0.48      0.49       389
         3.0       0.63      0.67      0.65       476

    accuracy                           0.48      1614
   macro avg       0.47      0.47      0.47      1614
weighted avg       0.48      0.48      0.48      1614



# Let's try Ensemble Models

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

base_classifiers = [
    ('rfc', RandomForestClassifier(random_state=42)),
    ('lr', LogisticRegression(random_state=42)),
    ('svc', SVC(random_state=42))
]

pipelines = []
for name, classifier in base_classifiers:
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])
    pipelines.append((name, pipeline))

ensemble_classifier = VotingClassifier(estimators=pipelines, voting='hard')

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'rfc__classifier__n_estimators': [50, 100, 200],
    'svc__classifier__C': [0.1, 1, 10]
}
grid_search = GridSearchCV(ensemble_classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
best_pipeline = grid_search.best_estimator_

In [None]:
best_pipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Confusion Matrix

In [None]:
y_val_pred = best_pipeline.predict(X_val)

conf_matrix_val = confusion_matrix(y_val, y_val_pred)
print("Confusion Matrix for Validation Set:")
print(conf_matrix_val)

Confusion Matrix for Validation Set:
[[196  36  59  75]
 [153  61 122  47]
 [ 80  50 203  56]
 [ 99  15  13 349]]


In [None]:
print("\nClassification Report for Validation Set:")
print(classification_report(y_val, y_val_pred))


Classification Report for Validation Set:
              precision    recall  f1-score   support

         0.0       0.37      0.54      0.44       366
         1.0       0.38      0.16      0.22       383
         2.0       0.51      0.52      0.52       389
         3.0       0.66      0.73      0.70       476

    accuracy                           0.50      1614
   macro avg       0.48      0.49      0.47      1614
weighted avg       0.49      0.50      0.48      1614



# Binary Classification => The label has only two different values}
# Multi Label Classification => The model predicts more than 1 label
# Multi Output Classification => The model predicts more than 1 label that have to be done with multi label classification