# Team 8 : Assignment 4 (OvA Classifier)

## Headers

In [176]:
# all imports here
from sklearn.svm import SVC
import sklearn as svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import itertools
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

## Importing CSV

In [177]:
# importing dataset from csv file
og_train_data = pd.read_csv("Customer_train.csv")
og_test_data = pd.read_csv("Customer_test.csv")


# printing number of rows in the datasets
print("Number of rows in the training dataset: ", len(og_train_data))
print("Number of rows in the test dataset: ", len(og_test_data))

Number of rows in the training dataset:  1838
Number of rows in the test dataset:  789


## Pre-Processing Training Dataset

In [178]:
# function to check skewness & fill missing values with mean or median based on skewness
def fill_missing_based_on_skewness(df, column):
    if df[column].isnull().sum() > 0:
        skewness = df[column].skew()
        if abs(skewness) > 0.5:
            fill_value = df[column].median()
            method = "median"
        else:
            fill_value = df[column].mean()
            method = "mean"

        df[column].fillna(fill_value, inplace=True)
        print(f"Filled missing '{column}' with {method}: {fill_value}\n")


# function to check skewness & transform data based on it using log transformation
def transform_if_skewed(df, column):
    if (
        df[column].isnull().sum() == 0
    ):  # ensuring no missing values before transformation
        skewness = df[column].skew()
        print(f"Skewness of '{column}': {skewness}")

        # applying log transformation here if skewness > 0.5
        if abs(skewness) > 0.5:
            df[column] = np.log1p(df[column])
            print(f"Applied log transformation to '{column}' due to skewness.\n")
        else:
            print(
                f"'{column}' is not significantly skewed; no transformation applied.\n"
            )
    else:
        print(
            f"Column '{column}' has missing values, handle them before transformation.\n"
        )


# function to fill missing values of categorical column with mode
def fill_missing_categorical(df, column):
    if df[column].isnull().sum() > 0:
        mode_value = df[column].mode()[0]
        df[column].fillna(mode_value, inplace=True)
        print(f"Filled missing '{column}' with mode: {mode_value}\n")


# making new df and processing in that one
processed_train_data = og_train_data.copy()

# dropping ID column since no use of it in training classifier
processed_train_data.drop("ID", axis=1, inplace=True)

# calling missing values function on columns as below
fill_missing_based_on_skewness(processed_train_data, "Work_Experience")
fill_missing_based_on_skewness(processed_train_data, "Family_Size")

# calling missing values function on categorical columns
fill_missing_categorical(processed_train_data, "Var_1")
fill_missing_categorical(processed_train_data, "Gender")
fill_missing_categorical(processed_train_data, "Ever_Married")
fill_missing_categorical(processed_train_data, "Graduated")
fill_missing_categorical(processed_train_data, "Profession")
fill_missing_categorical(processed_train_data, "Spending_Score")

# calling transformation function on columns as below
transform_if_skewed(processed_train_data, "Work_Experience")
transform_if_skewed(processed_train_data, "Family_Size")
transform_if_skewed(processed_train_data, "Age")

# printing
processed_train_data

Filled missing 'Work_Experience' with median: 1.0

Filled missing 'Family_Size' with median: 2.0

Filled missing 'Var_1' with mode: Cat_6

Filled missing 'Ever_Married' with mode: Yes

Filled missing 'Graduated' with mode: Yes

Filled missing 'Profession' with mode: Artist

Skewness of 'Work_Experience': 1.569560677667064
Applied log transformation to 'Work_Experience' due to skewness.

Skewness of 'Family_Size': 1.0098346177467532
Applied log transformation to 'Family_Size' due to skewness.

Skewness of 'Age': 0.7207833242162786
Applied log transformation to 'Age' due to skewness.



Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,Female,No,3.367296,Yes,Artist,2.197225,Low,0.693147,Cat_6,B
1,Male,Yes,3.850148,Yes,Entertainment,0.693147,Average,1.609438,Cat_7,D
2,Male,No,4.276666,No,Lawyer,0.693147,Low,1.098612,Cat_6,D
3,Female,Yes,3.970292,No,Doctor,1.609438,Low,0.693147,Cat_4,C
4,Male,No,3.295837,Yes,Healthcare,0.000000,Low,1.386294,Cat_6,D
...,...,...,...,...,...,...,...,...,...,...
1833,Female,Yes,3.258097,Yes,Healthcare,2.564949,High,1.098612,Cat_6,B
1834,Female,Yes,3.637586,Yes,Engineer,0.693147,Average,1.386294,Cat_4,B
1835,Male,Yes,3.850148,Yes,Artist,0.000000,High,1.609438,Cat_6,D
1836,Male,Yes,3.332205,Yes,Healthcare,1.386294,Low,1.098612,Cat_6,D


## Scaling Numerical Columns

In [179]:
# initializing the MinMaxScaler
scaler = MinMaxScaler()

# selecting columns to be scaled
columns_to_scale = ["Age", "Work_Experience", "Family_Size"]

# scaling columns to the range [0, 1]
processed_train_data[columns_to_scale] = scaler.fit_transform(
    processed_train_data[columns_to_scale]
)

# printing
processed_train_data

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,Female,No,0.271869,Yes,Artist,0.811368,Low,0.000000,Cat_6,B
1,Male,Yes,0.582310,Yes,Entertainment,0.255958,Average,0.569323,Cat_7,D
2,Male,No,0.856534,No,Lawyer,0.255958,Low,0.251930,Cat_6,D
3,Female,Yes,0.659555,No,Doctor,0.594316,Low,0.000000,Cat_4,C
4,Male,No,0.225925,Yes,Healthcare,0.000000,Low,0.430677,Cat_6,D
...,...,...,...,...,...,...,...,...,...,...
1833,Female,Yes,0.201661,Yes,Healthcare,0.947157,High,0.251930,Cat_6,B
1834,Female,Yes,0.445648,Yes,Engineer,0.255958,Average,0.430677,Cat_4,B
1835,Male,Yes,0.582310,Yes,Artist,0.000000,High,0.569323,Cat_6,D
1836,Male,Yes,0.249307,Yes,Healthcare,0.511916,Low,0.251930,Cat_6,D


## One-Hot Encoding for Categorical Columns

In [180]:
# list of categorical columns to be encoded
categorical_columns = [
    "Gender",
    "Ever_Married",
    "Graduated",
    "Profession",
    "Spending_Score",
    "Var_1",
]

# label encoding for binary categorical features
label_encoder = LabelEncoder()
for col in ["Gender", "Ever_Married", "Graduated"]:
    processed_train_data[col] = label_encoder.fit_transform(processed_train_data[col])

# one-hot encoding for multi-class categorical features
processed_train_data = pd.get_dummies(
    processed_train_data,
    columns=["Profession", "Spending_Score", "Var_1"],
    drop_first=True,
)

# printing
processed_train_data

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Work_Experience,Family_Size,Segmentation,Profession_Doctor,Profession_Engineer,Profession_Entertainment,...,Profession_Lawyer,Profession_Marketing,Spending_Score_High,Spending_Score_Low,Var_1_Cat_2,Var_1_Cat_3,Var_1_Cat_4,Var_1_Cat_5,Var_1_Cat_6,Var_1_Cat_7
0,0,0,0.271869,1,0.811368,0.000000,B,False,False,False,...,False,False,False,True,False,False,False,False,True,False
1,1,1,0.582310,1,0.255958,0.569323,D,False,False,True,...,False,False,False,False,False,False,False,False,False,True
2,1,0,0.856534,0,0.255958,0.251930,D,False,False,False,...,True,False,False,True,False,False,False,False,True,False
3,0,1,0.659555,0,0.594316,0.000000,C,True,False,False,...,False,False,False,True,False,False,True,False,False,False
4,1,0,0.225925,1,0.000000,0.430677,D,False,False,False,...,False,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1833,0,1,0.201661,1,0.947157,0.251930,B,False,False,False,...,False,False,True,False,False,False,False,False,True,False
1834,0,1,0.445648,1,0.255958,0.430677,B,False,True,False,...,False,False,False,False,False,False,True,False,False,False
1835,1,1,0.582310,1,0.000000,0.569323,D,False,False,False,...,False,False,True,False,False,False,False,False,True,False
1836,1,1,0.249307,1,0.511916,0.251930,D,False,False,False,...,False,False,False,True,False,False,False,False,True,False


## Pre-Processing Testing Dataset

In [181]:
# preparing testing dataset
processed_test_data = og_test_data.copy()

# dropping ID column since even training data has total only 22 columns (ID was not there)
processed_test_data.drop("ID", axis=1, inplace=True)

# same preprocessing steps to the test data
fill_missing_based_on_skewness(processed_test_data, "Work_Experience")
fill_missing_based_on_skewness(processed_test_data, "Family_Size")

fill_missing_categorical(processed_test_data, "Var_1")
fill_missing_categorical(processed_test_data, "Gender")
fill_missing_categorical(processed_test_data, "Ever_Married")
fill_missing_categorical(processed_test_data, "Graduated")
fill_missing_categorical(processed_test_data, "Profession")
fill_missing_categorical(processed_test_data, "Spending_Score")

transform_if_skewed(processed_test_data, "Work_Experience")
transform_if_skewed(processed_test_data, "Family_Size")
transform_if_skewed(processed_test_data, "Age")

# scaling test dataset with same scaler
columns_to_scale = ["Age", "Work_Experience", "Family_Size"]
processed_test_data[columns_to_scale] = scaler.transform(
    processed_test_data[columns_to_scale]
)

# label encoding for binary categorical features
label_encoder = LabelEncoder()
for col in ["Gender", "Ever_Married", "Graduated"]:
    processed_test_data[col] = label_encoder.fit_transform(processed_test_data[col])

# one-hot encoding for multi-class categorical features
processed_test_data = pd.get_dummies(
    processed_test_data,
    columns=["Profession", "Spending_Score", "Var_1"],
    drop_first=True,
)

# printing
processed_test_data

Filled missing 'Work_Experience' with median: 1.0

Filled missing 'Family_Size' with median: 2.0

Filled missing 'Var_1' with mode: Cat_6

Filled missing 'Ever_Married' with mode: Yes

Filled missing 'Graduated' with mode: Yes

Filled missing 'Profession' with mode: Artist

Skewness of 'Work_Experience': 1.5263872720417662
Applied log transformation to 'Work_Experience' due to skewness.

Skewness of 'Family_Size': 1.2005644040644883
Applied log transformation to 'Family_Size' due to skewness.

Skewness of 'Age': 0.6667761490536537
Applied log transformation to 'Age' due to skewness.



Unnamed: 0,Gender,Ever_Married,Age,Graduated,Work_Experience,Family_Size,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,...,Profession_Lawyer,Profession_Marketing,Spending_Score_High,Spending_Score_Low,Var_1_Cat_2,Var_1_Cat_3,Var_1_Cat_4,Var_1_Cat_5,Var_1_Cat_6,Var_1_Cat_7
0,1,0,0.000000,0,0.255958,0.682606,False,False,False,False,...,False,False,False,True,False,False,False,False,True,False
1,1,0,0.478626,1,0.767874,0.000000,False,False,False,False,...,False,False,False,True,False,False,False,False,True,False
2,0,0,0.509995,1,0.255958,0.000000,False,False,False,False,...,False,False,False,True,False,False,False,False,True,False
3,1,1,0.595846,1,0.000000,0.569323,False,False,False,True,...,False,False,True,False,False,False,False,False,True,False
4,0,1,0.760395,1,0.000000,0.430677,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
784,1,0,0.122836,0,0.000000,0.000000,False,False,False,False,...,False,False,False,True,False,False,False,False,True,False
785,0,0,0.539904,1,0.255958,0.569323,True,False,False,False,...,False,False,False,True,True,False,False,False,False,False
786,1,0,0.032978,0,0.255958,0.682606,False,False,False,False,...,False,False,False,True,True,False,False,False,False,False
787,0,0,0.374137,0,0.000000,0.778385,False,False,False,False,...,False,False,False,True,False,True,False,False,False,False


## Functions to Train the Model

In [182]:
# function to train a binary classifier for each class
def train_ova_classifiers(X_train, y_train, class_labels):
    # dictionary to store the classifiers for each class vs all others
    classifiers = {}

    # iterating over each class
    for class_label in class_labels:
        # creating a binary label for the current class (1 for the current class & 0 for others)
        binary_labels = np.where(y_train == class_label, 1, 0)

        # training an SVM classifier for this binary problem (trying 'rbf' kernel)
        clf = SVC(kernel="rbf", probability=True, C=10, gamma=0.2)

        # using probability=True to get confidence scores
        clf.fit(X_train, binary_labels)

        # storing the classifier
        classifiers[class_label] = clf

    return classifiers


# function to predict the classes for new data points using the OvA classifiers
def ova_predict(X_test, classifiers):
    # list to store the final predictions for each test point
    predictions = []

    # iterating over each test point
    for x in X_test:
        # storing the confidence scores (probability) for each class
        confidence_scores = {}

        # getting the confidence score from each classifier
        for class_label, clf in classifiers.items():
            # predicting the probability that x belongs to the current class (1)
            prob = clf.predict_proba([x])[0][1]

            # [0][1] gets the probability for class 1
            confidence_scores[class_label] = prob

        # predicted class is the one with highest confidence score
        final_class = max(confidence_scores, key=confidence_scores.get)
        predictions.append(final_class)

    return np.array(predictions)

## Making Predictions

In [183]:
# getting X_test
X_test = processed_test_data.values

# initialising class labels with unique values in Segmentation column
class_labels = processed_train_data["Segmentation"].unique()

# creating a copy of the processes df & making X_train (np array)
X_train_copy = processed_train_data.copy()
X_train_copy.drop("Segmentation", axis=1, inplace=True)
X_train = X_train_copy.values

# similarly getting y_train
y_train = processed_train_data["Segmentation"].values

# training OvA classifiers
classifiers = train_ova_classifiers(X_train, y_train, class_labels)

# predicting classes for new data points
predictions = ova_predict(X_test, classifiers)

# converting the predictions into a pandas DataFrame
predictions_df = pd.DataFrame(predictions, columns=["predictions"])

# saving the df to a CSV file
predictions_df.to_csv("ova.csv", index=False)

## Getting Accuracy with Validation Set

In [184]:
# Splitting the data into features (X) and labels (y)
y = processed_train_data["Segmentation"].values
new_copy = processed_train_data.copy()
X = new_copy.drop("Segmentation", axis=1).values

# Splitting into 80% training and 20% validation set (randomly)
new_train_set, new_valid_set, y_train_new, y_valid_new = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create a dictionary to simulate 'train_ova_classifiers' and 'ova_predict' for OvA case
new_classifiers = train_ova_classifiers(new_train_set, y_train_new, class_labels)

# Making predictions on the validation set
predictions = ova_predict(new_valid_set, classifiers)

# Calculating accuracy
accuracy = accuracy_score(y_valid_new, predictions)

print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.4483695652173913
