# Team 8 : Assignment 4 (OvO Classifier)

## Headers

In [60]:
# all imports here
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import itertools
import argparse
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Importing CSV

In [61]:
# importing dataset from csv file
og_train_data = pd.read_csv("Customer_train.csv")
og_test_data = pd.read_csv("Customer_test.csv")


# printing number of rows in the datasets
print("Number of rows in the training dataset: ", len(og_train_data))
print("Number of rows in the test dataset: ", len(og_test_data))

Number of rows in the training dataset:  1838
Number of rows in the test dataset:  789


## Pre-Processing Training Dataset

In [62]:
# function to check skewness & fill missing values with mean or median based on skewness
def fill_missing_based_on_skewness(df, column):
    if df[column].isnull().sum() > 0:
        skewness = df[column].skew()
        if abs(skewness) > 0.5:
            fill_value = df[column].median()
            method = "median"
        else:
            fill_value = df[column].mean()
            method = "mean"

        df[column].fillna(fill_value, inplace=True)
        print(f"Filled missing '{column}' with {method}: {fill_value}\n")


# function to check skewness & transform data based on it using log transformation
def transform_if_skewed(df, column):
    if (
        df[column].isnull().sum() == 0
    ):  # ensuring no missing values before transformation
        skewness = df[column].skew()
        print(f"Skewness of '{column}': {skewness}")

        # applying log transformation here if skewness > 0.5
        if abs(skewness) > 0.5:
            df[column] = np.log1p(df[column])
            print(f"Applied log transformation to '{column}' due to skewness.\n")
        else:
            print(
                f"'{column}' is not significantly skewed; no transformation applied.\n"
            )
    else:
        print(
            f"Column '{column}' has missing values, handle them before transformation.\n"
        )


# function to fill missing values of categorical column with mode
def fill_missing_categorical(df, column):
    if df[column].isnull().sum() > 0:
        mode_value = df[column].mode()[0]
        df[column].fillna(mode_value, inplace=True)
        print(f"Filled missing '{column}' with mode: {mode_value}\n")


# making new df and processing in that one
processed_train_data = og_train_data.copy()

# dropping ID column since no use of it in training classifier
processed_train_data.drop("ID", axis=1, inplace=True)

# calling missing values function on columns as below
fill_missing_based_on_skewness(processed_train_data, "Work_Experience")
fill_missing_based_on_skewness(processed_train_data, "Family_Size")

# calling missing values function on categorical columns
fill_missing_categorical(processed_train_data, "Var_1")
fill_missing_categorical(processed_train_data, "Gender")
fill_missing_categorical(processed_train_data, "Ever_Married")
fill_missing_categorical(processed_train_data, "Graduated")
fill_missing_categorical(processed_train_data, "Profession")
fill_missing_categorical(processed_train_data, "Spending_Score")

# calling transformation function on columns as below
transform_if_skewed(processed_train_data, "Work_Experience")
transform_if_skewed(processed_train_data, "Family_Size")
transform_if_skewed(processed_train_data, "Age")

# printing
processed_train_data

Filled missing 'Work_Experience' with median: 1.0

Filled missing 'Family_Size' with median: 2.0

Filled missing 'Var_1' with mode: Cat_6

Filled missing 'Ever_Married' with mode: Yes

Filled missing 'Graduated' with mode: Yes

Filled missing 'Profession' with mode: Artist

Skewness of 'Work_Experience': 1.569560677667064
Applied log transformation to 'Work_Experience' due to skewness.

Skewness of 'Family_Size': 1.0098346177467532
Applied log transformation to 'Family_Size' due to skewness.

Skewness of 'Age': 0.7207833242162786
Applied log transformation to 'Age' due to skewness.



Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,Female,No,3.367296,Yes,Artist,2.197225,Low,0.693147,Cat_6,B
1,Male,Yes,3.850148,Yes,Entertainment,0.693147,Average,1.609438,Cat_7,D
2,Male,No,4.276666,No,Lawyer,0.693147,Low,1.098612,Cat_6,D
3,Female,Yes,3.970292,No,Doctor,1.609438,Low,0.693147,Cat_4,C
4,Male,No,3.295837,Yes,Healthcare,0.000000,Low,1.386294,Cat_6,D
...,...,...,...,...,...,...,...,...,...,...
1833,Female,Yes,3.258097,Yes,Healthcare,2.564949,High,1.098612,Cat_6,B
1834,Female,Yes,3.637586,Yes,Engineer,0.693147,Average,1.386294,Cat_4,B
1835,Male,Yes,3.850148,Yes,Artist,0.000000,High,1.609438,Cat_6,D
1836,Male,Yes,3.332205,Yes,Healthcare,1.386294,Low,1.098612,Cat_6,D


## Scaling Numerical Columns

In [63]:
# initializing the MinMaxScaler
scaler = MinMaxScaler()

# selecting columns to be scaled
columns_to_scale = ["Age", "Work_Experience", "Family_Size"]

# scaling columns to the range [0, 1]
processed_train_data[columns_to_scale] = scaler.fit_transform(
    processed_train_data[columns_to_scale]
)

# printing
processed_train_data

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,Female,No,0.271869,Yes,Artist,0.811368,Low,0.000000,Cat_6,B
1,Male,Yes,0.582310,Yes,Entertainment,0.255958,Average,0.569323,Cat_7,D
2,Male,No,0.856534,No,Lawyer,0.255958,Low,0.251930,Cat_6,D
3,Female,Yes,0.659555,No,Doctor,0.594316,Low,0.000000,Cat_4,C
4,Male,No,0.225925,Yes,Healthcare,0.000000,Low,0.430677,Cat_6,D
...,...,...,...,...,...,...,...,...,...,...
1833,Female,Yes,0.201661,Yes,Healthcare,0.947157,High,0.251930,Cat_6,B
1834,Female,Yes,0.445648,Yes,Engineer,0.255958,Average,0.430677,Cat_4,B
1835,Male,Yes,0.582310,Yes,Artist,0.000000,High,0.569323,Cat_6,D
1836,Male,Yes,0.249307,Yes,Healthcare,0.511916,Low,0.251930,Cat_6,D


## Training the Model

In [64]:
# preparing features & target for the entire training set
X = processed_train_data.drop("Segmentation", axis=1)
y = processed_train_data["Segmentation"]

# doing one-hot encoding on categorical variables
X = pd.get_dummies(X, drop_first=True)

# initializing & training the OvO SVM classifier on full training set
ovo_svm = SVC(kernel="rbf", decision_function_shape="ovo", random_state=42)
ovo_svm.fit(X, y)

## Pre-Processing Testing Dataset

In [65]:
# preparing testing dataset
processed_test_data = og_test_data.copy()

# same preprocessing steps to the test data
fill_missing_based_on_skewness(processed_test_data, "Work_Experience")
fill_missing_based_on_skewness(processed_test_data, "Family_Size")

fill_missing_categorical(processed_test_data, "Var_1")
fill_missing_categorical(processed_test_data, "Gender")
fill_missing_categorical(processed_test_data, "Ever_Married")
fill_missing_categorical(processed_test_data, "Graduated")
fill_missing_categorical(processed_test_data, "Profession")
fill_missing_categorical(processed_test_data, "Spending_Score")

transform_if_skewed(processed_test_data, "Work_Experience")
transform_if_skewed(processed_test_data, "Family_Size")
transform_if_skewed(processed_test_data, "Age")

# scaling test dataset with same scaler
columns_to_scale = ["Age", "Work_Experience", "Family_Size"]
processed_test_data[columns_to_scale] = scaler.transform(
    processed_test_data[columns_to_scale]
)

Filled missing 'Work_Experience' with median: 1.0

Filled missing 'Family_Size' with median: 2.0

Filled missing 'Var_1' with mode: Cat_6

Filled missing 'Ever_Married' with mode: Yes

Filled missing 'Graduated' with mode: Yes

Filled missing 'Profession' with mode: Artist

Skewness of 'Work_Experience': 1.5263872720417662
Applied log transformation to 'Work_Experience' due to skewness.

Skewness of 'Family_Size': 1.2005644040644883
Applied log transformation to 'Family_Size' due to skewness.

Skewness of 'Age': 0.6667761490536537
Applied log transformation to 'Age' due to skewness.



## Making Predictions

In [66]:
# preparing the test features
X_test = processed_test_data.drop(["ID"], axis=1)
X_test = pd.get_dummies(X_test, drop_first=True)

# ensure that X_test has the same columns as X
for col in X.columns:
    if col not in X_test.columns:
        X_test[col] = 0
X_test = X_test[X.columns]

# now making predictions on the test set
test_predictions = ovo_svm.predict(X_test)

# creating a df with the predictions as asked
output_df = pd.DataFrame({"ID": og_test_data["ID"], "predicted": test_predictions})

# saving to a file
output_df.to_csv("ovo.csv", index=False)

print("Predictions saved to 'ovo.csv'")

Predictions saved to 'ovo.csv'
