This notebook aims to evaluate **SVM and Naive Bayes** models on a Multiclass Classification task. To this end, I generated a dataset with 10,000 records and 50 features. In the first step, SVM and NB are trained. Then, the dataset is manipulated by randomly removing 30% of instances. In the next step, these two models are trained by considering two approaches (**mean values and KNN**) for re-filling Null values and the results will be compared.

# Libraries

In [10]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.svm import SVC
import random
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import cross_val_score
from time import perf_counter
import warnings
warnings.filterwarnings("ignore")

# Dataset

In [11]:
# Function to create a synthetic dataset
def create_dataset(n_samples, n_features, n_classes):
    X, y = make_classification(n_samples=n_samples,
                               n_features=n_features,
                               n_informative=n_features // n_classes,
                               n_redundant=0,
                               n_repeated=0,
                               n_classes=n_classes,
                               n_clusters_per_class=1,
                               flip_y=0.05,
                               random_state=42)

    # Convert to DataFrame for better visualization and manipulation
    feature_names = [f'feature_{i}' for i in range(n_features)]
    X_df = pd.DataFrame(X, columns=feature_names)
    y_df = pd.Series(y, name='label')

    # Concatenate features and class into one DataFrame
    dataset = pd.concat([X_df, y_df], axis=1)

    return dataset

#SVM

In [12]:
# Parameters
n_samples = 10000
n_features = 50
n_classes = 10

df = create_dataset(n_samples, n_features, n_classes)

In [13]:
df.describe(include='all')

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,label
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,-0.006882,0.00436,-0.19423,-0.003997,-0.011749,-0.008294,-0.001278,0.000378,-0.006762,-0.007168,...,0.005367,0.008656,-0.003031,-0.001679,-0.011657,0.000332,0.000878,-0.002812,-0.006343,4.4996
std,1.005203,1.009075,1.707802,1.004381,0.997901,0.994115,1.009734,1.005786,1.002352,0.988752,...,0.999945,0.999602,1.002058,0.999806,1.008513,1.004605,1.005839,1.005575,1.009058,2.86929
min,-3.491607,-3.639184,-6.912668,-4.472781,-3.616691,-4.173146,-3.667499,-3.944594,-4.30141,-3.511157,...,-3.529397,-3.913958,-3.703641,-3.65308,-3.781503,-3.681594,-3.836206,-3.469909,-3.629025,0.0
25%,-0.682893,-0.674533,-1.378398,-0.682673,-0.682458,-0.665803,-0.691063,-0.684564,-0.669468,-0.674767,...,-0.678632,-0.666935,-0.679963,-0.686076,-0.686592,-0.671482,-0.671581,-0.694552,-0.689322,2.0
50%,-0.014699,0.00462,-0.124142,0.003681,0.001497,-0.005867,-0.002502,-0.001999,-0.006615,-0.007863,...,-0.000708,0.001333,-0.017645,0.003059,-0.013765,0.011962,0.000576,-0.000231,-0.007669,4.0
75%,0.673137,0.682097,1.024628,0.670774,0.654282,0.66007,0.679423,0.669158,0.679411,0.651704,...,0.664737,0.670974,0.662086,0.682546,0.663479,0.675082,0.67646,0.697289,0.66157,7.0
max,3.60037,3.672114,5.345042,3.653042,4.694473,3.958176,3.568655,4.100342,4.616726,3.806026,...,3.820403,4.186634,3.791336,3.260061,3.432086,4.253332,3.980531,3.667844,4.010534,9.0


In [14]:
print(df.shape)
print(df.dtypes)
print("missing values in df: \n",sum(df.isna().sum()))

(10000, 51)
feature_0     float64
feature_1     float64
feature_2     float64
feature_3     float64
feature_4     float64
feature_5     float64
feature_6     float64
feature_7     float64
feature_8     float64
feature_9     float64
feature_10    float64
feature_11    float64
feature_12    float64
feature_13    float64
feature_14    float64
feature_15    float64
feature_16    float64
feature_17    float64
feature_18    float64
feature_19    float64
feature_20    float64
feature_21    float64
feature_22    float64
feature_23    float64
feature_24    float64
feature_25    float64
feature_26    float64
feature_27    float64
feature_28    float64
feature_29    float64
feature_30    float64
feature_31    float64
feature_32    float64
feature_33    float64
feature_34    float64
feature_35    float64
feature_36    float64
feature_37    float64
feature_38    float64
feature_39    float64
feature_40    float64
feature_41    float64
feature_42    float64
feature_43    float64
feature_44    float6

In [15]:
X = df.drop(["label"], axis = 1)
y = df.loc[:, "label"].values

In [16]:
# Split Data into training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [17]:
#grid search in training phase by considering cross valodation
kernel_trick = ["linear", "rbf", "sigmoid", "poly"]
C = [0.1, 1, 10, 100]
for i in C:
  print("C=", i)
  for j in kernel_trick:
    svc = SVC(kernel=j, C=i, gamma='scale')
    scores = cross_val_score(svc, X_train[:1000], y_train[:1000], cv=5,
                             scoring="accuracy")
    print("Kernel = ", j, "Average_accuracy =", scores.mean())
  print("--------------------------------------------------------")

C= 0.1
Kernel =  linear Average_accuracy = 0.471
Kernel =  rbf Average_accuracy = 0.10800000000000001
Kernel =  sigmoid Average_accuracy = 0.422
Kernel =  poly Average_accuracy = 0.10800000000000001
--------------------------------------------------------
C= 1
Kernel =  linear Average_accuracy = 0.41
Kernel =  rbf Average_accuracy = 0.51
Kernel =  sigmoid Average_accuracy = 0.487
Kernel =  poly Average_accuracy = 0.442
--------------------------------------------------------
C= 10
Kernel =  linear Average_accuracy = 0.392
Kernel =  rbf Average_accuracy = 0.495
Kernel =  sigmoid Average_accuracy = 0.45599999999999996
Kernel =  poly Average_accuracy = 0.442
--------------------------------------------------------
C= 100
Kernel =  linear Average_accuracy = 0.398
Kernel =  rbf Average_accuracy = 0.495
Kernel =  sigmoid Average_accuracy = 0.41600000000000004
Kernel =  poly Average_accuracy = 0.442
--------------------------------------------------------


In [18]:
# training phase
start_tra = perf_counter()
svc = SVC(kernel='rbf', C=1)
svc.fit(X_train, y_train)
end_tra = perf_counter()

In [19]:
# Accuracy on Train
train_score_SVC = round(svc.score(X_train, y_train), 2)
print("The Training Accuracy is: ", train_score_SVC)

# Accuracy on Test
test_score_SVC = round(svc.score(X_test, y_test), 2)
print("The Testing Accuracy is: ", test_score_SVC)

#training time
train_time_SVC = round((end_tra-start_tra), 4)
print(f'train phase time: ', train_time_SVC)

The Training Accuracy is:  0.84
The Testing Accuracy is:  0.63
train phase time:  4.4729


#Naive Bayes

In [20]:
# training phase
start_tra = perf_counter()
nb = GaussianNB()
nb.fit(X_train, y_train)
end_tra = perf_counter()

In [21]:
# Accuracy on Train
train_score_NB = round(nb.score(X_train, y_train), 2)
print("The Training Accuracy is: ", train_score_NB)

# Accuracy on Test
test_score_NB = round(nb.score(X_test, y_test), 2)
print("The Testing Accuracy is: ", test_score_NB)

#training time
train_time_NB = round((end_tra-start_tra), 4)
print(f'train phase time: ', train_time_NB)

The Training Accuracy is:  0.56
The Testing Accuracy is:  0.52
train phase time:  0.0216


#SVM + Mean

In [22]:
df_nullified = df.copy()
# Assuming df is your DataFrame and 'label' is your target column.

# Set the fraction of data you want to replace with NaN (e.g., 30%)
fraction_of_missing = 0.3

# Columns where you want to introduce NaN values (except 'label')
columns_to_have_nans = [col for col in df_nullified.columns if col != 'label']

for col in columns_to_have_nans:
    # Number of elements to replace per column
    n_elements_to_replace = int(len(df_nullified[col]) * fraction_of_missing)

    # Randomly choose indices to NaN out
    indices_to_replace = random.sample(range(df_nullified[col].count()), k=n_elements_to_replace)

    # Replace data with NaN
    df_nullified.loc[indices_to_replace, col] = np.nan

In [23]:
df_nullified.describe(include='all')

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,label
count,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,...,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,10000.0
mean,-0.00882,0.008236,-0.210363,0.001495,-0.015346,-0.004217,0.001939,0.000214,-0.013271,0.004009,...,0.008898,0.002049,-0.008176,0.00124,-0.008774,0.0029,-0.003414,0.000385,-0.001225,4.4996
std,1.00095,1.016084,1.723167,1.005203,0.994744,0.999077,1.003869,1.009413,1.003152,0.99119,...,0.995194,0.999689,1.008917,1.001707,1.003029,1.009839,0.995127,1.007316,1.005492,2.86929
min,-3.491607,-3.639184,-6.912668,-4.472781,-3.351793,-4.173146,-3.667499,-3.944594,-3.879069,-3.511157,...,-3.529397,-3.913958,-3.328572,-3.65308,-3.781503,-3.681594,-3.836206,-3.469909,-3.551747,0.0
25%,-0.687563,-0.677249,-1.407991,-0.688527,-0.684827,-0.663789,-0.685625,-0.697768,-0.668757,-0.673149,...,-0.668757,-0.67627,-0.685679,-0.697551,-0.680245,-0.67363,-0.672016,-0.693774,-0.687316,2.0
50%,-0.012262,-0.001462,-0.133656,0.013796,0.005219,1.3e-05,-0.002502,-0.007651,-0.013388,0.003992,...,-0.001719,-0.007371,-0.030615,-0.001108,-0.01597,0.024934,-0.002717,-0.004945,-0.007669,4.0
75%,0.671898,0.682585,1.031289,0.684532,0.636783,0.664695,0.67974,0.673613,0.679036,0.664235,...,0.671655,0.65953,0.655884,0.682546,0.664419,0.68554,0.666123,0.698627,0.661363,7.0
max,3.448158,3.672114,5.345042,3.566419,3.422754,3.958176,3.568655,4.100342,4.616726,3.806026,...,3.820403,4.186634,3.791336,3.260061,3.432086,4.253332,3.980531,3.667844,4.010534,9.0


In [24]:
print("missing values in df_nullified: \n",sum(df_nullified.isna().sum()))

missing values in df_nullified: 
 150000


In [25]:
# Imputing missing values by mean
df_imputed_mean = df_nullified.copy()
for column in df_imputed_mean.columns:
  df_imputed_mean[column].fillna(df_imputed_mean[column].mean(), inplace=True)

In [26]:
print("missing values in df4_imputed_mean: \n",sum(df_imputed_mean.isna().sum()))

missing values in df4_imputed_mean: 
 0


In [27]:
X = df_imputed_mean.drop(["label"], axis = 1)
y = df_imputed_mean.loc[:, "label"].values

In [28]:
# Split Data into training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [32]:
#grid search in training phase by considering cross valodation
kernel_trick = ["linear", "rbf", "sigmoid", "poly"]
C = [0.1, 1, 2, 3, 4, 5]
for i in C:
  print("C=", i)
  for j in kernel_trick:
    svc_mean = SVC(kernel=j, C=i, gamma='scale')
    scores = cross_val_score(svc_mean, X_train[:1000], y_train[:1000], cv=5,
                             scoring="accuracy")
    print("Kernel = ", j, "Average_accuracy =", scores.mean())
  print("--------------------------------------------------------")

C= 0.1
Kernel =  linear Average_accuracy = 0.32300000000000006
Kernel =  rbf Average_accuracy = 0.10800000000000001
Kernel =  sigmoid Average_accuracy = 0.285
Kernel =  poly Average_accuracy = 0.10800000000000001
--------------------------------------------------------
C= 1
Kernel =  linear Average_accuracy = 0.294
Kernel =  rbf Average_accuracy = 0.33899999999999997
Kernel =  sigmoid Average_accuracy = 0.311
Kernel =  poly Average_accuracy = 0.297
--------------------------------------------------------
C= 2
Kernel =  linear Average_accuracy = 0.281
Kernel =  rbf Average_accuracy = 0.33199999999999996
Kernel =  sigmoid Average_accuracy = 0.315
Kernel =  poly Average_accuracy = 0.29500000000000004
--------------------------------------------------------
C= 3
Kernel =  linear Average_accuracy = 0.279
Kernel =  rbf Average_accuracy = 0.337
Kernel =  sigmoid Average_accuracy = 0.321
Kernel =  poly Average_accuracy = 0.292
--------------------------------------------------------
C= 4
Kerne

In [33]:
# training phase
start_tra = perf_counter()
svc_mean = SVC(kernel='rbf', C=1)
svc_mean.fit(X_train, y_train)
end_tra = perf_counter()

In [34]:
# Accuracy on Train
train_score_SVC_mean = round(svc_mean.score(X_train, y_train), 2)
print("The Training Accuracy is: ", train_score_SVC_mean)

# Accuracy on Test
test_score_SVC_mean = round(svc_mean.score(X_test, y_test), 2)
print("The Testing Accuracy is: ", test_score_SVC_mean)

#training time
train_time_SVC_mean = round((end_tra-start_tra), 4)
print(f'train phase time: ', train_time_SVC_mean)

The Training Accuracy is:  0.75
The Testing Accuracy is:  0.42
train phase time:  7.4288


#Naive Bayes + Mean

In [36]:
# training phase
start_tra = perf_counter()
nb_mean = GaussianNB()
nb_mean.fit(X_train, y_train)
end_tra = perf_counter()

In [37]:
# Accuracy on Train
train_score_NB_mean = round(nb_mean.score(X_train, y_train), 2)
print("The Training Accuracy is: ", train_score_NB_mean)

# Accuracy on Test
test_score_NB_mean = round(nb_mean.score(X_test, y_test), 2)
print("The Testing Accuracy is: ", test_score_NB_mean)

#training time
train_time_NB_mean = round((end_tra-start_tra), 4)
print(f'train phase time: ', train_time_NB_mean)

The Training Accuracy is:  0.43
The Testing Accuracy is:  0.39
train phase time:  0.0169


# SVM + KNN

In [38]:
# for imputing by KNN, in the first step, we should scale the dataset because the KNN is sensitive to different scope
mms = MinMaxScaler()

for col in df.columns:
  if col != 'label':
    df_nullified[col] = mms.fit_transform(df_nullified[[col]]).squeeze()

In [39]:
# Imputing missing values by KNN Imputer

# Instantiate the KNNImputer
imputer = KNNImputer(n_neighbors=5)

# This will return a numpy array
imputed_data = imputer.fit_transform(df_nullified)

# If we want to keep the DataFrame format:
df_imputed_KNN = pd.DataFrame(data=imputed_data, columns=df_nullified.columns)

In [40]:
print("missing values in df_imputed_KNN: \n",sum(df_imputed_KNN.isna().sum()))

missing values in df_imputed_KNN: 
 0


In [41]:
X = df_imputed_KNN.drop(["label"], axis = 1)
y = df_imputed_KNN.loc[:, "label"].values

In [42]:
# Split Data into training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [43]:
#grid search in training phase by considering cross valodation
kernel_trick = ["linear", "rbf", "sigmoid", "poly"]
C = [0.1, 0.5, 0.5, 1, 1.5]
for i in C:
  print("C=", i)
  for j in kernel_trick:
    svc_KNN = SVC(kernel=j, C=i, gamma='scale')
    scores = cross_val_score(svc_KNN, X_train[:1000], y_train[:1000], cv=5,
                             scoring="accuracy")
    print("Kernel = ", j, "Average_accuracy =", scores.mean())
  print("--------------------------------------------------------")

C= 0.1
Kernel =  linear Average_accuracy = 0.182
Kernel =  rbf Average_accuracy = 0.10800000000000001
Kernel =  sigmoid Average_accuracy = 0.10800000000000001
Kernel =  poly Average_accuracy = 0.458
--------------------------------------------------------
C= 0.5
Kernel =  linear Average_accuracy = 0.502
Kernel =  rbf Average_accuracy = 0.499
Kernel =  sigmoid Average_accuracy = 0.10800000000000001
Kernel =  poly Average_accuracy = 0.45600000000000007
--------------------------------------------------------
C= 0.5
Kernel =  linear Average_accuracy = 0.502
Kernel =  rbf Average_accuracy = 0.499
Kernel =  sigmoid Average_accuracy = 0.10800000000000001
Kernel =  poly Average_accuracy = 0.45600000000000007
--------------------------------------------------------
C= 1
Kernel =  linear Average_accuracy = 0.502
Kernel =  rbf Average_accuracy = 0.515
Kernel =  sigmoid Average_accuracy = 0.10800000000000001
Kernel =  poly Average_accuracy = 0.45600000000000007
-----------------------------------

In [44]:
# training phase
start_tra = perf_counter()
svc_KNN = SVC(kernel='rbf', C=1.5)
svc_KNN.fit(X_train, y_train)
end_tra = perf_counter()

In [45]:
# Accuracy on Train
train_score_SVC_KNN = round(svc_KNN.score(X_train, y_train), 2)
print("The Training Accuracy is: ", train_score_SVC_KNN)

# Accuracy on Test
test_score_SVC_KNN = round(svc_KNN.score(X_test, y_test), 2)
print("The Testing Accuracy is: ", test_score_SVC_KNN)

#training time
train_time_SVC_KNN = round((end_tra-start_tra), 4)
print(f'train phase time: ', train_time_SVC_KNN)

The Training Accuracy is:  0.92
The Testing Accuracy is:  0.66
train phase time:  8.2478


# Naive Bayes + KNN

In [47]:
# training phase
start_tra = perf_counter()
nb_KNN = GaussianNB()
nb_KNN.fit(X_train, y_train)
end_tra = perf_counter()

In [48]:
# Accuracy on Train
train_score_NB_KNN = round(nb_KNN.score(X_train, y_train), 2)
print("The Training Accuracy is: ", train_score_NB_KNN)

# Accuracy on Test
test_score_NB_KNN = round(nb_KNN.score(X_test, y_test), 2)
print("The Testing Accuracy is: ", test_score_NB_KNN)

#training time
train_time_NB_KNN = round((end_tra-start_tra), 4)
print(f'train phase time: ', train_time_NB_KNN)

The Training Accuracy is:  0.64
The Testing Accuracy is:  0.61
train phase time:  0.0163


# Conclusion

In [49]:
Conclusion = pd.DataFrame({'Models :':['SVM', 'Naive Bayes', 'SVM + Mean', 'Naive Bayes + Mean', 'SVM + KNN', 'Naive Bayes + KNN'],
                          'train acc'               :[train_score_SVC,train_score_NB,  train_score_SVC_mean,train_score_NB_mean, train_score_SVC_KNN,  train_score_NB_KNN],
                          'test acc'                :[test_score_SVC, test_score_NB,  test_score_SVC_mean,  test_score_NB_mean,   test_score_SVC_KNN,  test_score_NB_KNN],
                          'training time'           :[train_time_SVC, train_time_NB,  train_time_SVC_mean,  train_time_NB_mean,   train_time_SVC_KNN,  train_time_NB_KNN]})

Conclusion

Unnamed: 0,Models :,train acc,test acc,training time
0,SVM,0.84,0.63,4.4729
1,Naive Bayes,0.56,0.52,0.0216
2,SVM + Mean,0.75,0.42,7.4288
3,Naive Bayes + Mean,0.43,0.39,0.0169
4,SVM + KNN,0.92,0.66,8.2478
5,Naive Bayes + KNN,0.64,0.61,0.0163
