In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("Dataset_CDS/Social_Network_Ads.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [4]:
df.drop(columns=['User ID'], inplace=True)

In [5]:
df.isnull().sum()

Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [6]:
df.duplicated().sum()

np.int64(20)

In [7]:
# Remove duplicate rows from the dataframe and modify the original dataframe
df.drop_duplicates(inplace=True)

In [8]:
df.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


In [9]:
# Split the dataset into features (x) and target variable (y)
# Remove the 'Purchased' column from features
x = df.drop(columns=['Purchased'])
# Set the target variable as the 'Purchased' column
y = df['Purchased']

In [10]:
from sklearn.model_selection import train_test_split
# - x: features/independent variables
# - y: target/dependent variable
# - random_state=13: set seed for reproducible results
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=13)

In [11]:
from sklearn.compose import ColumnTransformer  # Combines transformers for different column types
from sklearn.preprocessing import (
    OneHotEncoder,  # For converting categorical variables into binary vectors
    StandardScaler,  # For standardizing features by removing mean and scaling to unit variance
)

In [12]:
# Create a ColumnTransformer to preprocess different columns differently
trans = ColumnTransformer(transformers=[
    # Apply OneHotEncoder to 'Gender' column, converting categorical data to numerical
    # sparse_output=False returns a dense array instead of sparse matrix
    # drop='first' removes the first category to avoid multicollinearity
    ('tf1',OneHotEncoder(sparse_output=False,drop='first'),['Gender']),
    
    # Apply StandardScaler to 'Age' and 'EstimatedSalary' columns
    # This standardizes numerical features to have mean=0 and variance=1
    ('tf2',StandardScaler(),['Age','EstimatedSalary'])
],
# Keep any remaining columns unchanged
remainder= 'passthrough')

In [13]:
# scale only x_train and x_test. Do not scale y_train and y_test
x_train_scaled = trans.fit_transform(x_train) # Scale the training data using the transformer and fit it to this data
x_test_scaled = trans.transform(x_test) # Scale the test data using the same transformer (without fitting to avoid data leakage)

In [14]:
from sklearn.svm import SVC

# 1. Initialise the SVM model 
svm_model = SVC(C=1.0, # C=1.0: Regularization parameter. Controls the trade-off between decision boundary and misclassification
    kernel='rbf', # kernel='rbf': Radial Basis Function kernel, good for non-linear data
    degree=3, # degree=3: Degree of polynomial kernel function (only relevant when kernel='poly')
    gamma='scale', # gamma='scale': Kernel coefficient. 'scale' means 1/(n_features * X.var())
    random_state=42) # Uses default parameters with only random_state set for reproducibility

# 2. Train the model on the scaled training data. 
# The fit method learns the relationship between features (x_train_scaled) and target (y_train)
svm_model.fit(x_train_scaled, y_train)

In [15]:
# Generate predictions on the unseen testing data
y_pred_svm = svm_model.predict(x_test_scaled)

In [16]:
from sklearn.metrics import classification_report, accuracy_score

# Calculate and print overall accuracy
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Test Accuracy: {accuracy_svm:.4f}\n")

# Print the detailed Classification Report
print(classification_report(y_test, y_pred_svm))

SVM Test Accuracy: 0.9211

              precision    recall  f1-score   support

           0       0.98      0.89      0.93        46
           1       0.85      0.97      0.91        30

    accuracy                           0.92        76
   macro avg       0.91      0.93      0.92        76
weighted avg       0.93      0.92      0.92        76



In [17]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Define the steps for the pipeline (Scaler -> SVM)
pipe_svm = Pipeline([
    ('scaler', StandardScaler()),
    # We use SVC, the Support Vector Classifier
    ('svc', SVC(random_state=42)) 
])

In [18]:
# The hyperparameters to search for the SVM model
param_grid_svm = {
    # Testing different regularization strengths
    'svc__C': [0.1, 1, 10, 100], # C controls the penalty for misclassification - higher values mean stricter boundaries
    
    # Testing different influences for the RBF kernel
    'svc__gamma': [0.001, 0.01, 0.1, 1], 
    
    # Testing different kernel types
    'svc__kernel': ['rbf', 'linear'] 
    # rbf: Radial Basis Function for non-linear relationships
    # linear: For linearly separable data
}

In [19]:
from sklearn.model_selection import GridSearchCV

# Initialize Grid Search with the SVM pipeline and the parameter grid
grid_search_svm = GridSearchCV(
    estimator=pipe_svm, # The SVM pipeline to optimise
    param_grid=param_grid_svm, # Dictionary of parameters to search through
    cv=5,                 # Use 5-fold cross-validation
    scoring='accuracy',   # Optimise for overall accuracy
    n_jobs=-1             # Use all available CPU cores
) 

# Run the search on the training data (X_train and y_train)
# This will train multiple SVM models with different hyperparameters
# and find the best combination based on cross-validation performance
grid_search_svm.fit(x_train_scaled, y_train) 

In [20]:
# 1. Print the best hyperparameters found
# This displays the optimal combination of parameters that achieved the highest score
print("Best Hyperparameters Found:")
print(grid_search_svm.best_params_)

# 2. Print the score achieved by those parameters (CV score)
# This shows the mean cross-validation score achieved with the best parameters
print(f"\nBest Cross-Validation Score: {grid_search_svm.best_score_:.4f}")

Best Hyperparameters Found:
{'svc__C': 100, 'svc__gamma': 0.1, 'svc__kernel': 'rbf'}

Best Cross-Validation Score: 0.8980


In [21]:
# Get the best model from grid search
# The best_estimator_ is the full Pipeline (Scaler + Tuned SVC)
best_svm_model = grid_search_svm.best_estimator_

# Use the best model to make predictions on the test set
# Generate predictions using the best model
y_pred_tuned_svm = best_svm_model.predict(x_test_scaled)

In [22]:
from sklearn.metrics import accuracy_score, classification_report

# Calculate and print overall accuracy
final_accuracy_svm = accuracy_score(y_test, y_pred_tuned_svm)
print(f"\nFINAL TUNED SVM TEST ACCURACY: {final_accuracy_svm:.4f}")

# Print the detailed Classification Report

# Precision: ratio of correct positive predictions to total predicted positives
# Recall: ratio of correct positive predictions to all actual positives
# F1-score: harmonic mean of precision and recall
# Support: number of occurrences of each class in the test set
print("\nCLASSIFICATION REPORT:")
print(classification_report(y_test, y_pred_tuned_svm))


FINAL TUNED SVM TEST ACCURACY: 0.9211

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.98      0.89      0.93        46
           1       0.85      0.97      0.91        30

    accuracy                           0.92        76
   macro avg       0.91      0.93      0.92        76
weighted avg       0.93      0.92      0.92        76

