# Building an SVM Classifier for MNIST with Hyperparameter Tuning and Comparative Analysis

In [8]:
# Python ≥3.8 required
import sys
assert sys.version_info >= (3, 8)

# Scikit-Learn ≥1.2 required
import sklearn
from packaging import version
assert version.parse(sklearn.__version__) >= version.parse("1.2.0")

import numpy as np
import pandas as pd
import time

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

from scipy.stats import loguniform

np.random.seed(42)

## Load MNIST
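This cell downloads MNIST from OpenML (the first run requires internet access). `X` has shape (70000, 784) and `y` has shape (70000,). The first 60,000 images form the training pool and the last 10,000 the test set; the training pool is then subsampled (stratified by label) and split into training and validation sets, whose shapes are printed below.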

In [10]:
# Number of training-pool images kept before carving out the validation set.
# This constant was referenced below but never defined anywhere in the notebook
# (NameError on a fresh kernel). 45_000 is inferred from the recorded output
# shapes: 35,000 training rows + 10,000 validation rows.
N_TRAIN = 45_000

# Fetch MNIST (requires internet on the first run)
mnist = fetch_openml('mnist_784', version=1, as_frame=False, parser='auto')
# Targets are cast to uint8 so that labels are numeric.
X, y = mnist['data'], mnist['target'].astype(np.uint8)

# First 60,000 rows form the training pool, the remaining rows are the test set.
X_train_full, X_test, y_train_full, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

# Stratified subsample of the training pool down to N_TRAIN rows; the discarded
# remainder is not used anywhere else in this cell.
X_train_full, _, y_train_full, _ = train_test_split(
    X_train_full, y_train_full,
    train_size=N_TRAIN, stratify=y_train_full, random_state=42
)

# Create a validation split from the (subsampled) training set
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, test_size=10000, stratify=y_train_full, random_state=42
)

print(X_train.shape, X_valid.shape, X_test.shape)

(35000, 784) (10000, 784) (10000, 784)


In [11]:
def fit_and_time(model, X, y):
    """Fit ``model`` on ``(X, y)`` and measure how long the fit takes.

    Parameters
    ----------
    model : object exposing ``fit(X, y)``
        The estimator (or search object) to train; it is fitted in place.
    X, y :
        Training inputs and targets, passed to ``model.fit`` unchanged.

    Returns
    -------
    tuple
        ``(elapsed_seconds, model)`` where ``elapsed_seconds`` is a float and
        ``model`` is the very same object that was passed in.
    """
    # perf_counter is monotonic and high-resolution, so the measured duration
    # cannot be skewed (or turn negative) by system clock adjustments, unlike
    # the wall-clock time.time().
    start = time.perf_counter()
    model.fit(X, y)
    elapsed = time.perf_counter() - start
    return elapsed, model

def evaluate(model, X_train, y_train, X_test, y_test, average='macro'):
    """Score a fitted model on its training data and on a held-out set.

    Parameters
    ----------
    model : fitted object exposing ``predict(X)``
    X_train, y_train :
        Data used only for the ``train_accuracy`` entry.
    X_test, y_test :
        Data used for ``test_accuracy``, ``precision``, ``recall`` and ``f1``.
    average : str, default 'macro'
        Averaging mode forwarded to the precision / recall / F1 scorers.

    Returns
    -------
    dict
        Keys, in order: ``train_accuracy``, ``test_accuracy``, ``precision``,
        ``recall``, ``f1``.
    """
    held_out_pred = model.predict(X_test)
    fitted_pred = model.predict(X_train)

    scores = {
        'train_accuracy': accuracy_score(y_train, fitted_pred),
        'test_accuracy': accuracy_score(y_test, held_out_pred),
    }
    # The three averaged scorers share one call signature; zero_division=0
    # scores a class with an undefined ratio as 0.
    averaged_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for key, scorer in averaged_scorers:
        scores[key] = scorer(y_test, held_out_pred, average=average, zero_division=0)
    return scores

def print_report(model, X_test, y_test):
    """Print a classification report (4 decimal places) for ``model``'s
    predictions on ``X_test`` compared against ``y_test``."""
    report_text = classification_report(y_test, model.predict(X_test), digits=4)
    print(report_text)

## SVM (Linear Kernel)

In [12]:
# Linear SVM: scale each feature (without centring), then fit LinearSVC.
# NOTE(review): the scikit-learn docs recommend dual=False when
# n_samples > n_features, which is the case here - worth trying to cut the
# training time; left unchanged so the recorded results stay valid.
linear_steps = [
    ('scaler', StandardScaler(with_mean=False)),
    ('svm', LinearSVC(C=1.0, max_iter=2000, dual=True, random_state=42)),
]
linear_pipeline = Pipeline(linear_steps)

# Fit once, then score on the training and test sets.
train_time, linear_model = fit_and_time(linear_pipeline, X_train, y_train)
linear_metrics = evaluate(linear_model, X_train, y_train, X_test, y_test)

print('Linear SVM training time (s):', round(train_time, 2))
linear_scores = pd.Series(linear_metrics)
print(linear_scores.round(4))



Linear SVM training time (s): 183.89
train_accuracy    0.9295
test_accuracy     0.9067
precision         0.9056
recall            0.9058
f1                0.9054
dtype: float64


## SVM (Polynomial Kernel) + Grid Search

In [13]:
# Polynomial-kernel SVM tuned with an exhaustive grid search.
poly_svc = SVC(kernel='poly', probability=False, random_state=42)
poly_pipeline = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),
    ('svm', poly_svc),
])

# 3 x 3 x 3 = 27 candidate settings; the 'svm__' prefix routes each parameter
# to the pipeline step named 'svm'.
param_grid_poly = dict(
    svm__C=[0.1, 1, 10],
    svm__degree=[2, 3, 4],
    svm__coef0=[0.0, 0.5, 1.0],
)

grid_poly = GridSearchCV(
    poly_pipeline,
    param_grid_poly,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
# The measured time covers the whole search, not a single model fit.
train_time_poly, grid_poly = fit_and_time(grid_poly, X_train, y_train)

best_poly = grid_poly.best_estimator_
poly_metrics = evaluate(best_poly, X_train, y_train, X_test, y_test)

print('Best Polynomial SVM params:', grid_poly.best_params_)
print('Poly SVM GridSearch training time (s):', round(train_time_poly, 2))
poly_scores = pd.Series(poly_metrics)
print(poly_scores.round(4))

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Polynomial SVM params: {'svm__C': 10, 'svm__coef0': 0.0, 'svm__degree': 2}
Poly SVM GridSearch training time (s): 6630.69
train_accuracy    0.9972
test_accuracy     0.9730
precision         0.9730
recall            0.9727
f1                0.9728
dtype: float64


## SVM (RBF Kernel) + Randomized Search

In [14]:
# RBF-kernel SVM tuned with a randomized search over C and gamma.
rbf_svc = SVC(kernel='rbf', probability=False, random_state=42)
rbf_pipeline = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),
    ('svm', rbf_svc),
])

# Both hyperparameters are sampled from log-uniform distributions.
param_dist_rbf = dict(
    svm__C=loguniform(1e-3, 1e3),
    svm__gamma=loguniform(1e-4, 1e-1),
)

rnd_rbf = RandomizedSearchCV(
    rbf_pipeline,
    param_distributions=param_dist_rbf,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
# The measured time covers the whole search, not a single model fit.
train_time_rbf, rnd_rbf = fit_and_time(rnd_rbf, X_train, y_train)

best_rbf = rnd_rbf.best_estimator_
rbf_metrics = evaluate(best_rbf, X_train, y_train, X_test, y_test)

print('Best RBF SVM params:', rnd_rbf.best_params_)
print('RBF SVM RandomizedSearch training time (s):', round(train_time_rbf, 2))
rbf_scores = pd.Series(rbf_metrics)
print(rbf_scores.round(4))

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best RBF SVM params: {'svm__C': np.float64(98.77700294007911), 'svm__gamma': np.float64(0.0004335281794951569)}
RBF SVM RandomizedSearch training time (s): 10472.86
train_accuracy    0.9999
test_accuracy     0.9656
precision         0.9655
recall            0.9651
f1                0.9653
dtype: float64


## Baseline Classifiers

In [16]:
# Baseline 1: k-nearest neighbours (k=3) on scaled pixels.
knn_steps = [
    ('scaler', StandardScaler(with_mean=False)),
    ('knn', KNeighborsClassifier(n_neighbors=3, n_jobs=-1)),
]
knn_pipeline = Pipeline(knn_steps)

train_time_knn, knn_model = fit_and_time(knn_pipeline, X_train, y_train)
knn_metrics = evaluate(knn_model, X_train, y_train, X_test, y_test)

print('KNN training time (s):', round(train_time_knn, 2))
knn_scores = pd.Series(knn_metrics)
print(knn_scores.round(4))

# Baseline 2: linear classifier trained with SGD on the hinge loss.
sgd_clf = SGDClassifier(loss='hinge', alpha=1e-4, max_iter=2000, random_state=42, n_jobs=-1)
sgd_pipeline = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),
    ('sgd', sgd_clf),
])

train_time_sgd, sgd_model = fit_and_time(sgd_pipeline, X_train, y_train)
sgd_metrics = evaluate(sgd_model, X_train, y_train, X_test, y_test)

print('SGD training time (s):', round(train_time_sgd, 2))
sgd_scores = pd.Series(sgd_metrics)
print(sgd_scores.round(4))

# Baseline 3: random forest with 200 trees, fitted on the unscaled features
# (no scaler step is used for this model).
rf_model = RandomForestClassifier(
    n_estimators=200,
    n_jobs=-1,
    random_state=42,
)
train_time_rf, rf_model = fit_and_time(rf_model, X_train, y_train)
rf_metrics = evaluate(rf_model, X_train, y_train, X_test, y_test)

print('Random Forest training time (s):', round(train_time_rf, 2))
rf_scores = pd.Series(rf_metrics)
print(rf_scores.round(4))

KNN training time (s): 1.26
train_accuracy    0.9668
test_accuracy     0.9384
precision         0.9389
recall            0.9375
f1                0.9378
dtype: float64
SGD training time (s): 34.63
train_accuracy    0.9173
test_accuracy     0.8904
precision         0.8912
recall            0.8885
f1                0.8887
dtype: float64
Random Forest training time (s): 28.05
train_accuracy    1.0000
test_accuracy     0.9642
precision         0.9641
recall            0.9639
f1                0.9640
dtype: float64


## Summary of Results

In [17]:
# One (label, fit time, metrics) entry per trained model.
model_runs = [
    ('SVM Linear', train_time, linear_metrics),
    ('SVM Poly (best)', train_time_poly, poly_metrics),
    ('SVM RBF (best)', train_time_rbf, rbf_metrics),
    ('KNN', train_time_knn, knn_metrics),
    ('SGD', train_time_sgd, sgd_metrics),
    ('RandomForest', train_time_rf, rf_metrics),
]
summary_columns = ['model','train_time_s','train_accuracy','test_accuracy','precision','recall','f1']

summary_records = [
    {'model': label, 'train_time_s': round(seconds, 2), **scores}
    for label, seconds, scores in model_runs
]
# Fix the column order, rank by test accuracy (best first) and renumber rows.
summary = (
    pd.DataFrame(summary_records)[summary_columns]
    .sort_values(by='test_accuracy', ascending=False)
    .reset_index(drop=True)
)
summary

Unnamed: 0,model,train_time_s,train_accuracy,test_accuracy,precision,recall,f1
0,SVM Poly (best),6630.69,0.997171,0.973,0.972957,0.972706,0.97281
1,SVM RBF (best),10472.86,0.999886,0.9656,0.965537,0.965119,0.965284
2,RandomForest,28.05,1.0,0.9642,0.96411,0.963919,0.963981
3,KNN,1.26,0.966771,0.9384,0.938879,0.937488,0.937834
4,SVM Linear,183.89,0.929543,0.9067,0.905637,0.905776,0.905366
5,SGD,34.63,0.917314,0.8904,0.891188,0.888486,0.888695


## Detailed Classification Report (Best Model on Test Set)

In [20]:
# Row 0 of `summary` is the top-ranked model (the table is sorted by test
# accuracy, descending, in the cell that builds it).
# NOTE(review): this picks the "best" model using the test set, so the reported
# test score is optimistic; X_valid is created earlier but not used for this
# ranking - consider selecting on the validation split instead.
best_row = summary.iloc[0]
best_name = best_row['model']

# Map the labels used in the summary table back to the fitted estimators.
name_to_model = {
    'SVM Linear': linear_model,
    'SVM Poly (best)': best_poly,
    'SVM RBF (best)': best_rbf,
    'KNN': knn_model,
    'SGD': sgd_model,
    'RandomForest': rf_model,
}

best_model = name_to_model[best_name]
print('Best model:', best_name)
print_report(best_model, X_test, y_test)

hej
Best model: SVM Poly (best)
              precision    recall  f1-score   support

           0     0.9788    0.9908    0.9848       980
           1     0.9766    0.9921    0.9843      1135
           2     0.9729    0.9729    0.9729      1032
           3     0.9725    0.9792    0.9758      1010
           4     0.9685    0.9695    0.9690       982
           5     0.9740    0.9664    0.9702       892
           6     0.9811    0.9739    0.9775       958
           7     0.9802    0.9640    0.9720      1028
           8     0.9610    0.9610    0.9610       974
           9     0.9641    0.9574    0.9607      1009

    accuracy                         0.9730     10000
   macro avg     0.9730    0.9727    0.9728     10000
weighted avg     0.9730    0.9730    0.9730     10000

