In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn import metrics
from sklearn import preprocessing

train_file = '/kaggle/input/digit-recognizer/train.csv'
test_file = '/kaggle/input/digit-recognizer/test.csv'

# Read input files

In [2]:
df_train = pd.read_csv(train_file)
df_test = pd.read_csv(test_file)

In [3]:
df_train.describe()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
count,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,...,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0
mean,4.456643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.219286,0.117095,0.059024,0.02019,0.017238,0.002857,0.0,0.0,0.0,0.0
std,2.88773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.31289,4.633819,3.274488,1.75987,1.894498,0.414264,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254.0,254.0,253.0,253.0,254.0,62.0,0.0,0.0,0.0,0.0


In [4]:
df_test.describe()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
count,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,...,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.164607,0.073214,0.028036,0.01125,0.006536,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.473293,3.616811,1.813602,1.205211,0.807475,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,253.0,254.0,193.0,187.0,119.0,0.0,0.0,0.0,0.0,0.0


In [5]:
df_train.shape, df_test.shape

((42000, 785), (28000, 784))

# Extract X and y, and split the dataset

In [6]:
# train dataset
X_train = df_train.drop("label", axis=1).values
y_train = df_train["label"].values

# test dataset
X_test = df_test.values

# preprocess X values
X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)

# split train into partial train and holdout (to run test on train dataset)
X_train_partial, X_holdout, y_train_partial, y_holdout = train_test_split(X_train, y_train, train_size=.75, random_state=99)

# Dimensionality reduction usind PCA

In [7]:
# fit initial PCA model
pca = PCA(n_components=X_train.shape[1])
pca.fit(X_train)

# find optimal K that keeps 80% of the components
evs = pca.explained_variance_ratio_
evs_sum = 0
keep = 0.9
for k, ev in enumerate(evs):
    evs_sum += ev
    if evs_sum >= keep:
        break
print('K =', k)

# fit PCA model
n_components = k
pca = PCA(n_components=n_components)
pca.fit(X_train)

K = 228


PCA(n_components=228)

In [8]:
# transform the dataset
Theta_train_partial = pca.transform(X_train_partial)
Theta_holdout = pca.transform(X_holdout)
Theta_train = pca.transform(X_train)
Theta_test = pca.transform(X_test)

# Find the optimal hyper parameters

In [9]:
Cs = [0.5,1,5,10,15,20,30]
kernels = ['rbf','poly']
gammas = ['scale', 'auto']
results = {'error': 1.}

for kernel in kernels:
    for gamma in gammas:
        for C in Cs:
            
            # train with poly kernel
            model = svm.SVC(kernel=kernel, gamma=gamma, C=C)
            model.fit(Theta_train_partial, y_train_partial)
            error = 1 - model.score(Theta_holdout, y_holdout)
            print(f'Validating: kernel = {kernel}, gamma = {gamma}, C = {C}, error = {error}')
            
            # store the best C
            if error < results['error']:
                results = {'kernel': kernel, 'gamma': gamma, 'C': C, 'error': error}


Validating: kernel = rbf, gamma = scale, C = 0.5, error = 0.04390476190476189
Validating: kernel = rbf, gamma = scale, C = 1, error = 0.037047619047618996
Validating: kernel = rbf, gamma = scale, C = 5, error = 0.031333333333333324
Validating: kernel = rbf, gamma = scale, C = 10, error = 0.030761904761904768
Validating: kernel = rbf, gamma = scale, C = 15, error = 0.031714285714285695
Validating: kernel = rbf, gamma = scale, C = 20, error = 0.03142857142857147
Validating: kernel = rbf, gamma = scale, C = 30, error = 0.03161904761904766
Validating: kernel = rbf, gamma = auto, C = 0.5, error = 0.06647619047619047
Validating: kernel = rbf, gamma = auto, C = 1, error = 0.05047619047619045
Validating: kernel = rbf, gamma = auto, C = 5, error = 0.047904761904761894
Validating: kernel = rbf, gamma = auto, C = 10, error = 0.04819047619047623
Validating: kernel = rbf, gamma = auto, C = 15, error = 0.04847619047619045
Validating: kernel = rbf, gamma = auto, C = 20, error = 0.04847619047619045
Va

In [10]:
# retrain the model using the whole training dataset and the best parameters
model = svm.SVC(kernel=results['kernel'], gamma=results['gamma'], C=results['C'])
model.fit(Theta_train, y_train)

SVC(C=20, kernel='poly')

In [11]:
# calculate accuracy on tain dataset
y_predict = model.predict(Theta_train)
accuracy = metrics.accuracy_score(y_train, y_predict)
print(f'Accuracy = {accuracy:.3f}%')

Accuracy = 0.998%


# Calculate Results for Submission

In [12]:
# predict for the transformed test dataset
y_test = model.predict(Theta_test)
df_result = pd.DataFrame(y_test, columns=['Label'], index=np.arange(1,28001))
df_result.to_csv('subimission.csv', index_label='ImageId')