In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.datasets import load_svmlight_file, load_svmlight_files
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

from src.prepare import *

In [2]:
# Configuration: remote locations of the a9a train/test files and the local data directory.
# Both URLs point into the LIBSVM "binary" datasets collection.
DATA_URL_TRAIN = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a"
DATA_URL_TEST = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a.t"

# Relative directory used by later cells for the downloaded files and the result CSVs.
DATA_DIR = "data/"

In [3]:
# Make sure the data directory exists before anything is written into it.
# exist_ok=True turns this into a single idempotent call: re-running the cell is
# safe, there is no gap between an existence check and the creation, and a
# regular file sitting at DATA_DIR now raises instead of being silently accepted.
os.makedirs(DATA_DIR, exist_ok=True)

Download and store the train and test datasets

Dataset information:

- Source: UCI / Adult
- Preprocessing: The original Adult data set has 14 features, among which six are continuous and eight are categorical. In this data set, continuous features are discretized into quantiles, and each quantile is represented by a binary feature. Also, a categorical feature with m categories is converted to m binary features. 
- Number of classes: 2
- Number of samples: 32,561 (training) / 16,281 (testing)
- Number of features: 123 (training) / 123 (testing)

In [4]:
# Store the training split as DATA_DIR/train_data.txt.
# NOTE(review): download_data comes from the star import of src.prepare and is not
# shown here — presumably it fetches the URL and writes it to the given path;
# confirm, including whether it skips the download when the file already exists.
download_data(DATA_URL_TRAIN, os.path.join(DATA_DIR, "train_data.txt"))

In [5]:
# Store the test split as DATA_DIR/test_data.txt.
# NOTE(review): download_data comes from the star import of src.prepare and is not
# shown here — presumably it fetches the URL and writes it to the given path;
# confirm, including whether it skips the download when the file already exists.
download_data(DATA_URL_TEST, os.path.join(DATA_DIR, "test_data.txt"))

Read and parse the dataset

In [6]:
# Parse both downloaded files with a single load_svmlight_files call; the result
# is unpacked as (train features, train labels, test features, test labels).
train_path = os.path.join(DATA_DIR, "train_data.txt")
test_path = os.path.join(DATA_DIR, "test_data.txt")
X_train, y_train, X_test, y_test = load_svmlight_files((train_path, test_path))

# Report the dimensions of the parsed feature matrices and label vectors.
print(f"Train data shape : {X_train.shape}, Train label shape : {y_train.shape}"
      f"\nTest data shape : {X_test.shape}, Test label shape : {y_test.shape}")


Train data shape : (32561, 123), Train label shape : (32561,)
Test data shape : (16281, 123), Test label shape : (16281,)


Train the SVC with the linear kernel

In [7]:
# Build an unfitted linear-kernel SVC purely to inspect the hyper-parameters it carries.
clf_linear_default = SVC(kernel='linear', random_state=1)

print("Default Parameters : ")
# Wrap the params dict in a one-row frame and transpose it: one row per hyper-parameter.
pd.DataFrame([clf_linear_default.get_params()]).transpose()

Default Parameters : 


Unnamed: 0,0
C,1.0
break_ties,False
cache_size,200
class_weight,
coef0,0.0
decision_function_shape,ovr
degree,3
gamma,scale
kernel,linear
max_iter,-1


In [8]:
# Regularisation strengths evaluated for the linear kernel.
C = [0.01, 0.05, 0.1, 0.5, 1]

# Mean 3-fold cross-validated accuracy on the training set, keyed by C.
result_dict_linear = {}
for c in C:
    clf_linear = SVC(kernel='linear', C=c, random_state=1)
    scores = cross_val_score(clf_linear, X_train, y_train, cv=3, scoring='accuracy')
    result_dict_linear[c] = np.mean(scores)

# Two-column table (C, Accuracy) with a default integer index, persisted next to the data.
df_result_linear = (
    pd.Series(result_dict_linear)
    .rename_axis("C")
    .reset_index(name="Accuracy")
)
df_result_linear.to_csv(os.path.join(DATA_DIR, "linear_kernel_results.csv"), index=False)
df_result_linear

Unnamed: 0,C,Accuracy
0,0.01,0.844016
1,0.05,0.846104
2,0.1,0.846442
3,0.5,0.846934
4,1.0,0.84721


Train the SVC with the rbf kernel

In [9]:
# Build an unfitted RBF-kernel SVC purely to inspect the hyper-parameters it carries.
clf_rbf_default = SVC(kernel='rbf', random_state=1)

print("Default Parameters : ")
# Wrap the params dict in a one-row frame and transpose it: one row per hyper-parameter.
pd.DataFrame([clf_rbf_default.get_params()]).transpose()

Default Parameters : 


Unnamed: 0,0
C,1.0
break_ties,False
cache_size,200
class_weight,
coef0,0.0
decision_function_shape,ovr
degree,3
gamma,scale
kernel,rbf
max_iter,-1


In [25]:
# Hyper-parameter grid for the RBF kernel: kernel coefficient (gamma) x regularisation strength (C).
gamma = [0.01, 0.05, 0.1, 0.5, 1, 2]
C = [0.01, 0.05, 0.1, 0.5, 1]

# Nested mapping gamma -> {C -> mean 3-fold cross-validated accuracy on the training set}.
result_dict_rbf = {gamma_value: {} for gamma_value in gamma}

for g in gamma:
    for c in C:
        clf_rbf = SVC(kernel='rbf', gamma=g, C=c, random_state=1)
        scores = cross_val_score(clf_rbf, X_train, y_train, cv=3, scoring='accuracy')
        result_dict_rbf[g][c] = np.mean(scores)

# The outer keys become columns (Gamma) and the inner keys become the row index (C).
df_result_rbf = pd.DataFrame(result_dict_rbf).rename_axis(index="C", columns="Gamma")
df_result_rbf.to_csv(os.path.join(DATA_DIR, "rbf_kernel_results.csv"))
df_result_rbf

Gamma,0.01,0.05,0.10,0.50,1.00,2.00
C,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.01,0.75919,0.819907,0.819846,0.75919,0.75919,0.75919
0.05,0.831209,0.835755,0.83425,0.789165,0.75919,0.75919
0.1,0.83772,0.839655,0.838764,0.806118,0.761985,0.75919
0.5,0.842972,0.845766,0.846811,0.832161,0.789748,0.769295
1.0,0.844415,0.846749,0.847425,0.836614,0.798286,0.777587
