# Exercise 2.4
Author: Sebastian Pritz

## Preparation

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.svm import SVC

from pyMLaux import plot_2d_prediction

In [3]:
# Directory prefix for the input CSV files. It is prepended to file names
# as a plain string, so the trailing slash is required.
data_dir = "src/"

## Normal Data

In [4]:
# Load the first dataset from the data directory and display it
dataset_path = f"{data_dir}Dataset1.csv"
data = pd.read_csv(dataset_path)
data

Unnamed: 0,x1,x2,y
0,0.914806,0.885118,1
1,0.937075,0.517111,1
2,0.286140,0.851931,0
3,0.830448,0.442796,1
4,0.641746,0.157880,1
...,...,...,...
195,0.053129,0.490750,0
196,0.531874,0.965256,1
197,0.112308,0.906942,0
198,0.743188,0.551251,1


In [59]:
def apply_grid_search(X, y, param_grid=None, test_size=0.3, cv=10, random_state=1337):
    """Tune an SVC with a cross-validated grid search on a training split.

    The data is split into a training and a held-out part; only the training
    part is used for the grid search. The held-out part is not evaluated here.

    Parameters
    ----------
    X : feature matrix (anything accepted by ``train_test_split``).
    y : target labels aligned with ``X``.
    param_grid : list of dict or dict, optional
        Parameter grid passed to ``GridSearchCV``. When ``None``, a linear
        kernel and an RBF kernel are searched over ``C`` in
        ``[1, 4, 16, 64, 256]`` (and ``gamma`` in ``[0.01, 0.001, 0.0001]``
        for RBF).
    test_size : float, default 0.3
        Share of the data that is held out by ``train_test_split``.
    cv : int, default 10
        Cross-validation setting forwarded to ``GridSearchCV``.
    random_state : int, default 1337
        Seed for the train/test split so the split is reproducible.

    Returns
    -------
    tuple
        ``(best_score_, best_estimator_, cv_results)`` where ``cv_results``
        is ``GridSearchCV.cv_results_`` wrapped in a ``pandas.DataFrame``.
        Note that the second element is the fitted estimator object, not a
        plain parameter dict.
    """
    # Only the training portion is needed; the held-out portion is discarded.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    if param_grid is None:
        param_grid = [
            {'C': [1, 4, 16, 64, 256], 'kernel': ['linear']},
            {'C': [1, 4, 16, 64, 256], 'gamma': [0.01, 0.001, 0.0001], 'kernel': ['rbf']},
        ]

    # n_jobs=-1 lets scikit-learn evaluate the candidates in parallel.
    svc_grid_search = GridSearchCV(SVC(), param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
    svc_grid_search.fit(X_train, y_train)

    return (
        svc_grid_search.best_score_,
        svc_grid_search.best_estimator_,
        pd.DataFrame(svc_grid_search.cv_results_),
    )

In [62]:
# Every column except the last is passed as a feature; the last column is the label.
X_normal = data.iloc[:, :-1]
y_normal = data.iloc[:, -1]
score, parameters, cv_results = apply_grid_search(X_normal, y_normal)
print(score, parameters)
cv_results

0.9071428571428571 SVC(C=4, kernel='linear')


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,param_gamma,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001901,0.0003742159,0.0009,0.0003747968,1,linear,,"{'C': 1, 'kernel': 'linear'}",0.928571,0.857143,...,0.785714,0.857143,0.928571,0.857143,0.857143,1.0,0.857143,0.892857,0.065854,9
1,0.001616,0.0003188032,0.000953,0.0003530398,4,linear,,"{'C': 4, 'kernel': 'linear'}",0.928571,0.857143,...,0.857143,0.857143,0.928571,0.857143,0.928571,1.0,0.857143,0.907143,0.055787,1
2,0.001601,0.0003000077,0.000899,0.0002998639,16,linear,,"{'C': 16, 'kernel': 'linear'}",0.928571,0.857143,...,0.857143,0.857143,0.928571,0.857143,0.928571,1.0,0.857143,0.9,0.04738,7
3,0.0013,0.0003317908,0.00085,0.0002286469,64,linear,,"{'C': 64, 'kernel': 'linear'}",0.928571,0.928571,...,0.857143,0.857143,0.928571,0.857143,0.928571,1.0,0.857143,0.907143,0.045737,1
4,0.002149,0.000319795,0.001,8.996946e-07,256,linear,,"{'C': 256, 'kernel': 'linear'}",0.928571,0.928571,...,0.857143,0.857143,0.928571,0.857143,0.928571,1.0,0.857143,0.907143,0.045737,1
5,0.0019,0.0004353668,0.0009,0.000300058,1,rbf,0.01,"{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}",0.642857,0.642857,...,0.571429,0.571429,0.571429,0.571429,0.571429,0.571429,0.571429,0.592857,0.032733,12
6,0.00145,0.000150279,0.000751,0.0002509163,1,rbf,0.001,"{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}",0.642857,0.642857,...,0.571429,0.571429,0.571429,0.571429,0.571429,0.571429,0.571429,0.592857,0.032733,12
7,0.001602,0.0003738797,0.000951,0.0002683933,1,rbf,0.0001,"{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}",0.642857,0.642857,...,0.571429,0.571429,0.571429,0.571429,0.571429,0.571429,0.571429,0.592857,0.032733,12
8,0.00185,0.0002289581,0.001049,0.0003495156,4,rbf,0.01,"{'C': 4, 'gamma': 0.01, 'kernel': 'rbf'}",0.928571,0.857143,...,0.642857,0.857143,0.785714,0.785714,0.928571,0.785714,0.785714,0.814286,0.079539,11
9,0.001751,0.0004037353,0.001,0.0003871477,4,rbf,0.001,"{'C': 4, 'gamma': 0.001, 'kernel': 'rbf'}",0.642857,0.642857,...,0.571429,0.571429,0.571429,0.571429,0.571429,0.571429,0.571429,0.592857,0.032733,12


## Noisy data

In [54]:
# Create noisy dataset: append 10 columns (x3 .. x12) of uniform random noise.
# A seeded generator is used so that the noise columns, and every result
# derived from them, are identical after Restart & Run All.
rng = np.random.default_rng(1337)
noisy_data = data.copy(deep=True)
for i in range(10):
    noisy_data[f"x{i+3}"] = rng.uniform(size=noisy_data.shape[0])

# Move the label column "y" back to the end, because later cells select the
# features and the label by position (last column = label).
cols = [col for col in noisy_data.columns if col != "y"]
noisy_data = noisy_data[cols + ["y"]]
noisy_data

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,y
0,0.914806,0.885118,0.654874,0.566485,0.741465,0.274348,0.411234,0.288198,0.815190,0.762923,0.952080,0.893468,1
1,0.937075,0.517111,0.878151,0.163102,0.673953,0.697528,0.612304,0.902049,0.567415,0.274501,0.434412,0.609641,1
2,0.286140,0.851931,0.014905,0.999931,0.569159,0.332122,0.861005,0.868259,0.202417,0.772833,0.826927,0.724725,0
3,0.830448,0.442796,0.231061,0.994449,0.484634,0.210943,0.470065,0.580222,0.183163,0.249513,0.295766,0.168151,1
4,0.641746,0.157880,0.778534,0.138109,0.544111,0.614616,0.786482,0.932671,0.589753,0.073445,0.748046,0.431293,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.053129,0.490750,0.584001,0.995700,0.620230,0.726955,0.517594,0.363721,0.910276,0.418493,0.340151,0.738222,0
196,0.531874,0.965256,0.193740,0.850563,0.921282,0.890952,0.952668,0.697984,0.768596,0.404544,0.405616,0.936134,1
197,0.112308,0.906942,0.970892,0.903322,0.899959,0.571127,0.570785,0.777716,0.953050,0.448917,0.818507,0.673018,0
198,0.743188,0.551251,0.596862,0.274259,0.460474,0.653878,0.060860,0.435061,0.831358,0.667371,0.579748,0.385065,1


In [63]:
# Every column except the last is passed as a feature; the last column is the label.
X_noisy = noisy_data.iloc[:, :-1]
y_noisy = noisy_data.iloc[:, -1]
score, parameters, cv_results = apply_grid_search(X_noisy, y_noisy)
print(score, parameters)
cv_results

0.8928571428571429 SVC(C=256, gamma=0.01)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,param_gamma,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001949,0.000417,0.001,1e-06,1,linear,,"{'C': 1, 'kernel': 'linear'}",0.928571,0.857143,...,0.714286,0.785714,0.857143,0.857143,0.928571,1.0,0.857143,0.871429,0.076931,5
1,0.00145,0.000271,0.000893,0.000437,4,linear,,"{'C': 4, 'kernel': 'linear'}",0.928571,0.857143,...,0.785714,0.785714,0.928571,0.928571,0.928571,1.0,0.857143,0.885714,0.065465,2
2,0.001651,0.000232,0.00095,0.000269,16,linear,,"{'C': 16, 'kernel': 'linear'}",0.928571,0.928571,...,0.785714,0.714286,0.928571,0.928571,0.857143,1.0,0.857143,0.864286,0.092857,7
3,0.003701,0.001764,0.00105,0.00035,64,linear,,"{'C': 64, 'kernel': 'linear'}",0.928571,0.857143,...,0.785714,0.642857,0.928571,0.928571,0.857143,1.0,0.857143,0.857143,0.095831,9
4,0.0051,0.001337,0.001051,0.000271,256,linear,,"{'C': 256, 'kernel': 'linear'}",0.928571,0.928571,...,0.785714,0.714286,0.928571,0.928571,0.857143,1.0,0.857143,0.871429,0.083299,4
5,0.002051,0.000269,0.001299,0.000245,1,rbf,0.01,"{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}",0.642857,0.642857,...,0.571429,0.571429,0.571429,0.571429,0.571429,0.571429,0.571429,0.592857,0.032733,12
6,0.00185,0.000391,0.00115,0.00032,1,rbf,0.001,"{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}",0.642857,0.642857,...,0.571429,0.571429,0.571429,0.571429,0.571429,0.571429,0.571429,0.592857,0.032733,12
7,0.00185,0.000391,0.00105,0.000472,1,rbf,0.0001,"{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}",0.642857,0.642857,...,0.571429,0.571429,0.571429,0.571429,0.571429,0.571429,0.571429,0.592857,0.032733,12
8,0.0018,0.000458,0.001,0.000224,4,rbf,0.01,"{'C': 4, 'gamma': 0.01, 'kernel': 'rbf'}",0.928571,0.857143,...,0.571429,0.857143,0.785714,0.714286,0.857143,0.785714,0.714286,0.778571,0.098198,11
9,0.0018,0.0004,0.00085,0.00032,4,rbf,0.001,"{'C': 4, 'gamma': 0.001, 'kernel': 'rbf'}",0.642857,0.642857,...,0.571429,0.571429,0.571429,0.571429,0.571429,0.571429,0.571429,0.592857,0.032733,12


## Observations

As the tables above show, for the normal data several parameter configurations share the highest cross-validated accuracy of about 90.7%. This is not the case for the noisy data: every configuration is at least about 1.4 percentage points behind that value (the best one reaches about 89.3%), and rank 1 is no longer shared.