### Data Input

In [12]:
import pandas as pd

# some rows is broken, I fixed it by hand... because just double tab problem

data_df = pd.read_csv("../data/ionosphere.data", header=None)
data_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.00000,0.03760,...,-0.51171,0.41078,-0.46168,0.21266,-0.34090,0.42267,-0.54487,0.18641,-0.45300,g
1,1,0,1.00000,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.00000,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.19040,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,b
2,1,0,1.00000,-0.03365,1.00000,0.00485,1.00000,-0.12062,0.88965,0.01198,...,-0.40220,0.58984,-0.22145,0.43100,-0.17365,0.60436,-0.24180,0.56045,-0.38238,g
3,1,0,1.00000,-0.45161,1.00000,1.00000,0.71216,-1.00000,0.00000,0.00000,...,0.90695,0.51613,1.00000,1.00000,-0.20099,0.25682,1.00000,-0.32382,1.00000,b
4,1,0,1.00000,-0.02401,0.94140,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.13290,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346,1,0,0.83508,0.08298,0.73739,-0.14706,0.84349,-0.05567,0.90441,-0.04622,...,-0.04202,0.83479,0.00123,1.00000,0.12815,0.86660,-0.10714,0.90546,-0.04307,g
347,1,0,0.95113,0.00419,0.95183,-0.02723,0.93438,-0.01920,0.94590,0.01606,...,0.01361,0.93522,0.04925,0.93159,0.08168,0.94066,-0.00035,0.91483,0.04712,g
348,1,0,0.94701,-0.00034,0.93207,-0.03227,0.95177,-0.03431,0.95584,0.02446,...,0.03193,0.92489,0.02542,0.92120,0.02242,0.92459,0.00442,0.92697,-0.00577,g
349,1,0,0.90608,-0.01657,0.98122,-0.01989,0.95691,-0.03646,0.85746,0.00110,...,-0.02099,0.89147,-0.07760,0.82983,-0.17238,0.96022,-0.03757,0.87403,-0.16243,g


### Data Preprocessing

In [13]:
from sklearn.utils import shuffle

# data shuffle
data_df = shuffle(data_df)

### Support Vector Machine - Linear Kernel

In [14]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
from prettytable import PrettyTable

# Do Holdout validation with ratio 7:3
training_df = data_df.iloc[:246, :]
testing_df = data_df.iloc[246:, :]

linear_training_sample_df = training_df.drop(34, 1)
linear_training_target_df = training_df[34]

# model training
clf = SVC(kernel='linear')
clf.fit(linear_training_sample_df, linear_training_target_df)

# test data preparation
linear_testing_sample_df = testing_df.drop(34, 1)
linear_testing_target_df = testing_df[34]

# result output
predict_list = clf.predict(linear_testing_sample_df)

linear_confusion_matrix = confusion_matrix(linear_testing_target_df, predict_list)
linear_accuracy = accuracy_score(linear_testing_target_df, predict_list)
linear_recall = recall_score(linear_testing_target_df, predict_list, average='macro')
linear_precision = precision_score(linear_testing_target_df, predict_list, average='macro')

# print table
x = PrettyTable()

x.field_names = ["Confusion Matrix", "Accuracy", "Recall", "Precision"]
x.add_row([linear_confusion_matrix, linear_accuracy, linear_recall, linear_precision])
print(x)

+------------------+--------------------+--------------------+--------------------+
| Confusion Matrix |      Accuracy      |       Recall       |     Precision      |
+------------------+--------------------+--------------------+--------------------+
|     [[25 14]     | 0.8380952380952381 | 0.7977855477855478 | 0.8555194805194806 |
|     [ 3 63]]     |                    |                    |                    |
+------------------+--------------------+--------------------+--------------------+


### Support Vector Machine - Polynomial Kernel

In [17]:
from sklearn.model_selection import KFold

# Get sample and target
poly_training_sample_df = training_df.drop(34, 1)
poly_training_target_df = training_df[34]
poly_testing_sample_df = testing_df.drop(34, 1)
poly_testing_target_df = testing_df[34]

# define parameter set for grid search
degree_set = [3, 6, 12]
gamma_set = [0.001, 0.01, 0.1]
coef0_set = [0.5, 5, 50]

parameter_permutation = {}
for d in degree_set:
    for g in gamma_set:
        for c in coef0_set:
            title = "degree: " + str(d) + ", gamma: " + str(g) + ", coef0: " + str(c)
            parameter_permutation[title] = [d, g, c]

# Do grid search to find the best parameter combination
# I judge by the accuracy
# parameter_list: [degree, gamma, coef0]

def Poly_K_fold_average_score(df, parameter_list):
    kf = KFold(n_splits = 5)
    
    accuracy_list = []
    
    for train_index, test_index in kf.split(df):
        train_df = df.iloc[train_index]
        training_sample = train_df.drop(34, 1)
        training_target = train_df[34]
        
        test_df = df.iloc[test_index]
        testing_sample = test_df.drop(34, 1)
        testing_target = test_df[34]
        
        clf = SVC(kernel='poly', degree = parameter_list[0], gamma = parameter_list[1], coef0 = parameter_list[2])
        clf.fit(training_sample, training_target)
        
        predict = clf.predict(testing_sample)
        accuracy_list.append(accuracy_score(testing_target, predict))
    
    return (sum(accuracy_list) / 5)


parameter_score = {}
for key in parameter_permutation:
    parameter_list = parameter_permutation[key]
    parameter_score[key] = Poly_K_fold_average_score(training_df, parameter_list)

# find the maximum score of parameter combination
max_score = -1
max_key = ""
for key in parameter_score:
    if parameter_score[key] > max_score:
        max_score = parameter_score[key]
        max_key = key

# model training
clf = SVC(kernel='poly', degree = parameter_permutation[max_key][0], gamma = parameter_permutation[max_key][1], coef0 =  parameter_permutation[max_key][2])
clf.fit(poly_training_sample_df, poly_training_target_df)

# result output
predict_list = clf.predict(poly_testing_sample_df)

poly_confusion_matrix = confusion_matrix(poly_testing_target_df, predict_list)
poly_accuracy = accuracy_score(poly_testing_target_df, predict_list)
poly_recall = recall_score(poly_testing_target_df, predict_list, average='macro')
poly_precision = precision_score(poly_testing_target_df, predict_list, average='macro')

# print table
x = PrettyTable()

x.field_names = ["Confusion Matrix", "Accuracy", "Recall", "Precision"]
x.add_row([poly_confusion_matrix, poly_accuracy, poly_recall, poly_precision])
print(x)

# Question
x = PrettyTable()
x.field_names = ["Parameter", "5-Fold Average Accuracy"]
for key in parameter_score:
    x.add_row([key, parameter_score[key]])
print(x)

+------------------+--------------------+-------------------+--------------------+
| Confusion Matrix |      Accuracy      |       Recall      |     Precision      |
+------------------+--------------------+-------------------+--------------------+
|     [[28 11]     | 0.8952380952380953 | 0.858974358974359 | 0.9285714285714286 |
|     [ 0 66]]     |                    |                   |                    |
+------------------+--------------------+-------------------+--------------------+
+--------------------------------------+-------------------------+
|              Parameter               | 5-Fold Average Accuracy |
+--------------------------------------+-------------------------+
| degree: 3, gamma: 0.001, coef0: 0.5  |    0.6462040816326531   |
|  degree: 3, gamma: 0.001, coef0: 5   |    0.8779591836734694   |
|  degree: 3, gamma: 0.001, coef0: 50  |    0.8781224489795918   |
|  degree: 3, gamma: 0.01, coef0: 0.5  |    0.7599999999999999   |
|   degree: 3, gamma: 0.01, coef0

### Support Vector Machine - Polynomial Kernel

In [19]:
# Get sample and target
RBF_training_sample_df = training_df.drop(34, 1)
RBF_training_target_df = training_df[34]
RBF_testing_sample_df = testing_df.drop(34, 1)
RBF_testing_target_df = testing_df[34]

# define parameter set for grid search
gamma_set = [0.001, 0.01, 0.1]

parameter_list = {}

for g in gamma_set:
    title = "gamma: " + str(g)
    parameter_list[title] = g
    
# Do grid search to find the best parameter combination
# I judge by the accuracy
# parameter: gamma

def RBF_K_fold_average_score(df, parameter):
    kf = KFold(n_splits = 5)
    
    accuracy_list = []
    
    for train_index, test_index in kf.split(df):
        train_df = df.iloc[train_index]
        training_sample = train_df.drop(34, 1)
        training_target = train_df[34]
        
        test_df = df.iloc[test_index]
        testing_sample = test_df.drop(34, 1)
        testing_target = test_df[34]
        
        clf = SVC(kernel='rbf', gamma = parameter)
        clf.fit(training_sample, training_target)
        
        predict = clf.predict(testing_sample)
        accuracy_list.append(accuracy_score(testing_target, predict))
    
    return (sum(accuracy_list) / 5)

parameter_score = {}
for key in parameter_list:
    parameter = parameter_list[key]
    parameter_score[key] = RBF_K_fold_average_score(training_df, parameter)

# find the maximum score of parameter 
max_score = -1
max_key = ""
for key in parameter_score:
    if parameter_score[key] > max_score:
        max_score = parameter_score[key]
        max_key = key

# model training
clf = SVC(kernel='rbf', gamma = parameter_list[max_key])
clf.fit(RBF_training_sample_df, RBF_training_target_df)

# result output
predict_list = clf.predict(RBF_testing_sample_df)

RBF_confusion_matrix = confusion_matrix(RBF_testing_target_df, predict_list)
RBF_accuracy = accuracy_score(RBF_testing_target_df, predict_list)
RBF_recall = recall_score(RBF_testing_target_df, predict_list, average='macro')
RBF_precision = precision_score(RBF_testing_target_df, predict_list, average='macro')

# print table
x = PrettyTable()

x.field_names = ["Confusion Matrix", "Accuracy", "Recall", "Precision"]
x.add_row([RBF_confusion_matrix, RBF_accuracy, RBF_recall, RBF_precision])
print(x)

# Question
x = PrettyTable()
x.field_names = ["Parameter", "5-Fold Average Accuracy"]
for key in parameter_score:
    x.add_row([key, parameter_score[key]])
print(x)

+------------------+--------------------+--------------------+-------------------+
| Confusion Matrix |      Accuracy      |       Recall       |     Precision     |
+------------------+--------------------+--------------------+-------------------+
|     [[32  7]     | 0.9333333333333333 | 0.9102564102564102 | 0.952054794520548 |
|     [ 0 66]]     |                    |                    |                   |
+------------------+--------------------+--------------------+-------------------+
+--------------+-------------------------+
|  Parameter   | 5-Fold Average Accuracy |
+--------------+-------------------------+
| gamma: 0.001 |    0.6462040816326531   |
| gamma: 0.01  |    0.8496326530612244   |
|  gamma: 0.1  |    0.9348571428571428   |
+--------------+-------------------------+
