"""
Name - Matrikelnummer 
1) Pham, Ngoc Anh Trung - 7176267
2) Viktor Vironski - 4330455
3) Andy Disser - 5984875

Exercise Sheet 5
"""

In [30]:
from random import random
from sklearn import cluster, datasets
from mpl_toolkits.axes_grid1 import make_axes_locatable
import numpy as np
import copy
import matplotlib.pyplot as plt
import matplotlib as mpl

In [31]:
from sklearn.svm import SVC

# Exercise 1

In [32]:
def softmargin(data, labels, supportvec):
    """
    Compute the soft margin score of a one-dimensional threshold classifier.

    The two support vectors define a decision threshold halfway between
    them. Samples strictly below the threshold are assigned to class 0, all
    other samples (including those lying exactly on the threshold) to the
    merged class {1, 2}.

    Inputs:
    - data: 1-D array with one feature value per sample
    - labels: array of class labels (0, 1 or 2) aligned with data
    - supportvec: indices into data of the two support vectors; the first
      one is the "left" (class 0) vector, the second one the "right"
      (class 1 or 2) vector

    Output:
    - the soft margin score ( # misclassification * (-margin) ). It is zero
      when no sample is misclassified and negative otherwise. The result has
      the same shape as data[supportvec[0]].
    """
    data = np.asarray(data)
    labels = np.asarray(labels)

    # Left and right support vectors
    left = data[supportvec[0]]
    right = data[supportvec[1]]

    # The threshold is the plain midpoint of the two support vectors. Taking
    # its absolute value would move it to the wrong side of the data whenever
    # the feature values are negative.
    th = (left + right) / 2

    # Distance from a support vector to the threshold
    margin = np.abs(left - th)

    # Class 0 samples on or to the right of the threshold are misclassified.
    # ">=" is used so that a sample lying exactly on the threshold is judged
    # the same way as in the prediction step (threshold - x <= 0 -> class 1/2).
    class0_on_wrong_side = np.logical_and(data >= th, labels == 0)

    # Class 1/2 samples strictly to the left of the threshold are misclassified
    class12_on_wrong_side = np.logical_and(
        data < th, np.logical_or(labels == 1, labels == 2)
    )

    # The two masks are disjoint, so their union is the set of all errors
    num_of_missclass = np.count_nonzero(
        np.logical_or(class0_on_wrong_side, class12_on_wrong_side)
    )

    # Weight each misclassified sample with the negative margin
    return np.multiply(num_of_missclass, -margin)

In [33]:
# Import and access the data
iris = datasets.load_iris()
iris_data = iris['data']

# Total number of samples in the dataset
n = iris_data.shape[0]

# Number of samples used for training; the remaining ones are used for testing
N_TRAIN = 100

# Seed the global NumPy generator so that the random split and every later
# random draw give the same result after "Restart Kernel -> Run All"
SEED = 0
np.random.seed(SEED)

# Extract the petal widths
data = iris_data[:, 3]

# Create a permutation of 0, 1, ..., n-1. Comparing it against N_TRAIN yields
# a boolean mask with exactly N_TRAIN True entries at random positions.
permutation = np.random.permutation(n)
train_filter = permutation < N_TRAIN
test_filter = ~train_filter

# Split the data: N_TRAIN samples for training, the rest for testing
train_data = data[train_filter]
test_data = data[test_filter]

# The respective labels
train_target = iris.target[train_filter]
test_target = iris.target[test_filter]

# ============= 1b =================================

# Number of random support-vector pairs that are evaluated
N_TRIALS = 20

best_margin_score = -np.inf
# No pair has been evaluated yet; the first trial replaces this placeholder
# because any finite score is greater than -inf
best_supp_vec_indices = None

# The indices of the train data (sized from the data, not hard-coded)
indices = np.arange(train_data.shape[0])

# The candidate pools do not change between trials, so build them once
left_candidates = indices[train_target == 0]
right_candidates = indices[np.logical_or(train_target == 1, train_target == 2)]

for trial in range(N_TRIALS):

    # Pick 2 random points, one from class 0 and one from the merged class 1/2
    supp_vec_left = np.random.choice(left_candidates, 1, replace=False)
    supp_vec_right = np.random.choice(right_candidates, 1, replace=False)

    supp_vec_indices = np.array([supp_vec_left, supp_vec_right])

    # Compute score
    score = softmargin(train_data, train_target, supp_vec_indices)

    # Keep the pair with the highest (least negative) score seen so far
    if score > best_margin_score:
        best_margin_score = score
        best_supp_vec_indices = supp_vec_indices

# ============ 1b Test the data ========================================

# Decision threshold: midpoint between the two selected support vectors
left_sv = train_data[best_supp_vec_indices[0]]
right_sv = train_data[best_supp_vec_indices[1]]
th = (left_sv + right_sv) / 2

# Samples strictly below the threshold are predicted as class/label 1,
# all remaining samples as class/label 2
predicted_label1 = test_data < th
predicted_label2 = test_data >= th

# Ground truth: target 0 is label 1, targets 1 and 2 together form label 2
truth_label1 = test_target == 0
truth_label2 = (test_target == 1) | (test_target == 2)

# A sample is correct when prediction and truth agree on either label
correctly_classified_1 = predicted_label1 & truth_label1
correctly_classified_2 = predicted_label2 & truth_label2
correct_mask = correctly_classified_1 | correctly_classified_2

correct_classified_test_data = test_data[correct_mask]

# Fraction of correctly classified test samples
accuracy = correct_classified_test_data.size / test_data.size
print("Accuracy:", accuracy)

Accuracy: 1.0


# Exercise 2

In [4]:
from sklearn import datasets
# Reload the iris dataset for Exercise 2.
# NOTE: `data` is rebound here to the complete feature matrix; it replaces the
# single-column `data` that Exercise 1 sliced out of the same dataset.
iris = datasets.load_iris()
data = iris.data

In [5]:
# Load the dataset once more, this time unpacked into two variables.
# NOTE(review): X and y are not referenced by any later cell shown in this
# notebook; the split below works on `data` and `iris.target` instead.
X, y = datasets.load_iris(return_X_y=True)

# Split the dataset

In [9]:
# Random ordering of the sample indices 0..149; every index appears exactly once
random_inidices = np.random.permutation(150)

In [10]:
# The first 105 shuffled indices select the training samples ...
training_inices = random_inidices[:105]
# ... and the remaining 45 select the test samples
test_indices = random_inidices[105:]

In [12]:
# Get the training data for the randomly picked indices. Indexing with the
# whole index array copies all selected rows in one step, in the order of
# training_inices, and sizes the result from the index array itself instead
# of a hard-coded (105, 4) buffer.
training_data = data[training_inices]

In [13]:
# Get the training targets for the randomly picked indices in one indexing
# step. The cast to float keeps the dtype the previous element-wise copy
# into an np.zeros buffer produced.
training_target = iris.target[training_inices].astype(float)

In [19]:
# Get the test data for the randomly picked indices. Indexing with the whole
# index array copies all selected rows in one step and sizes the result from
# the index array itself instead of a hard-coded (45, 4) buffer.
test_data = data[test_indices]

In [21]:
# Get the test targets for the randomly picked indices in one indexing step.
# The cast to float keeps the dtype the previous element-wise copy into an
# np.zeros buffer produced.
test_target = iris.target[test_indices].astype(float)

# 5-fold cross-validation

In [14]:
# Hyper-parameter values explored by the cross-validation below
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'gamma': [0.01, 0.1, 1, 10],
    'kernel': ['rbf'],
}

In [18]:
# Number of cross-validation folds. np.split requires the training set to be
# divisible into this many equally sized parts.
N_FOLDS = 5

# Mean validation accuracy per (C, gamma) pair: rows follow param_grid["C"],
# columns follow param_grid["gamma"]
accuracy_matrix = np.zeros((len(param_grid["C"]), len(param_grid["gamma"])))

# The fold partition does not depend on C or gamma, so it is built once
fold_data_arrays = np.split(training_data, N_FOLDS)
fold_target_arrays = np.split(training_target, N_FOLDS)

# iterate over all C-gamma combinations and all folds
for c_index, c in enumerate(param_grid["C"]):
    for g_index, gamma in enumerate(param_grid["gamma"]):

        # validation accuracy of every fold for this (C, gamma) pair
        acc_folds = np.zeros(N_FOLDS)

        for fold_i in range(N_FOLDS):

            # the i-th fold is held out for validation
            fold_i_test_d = fold_data_arrays[fold_i]
            fold_i_test_t = fold_target_arrays[fold_i]

            # all other folds together form the training data of this round
            fold_i_training_d = np.concatenate(
                fold_data_arrays[:fold_i] + fold_data_arrays[fold_i + 1:]
            )
            fold_i_training_t = np.concatenate(
                fold_target_arrays[:fold_i] + fold_target_arrays[fold_i + 1:]
            )

            # fit an RBF-kernel SVM and record its accuracy on the held-out fold
            fold_model = SVC(C=c, gamma=gamma, kernel="rbf")
            fold_model.fit(fold_i_training_d, fold_i_training_t)
            acc_folds[fold_i] = fold_model.score(fold_i_test_d, fold_i_test_t)

        # calculate the average accuracy over the folds
        accuracy_matrix[c_index, g_index] = np.mean(acc_folds)

# best values for C and gamma: translate the flat argmax position back into a
# (row, column) pair using the matrix' own shape, so the lookup stays correct
# when the number of gamma values in param_grid changes
best_c_index, best_gamma_index = np.unravel_index(
    np.argmax(accuracy_matrix), accuracy_matrix.shape
)
best_C = param_grid["C"][best_c_index]
best_gamma = param_grid["gamma"][best_gamma_index]

print(f"The accuracy of the C-gamma combinations is shown in the following matrix (C,gamma):\n\n\
{accuracy_matrix}")
print()
print(f"The highest accuracy is {np.amax(accuracy_matrix)}. \n\
It was achieved with (C,gamma) = {(best_C,best_gamma)}")

The accuracy of the C-gamma combinations is shown in the following matrix (C,gamma):

[[0.22857143 0.22857143 0.22857143 0.22857143]
 [0.22857143 0.22857143 0.22857143 0.22857143]
 [0.37142857 0.80952381 0.93333333 0.24761905]
 [0.8952381  0.95238095 0.95238095 0.92380952]
 [0.96190476 0.98095238 0.94285714 0.92380952]]

The highest accuracy is 0.980952380952381. 
It was achieved with (C,gamma) = (10, 0.1)


# Train the final model on the whole training data with the optimal parameters from cross-validation

In [26]:
# Final classifier configured with the hyper-parameters selected above
final_model = SVC(
    kernel='rbf',
    C=best_C,
    gamma=best_gamma,
)

In [27]:
# Fit on the complete training split; the fitted estimator is the cell output
final_model.fit(X=training_data, y=training_target)

SVC(C=10, gamma=0.1)

In [28]:
# Accuracy of the final model on the held-out test split
final_model.score(X=test_data, y=test_target)

0.9777777777777777