# Pre-process
## Download thundersvm (Colab only: cuda9.0 is needed)
If you are running the code in Colab, you can install thundersvm to speed up the training process. Although Kaggle also provides GPUs, CUDA 9.0 cannot be installed successfully there, because the installation needs more than two additional commands to proceed when downloading on Kaggle. (A single command can be supplied by adding '-[needed command]', but there seems to be no way to supply more than one.)

If you are running the code on Kaggle, please set the flag 'use_thunder' to False.
Note: The uploaded csv file for mnist was produced by thundersvm.

In [4]:
# Switch for the thundersvm code path: when False, the thundersvm install and
# import are skipped and the cells below build scikit-learn SVMs instead.
use_thunder = False

In [None]:
# Install CUDA 9.0 and thundersvm, but only when the thundersvm path is enabled.
# The lines starting with `!` are notebook shell escapes, not Python statements.
if use_thunder:
    # Fetch the local CUDA 9.0 repository package and register it with dpkg.
    !wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
    !dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
    # Add the repository signing key, refresh the package index, install CUDA.
    !apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
    !apt-get update
    !apt-get install cuda=9.0.176-1
    # Finally install the thundersvm Python package itself.
    !pip  install thundersvm

## Import all packages

In [5]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm, metrics
from scipy import io
import cv2
from skimage import feature as skif
from skimage.feature import hog

if use_thunder:
    import thundersvm

if sys.version_info[0] < 3:
    raise Exception("Python 3 not detected.")

## Define commonly used functions 

In [2]:
# Usage: results_to_csv(clf.predict(data_test), data_name)
def results_to_csv(y_test, data_name):
    """Write predicted labels to ``<data_name>_submission.csv``.

    The file has two columns: ``Id`` (1-based row number) and ``Category``
    (the prediction cast to ``int``).

    Args:
        y_test: One-dimensional array-like of predicted labels. Plain Python
            lists are accepted as well as NumPy arrays.
        data_name: Prefix of the output file name; ``'_submission.csv'`` is
            appended to it.
    """
    # np.asarray lets callers pass lists too; ndarrays pass through unchanged.
    y_test = np.asarray(y_test).astype(int)
    df = pd.DataFrame({'Category': y_test})
    df.index += 1  # Ensures that the index starts at 1
    df.to_csv(data_name + '_submission.csv', index_label='Id')
    
def do_standardize(test_data, train_data):
    """Min-max scale the test and training samples jointly into [0, 1].

    Both arrays are flattened to ``(n_samples, n_features)``, stacked, and
    rescaled with the single global minimum and maximum of the stacked data,
    so the two splits share one scale.

    Args:
        test_data: Array of test samples; the first axis indexes samples.
        train_data: Array of training samples; the first axis indexes samples.

    Returns:
        A ``(test, train)`` tuple of 2-D float arrays. If every value in the
        input is identical, both outputs are all zeros instead of NaN.
    """
    test_data = test_data.reshape(len(test_data), -1)
    train_data = train_data.reshape(len(train_data), -1)
    index = len(test_data)
    print(index)
    data = np.concatenate([test_data, train_data], axis=0)
    low = np.min(data)
    span = np.max(data) - low
    if span == 0:
        # Constant input: dividing by the zero range would produce NaN.
        data = np.zeros(data.shape, dtype=float)
    else:
        data = (data - low) / span
    # The first `index` rows are the test samples; the rest are training rows.
    return data[:index, :], data[index:, :]

## Load data

In [6]:
# Please modify the data path to yours
# Load one .npz archive per dataset and keep them in a dict keyed by name.
data_dict = {}
for data_name in ("mnist", "spam", "cifar10"):
    archive_path = f"/kaggle/input/hw1-dataset/{data_name}-data.npz"
    data_dict[data_name] = np.load(archive_path)
    print("loaded %s data!" % data_name)

loaded mnist data!
loaded spam data!
loaded cifar10 data!


# Q2. Data partition

In [7]:
# Shuffle a dataset and split it into a training set and a validation set.
def do_partition(data_name, data, label):
    """Shuffle the samples and split them into training and validation sets.

    The labels are appended to ``data`` as the last column before shuffling,
    so each returned row is ``[features..., label]``.

    Args:
        data_name: Dataset name; it selects the validation size. ``'mnist'``
            holds out 1000 rows, ``'spam'`` holds out 20% of the rows, and any
            other name holds out 5000 rows.
        data: 2-D array-like of shape ``(n_samples, n_features)``.
        label: Array with one label per sample.

    Returns:
        A ``(train_set, val_set)`` tuple. Together the two sets contain every
        input row exactly once.
    """
    label = label.reshape(len(label), 1)
    dataset = np.concatenate([data, label], axis=1)
    # Called without an argument, so the seed is not fixed by this code.
    np.random.seed()
    np.random.shuffle(dataset)
    if data_name == 'mnist':
        index = 1000
    elif data_name == 'spam':
        index = int(len(dataset) * 0.2)
    else:
        index = 5000
    val_set = dataset[:index]
    # Slice to the end: the previous `[index:-1]` dropped the last sample.
    train_set = dataset[index:]
    return train_set, val_set

# Q3. Support Vector Machines

In [None]:
# For every dataset, train a linear SVM on growing prefixes of the shuffled
# training split and plot training / validation accuracy against prefix size.
for data_name in ["mnist", "spam", "cifar10"]:
    data = data_dict[data_name]
    fields = "test_data", "training_data", "training_labels"
    for field in fields:
        print(field, data[field].shape)
    test_data, train_data = do_standardize(data['test_data'], data['training_data'])
    train_set, val_set = do_partition(data_name, train_data, data['training_labels'])
    # do_partition appends the label as the last column of every row.
    train_x = train_set[:, :-1]
    train_y = train_set[:, -1]
    val_x = val_set[:, :-1]
    val_y = val_set[:, -1]
    if data_name == "mnist":
        train_num_list = [100, 200, 500, 1000, 2000, 5000, 10000]
    elif data_name == "spam":
        train_num_list = [100, 200, 500, 1000, 2000, len(train_y)]
    else:
        train_num_list = [100, 200, 500, 1000, 2000, 5000]
    model = svm.LinearSVC(dual=False)  # cancel dual to avoid 'ConvergenceWarning'
    print("***********************************************")
    print("Training model for {}".format(data_name))
    train_acc_list = []
    val_acc_list = []
    for train_num in train_num_list:
        print("Using {} training examples:".format(train_num))
        fit_x = train_x[0:train_num, :]
        fit_y = train_y[0:train_num]
        model.fit(fit_x, fit_y)
        # Score on the rows the model was fitted on. Scoring on the whole
        # split would mix in unseen rows and no longer be a training accuracy.
        train_acc = metrics.accuracy_score(fit_y, model.predict(fit_x))
        val_acc = metrics.accuracy_score(val_y, model.predict(val_x))
        print("\tTrain acc: {}".format(train_acc))
        print("\tVal acc: {}".format(val_acc))
        train_acc_list.append(train_acc)
        val_acc_list.append(val_acc)
    plt.figure()
    plt.title('Train & val acc for {}'.format(data_name))
    plt.plot(train_num_list, train_acc_list, 'bo', label='Training acc')
    plt.plot(train_num_list, val_acc_list, 'b', label='Validation acc')
    plt.xlabel('Number of training examples')
    plt.ylabel('Acc')
    plt.legend()
    plt.savefig(data_name+'.png', bbox_inches='tight')
    plt.show()


# Q4. Hyperparameter Tuning

In [None]:
# Sweep the LinearSVC regularization parameter C over powers of ten on MNIST,
# fitting on the first `train_num` shuffled training rows each time.
data_name = "mnist"
data = data_dict[data_name]
print("\n*********************************************************")
print("\nloaded %s data!" % data_name)
test_data, train_data = do_standardize(data['test_data'], data['training_data'])
train_set, val_set = do_partition(data_name, train_data, data['training_labels'])
# do_partition appends the label as the last column of every row.
train_x = train_set[:, :-1]
train_y = train_set[:, -1]
val_x = val_set[:, :-1]
val_y = val_set[:, -1]

print("Training model for {}".format(data_name))
train_acc_list = []
val_acc_list = []
train_num = 10000
# Rows actually used for fitting; training accuracy is measured on these rows.
fit_x = train_x[0:train_num, :]
fit_y = train_y[0:train_num]
c_exp_list = range(-5, 5)  # C = 10**-5 ... 10**4
for c_pow in c_exp_list:
    c = pow(10, c_pow)
    model = svm.LinearSVC(dual=False, C=c)  # cancel dual to avoid 'ConvergenceWarning'
    print("Regularization param set to {} :".format(c))
    model.fit(fit_x, fit_y)
    # Score on the fitted rows only, so "Train acc" is a true training accuracy.
    train_acc = metrics.accuracy_score(fit_y, model.predict(fit_x))
    val_acc = metrics.accuracy_score(val_y, model.predict(val_x))
    print("\tTrain acc: {}".format(train_acc))
    print("\tVal acc: {}".format(val_acc))
    train_acc_list.append(train_acc)
    val_acc_list.append(val_acc)
plt.figure()
plt.title('Train & val acc for {}'.format(data_name))
plt.plot(c_exp_list, train_acc_list, 'bo', label='Training acc')
plt.plot(c_exp_list, val_acc_list, 'b', label='Validation acc')
plt.xlabel('Exp for c (base:10)')
plt.ylabel('Acc')
plt.legend()
plt.savefig(data_name+'_param_tune.png', bbox_inches='tight')
plt.show()

# Q5. K-fold

In [None]:
# Flatten the samples, attach the labels as the last column, and shuffle the rows.
def make_dataset(data, label):
    """Build one shuffled 2-D array whose last column holds the labels.

    Args:
        data: Array of samples; everything after the first axis is flattened.
        label: Array with one label per sample.

    Returns:
        Array of shape ``(n_samples, n_features + 1)`` with rows in random
        order; each row keeps its own label in the last column.
    """
    flat_samples = data.reshape(len(data), -1)
    label_column = label.reshape(len(label), 1)
    merged = np.concatenate((flat_samples, label_column), axis=1)
    np.random.seed()
    np.random.shuffle(merged)
    return merged

In [None]:
# k-fold cross-validation of the LinearSVC regularization parameter C on the
# spam data: for each C, train k models and average their accuracies.
data_name = "spam"
data = data_dict[data_name]
print("\n*********************************************************")
print("\nloaded %s data!" % data_name)
dataset = make_dataset(data['training_data'], data['training_labels'])
# change k to implement k-fold
k = 5
# Rows per fold; when len(dataset) is not a multiple of k, the leftover rows
# at the end are never used for validation.
fold_size = len(dataset) // k
print("Training model for {}".format(data_name))

train_acc_list = []
val_acc_list = []
c_exp_list = range(-7, 0)  # C = 10**-7 ... 10**-1
for c_pow in c_exp_list:
    c = pow(10, c_pow)
    print("Regularization param set to {} :".format(c))
    tmp_train_acc_list = []
    tmp_val_acc_list = []
    for i in range(k):
        print("\tRound {}".format(i+1))
        fold_start = i * fold_size
        fold_stop = fold_start + fold_size
        val_set = dataset[fold_start:fold_stop, :]
        # Everything outside the validation fold, through the LAST row: the
        # previous `(i+1)*l:-1` slice dropped the final sample in every round.
        train_set = np.concatenate(
            [dataset[:fold_start, :], dataset[fold_stop:, :]], axis=0)
        # make_dataset stores the label in the last column.
        train_x = train_set[:, :-1]
        train_y = train_set[:, -1]
        val_x = val_set[:, :-1]
        val_y = val_set[:, -1]

        model = svm.LinearSVC(dual=False, C=c)  # cancel dual to avoid 'ConvergenceWarning'
        # Fit on the whole training portion of the fold, so the training
        # accuracy below is measured on exactly the rows that were fitted.
        model.fit(train_x, train_y)
        train_acc = metrics.accuracy_score(train_y, model.predict(train_x))
        val_acc = metrics.accuracy_score(val_y, model.predict(val_x))
        print("\t\tTrain acc: {}".format(train_acc))
        print("\t\tVal acc: {}".format(val_acc))
        tmp_train_acc_list.append(train_acc)
        tmp_val_acc_list.append(val_acc)
    avg_train_acc = np.average(tmp_train_acc_list)
    avg_val_acc = np.average(tmp_val_acc_list)
    train_acc_list.append(avg_train_acc)
    val_acc_list.append(avg_val_acc)
    print("\tAverage train acc: {}".format(avg_train_acc))
    print("\tAverage val acc: {}".format(avg_val_acc))
plt.figure()
plt.title('Train & val acc for {}'.format(data_name))
plt.plot(c_exp_list, train_acc_list, 'bo', label='Training acc')
plt.plot(c_exp_list, val_acc_list, 'b', label='Validation acc')
plt.xlabel('Exp for c (base:10)')
plt.ylabel('Acc')
plt.legend()
plt.savefig(data_name + '_' + str(k) + 'fold.png', bbox_inches='tight')
plt.show()

# Q6. Kaggle
## 6.0 Feature extraction

In [8]:
def extract_color_features(img):
    """Return nine color-moment features computed in HSV space.

    The image is converted with ``cv2.COLOR_BGR2HSV`` and, for each of the
    H, S and V channels, three statistics are taken: the mean, the standard
    deviation, and the cube root of the mean absolute cubed deviation from
    the channel mean.

    Returns:
        A list ordered as ``[h, s, v]`` means, then ``[h, s, v]`` standard
        deviations, then ``[h, s, v]`` third-moment roots, each divided by 200.
    """
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    h, s, v = cv2.split(hsv)
    channels = (h, s, v)

    # First moment: per-channel average.
    means = [np.mean(channel) for channel in channels]
    # Second moment: per-channel standard deviation.
    stds = [np.std(channel) for channel in channels]
    # Third moment: the absolute deviation keeps the value non-negative, so
    # the cube root below is always real.
    third_roots = [
        np.mean(abs(channel - channel.mean()) ** 3) ** (1. / 3)
        for channel in channels
    ]

    # div 200 to let model easier to converge
    return [value / 200 for value in means + stds + third_roots]

def extract_lbp_features(img, lbp_radius=1, lbp_point=8):
    """Return a fixed-length, normalized histogram of local binary patterns.

    Args:
        img: 2-D grayscale image.
        lbp_radius: Radius of the circular neighbourhood.
        lbp_point: Number of neighbourhood points; each LBP code therefore
            has ``lbp_point`` bits.

    Returns:
        1-D density histogram with ``2 ** lbp_point`` bins (256 by default).
    """
    lbp = skif.local_binary_pattern(img, lbp_point, lbp_radius, 'default')
    # With the 'default' method the codes span 0 .. 2**lbp_point - 1. Use that
    # full range for every image: sizing the bins from lbp.max() made the
    # feature length depend on the image content.
    n_bins = 2 ** lbp_point
    hist, _ = np.histogram(lbp, density=True, bins=n_bins, range=(0, n_bins))
    return hist

def extract_hog_features(img):
    """Return the HOG descriptor of ``img``, calling ``hog`` with its defaults."""
    descriptor = hog(img)
    return descriptor


def extract_features(data, width=32, height=32, is_rgb=True):
    """Build one hand-crafted feature vector per image.

    Each vector is the concatenation of color-moment features (RGB input
    only), HOG features and an LBP histogram, in that order.

    Args:
        data: Iterable of image arrays. Each must be reshapeable to
            ``(3, width, height)`` when ``is_rgb`` is true, otherwise to
            ``(width, height)``.
        width: First spatial size used when reshaping each image.
        height: Second spatial size used when reshaping each image.
        is_rgb: Whether the images have three color channels.

    Returns:
        A list with one 1-D feature array per input image.
    """
    # np.shape reports the shape without first copying the data into a new
    # array when `data` is already an ndarray.
    print(np.shape(data))
    image_descriptors = []
    for x in data:
        if is_rgb:
            # Channels-first -> channels-last layout expected by OpenCV.
            x = x.reshape(3, width, height).T
            parts = [extract_color_features(x)]
            # HOG and LBP work on a single-channel image.
            x = cv2.cvtColor(x, cv2.COLOR_BGR2GRAY)
        else:
            x = x.reshape(width, height).T
            parts = []
        parts.append(extract_hog_features(x))
        parts.append(extract_lbp_features(x))
        # One concatenation instead of repeated np.append copies.
        image_descriptors.append(np.concatenate(parts))
    return image_descriptors

## 6.1 Mnist

In [9]:
# Train the final MNIST model on the full training set and write a submission.
data_name = "mnist"
data = data_dict[data_name]
print("\nloaded %s data!" % data_name)
# The thundersvm path trains on min-max scaled raw pixels; the scikit-learn
# path trains on the features from extract_features (28x28, non-RGB input).
if use_thunder:
    test_data, train_data = do_standardize(data["test_data"], data['training_data'])
else:
    test_data = extract_features(data["test_data"], 28, 28, False)
    train_data = extract_features(data['training_data'], 28, 28, False)
# Validation split is disabled here: all labelled samples are used for training.
# train_set, val_set = do_partition(data_name, train_data, data['training_labels'])
# train_x = train_set[:, :-1]
# train_y = train_set[:, -1]
# val_x = val_set[:, :-1]
# val_y = val_set[:, -1]
# train_data may be a list of feature vectors, so coerce it to a 2-D array.
train_x = np.array(train_data).reshape(len(train_data), -1)
train_y = data['training_labels']

print("Training model for {}".format(data_name))
train_acc_list = []
val_acc_list = []
# Equal to the number of training rows, so the slices in fit() keep every row.
train_num = len(train_data)
c_list = [7.5]
for c in c_list:
    if use_thunder:
        model = thundersvm.SVC(C=c, gamma=0.03)  # best performance, 0.98233
    else:
        model = svm.SVC(C=c, kernel="poly", degree=3) # 0.97933
#         model = svm.LinearSVC(C=c)
    print("Regularization param {}:".format(c))
    model.fit(train_x[0:train_num, :], train_y[0:train_num])
    train_acc = metrics.accuracy_score(train_y, model.predict(train_x))
#     val_acc = metrics.accuracy_score(val_y, model.predict(val_x))
    print("\tTrain acc: {}".format(train_acc))
#     print("\tVal acc: {}".format(val_acc))
    train_acc_list.append(train_acc)
#     val_acc_list.append(val_acc)
    # One submission file per C value; the value is part of the file name.
    results_to_csv(model.predict(test_data), data_name + "_" + str(c))


loaded mnist data!
(10000, 1, 28, 28)
(60000, 1, 28, 28)
Training model for mnist
Regularization param 7.5:
	Train acc: 0.9935666666666667


# Train model for spam

In [None]:
# Replace the "spam" entry of data_dict with the archive from spam-data-v2.
data_dict["spam"] = np.load(f"/kaggle/input/spam-data-v2/spam-data-2.npz")

In [None]:
# Train an SVM on the spam data for each (C, gamma) pair, report training and
# validation accuracy, and write one submission file per pair.
data_name = "spam"
data = data_dict[data_name]
print("\n*********************************************************")
print("\nloaded %s data!" % data_name)
# Scale both splits together, then hold out part of the training data.
test_data, train_data = do_standardize(data["test_data"], data['training_data'])
train_set, val_set = do_partition(data_name, train_data, data['training_labels'])
# The label sits in the last column of every partitioned row.
train_x, train_y = train_set[:, :-1], train_set[:, -1]
val_x, val_y = val_set[:, :-1], val_set[:, -1]

print("Training model for {}".format(data_name))
train_acc_list, val_acc_list = [], []
# Length before the validation rows were removed, so the slices in fit()
# below keep every row of train_x.
train_num = len(train_data)
# c_list = [90000, 100000, 110000]
# gamma_list = [900, 1000, 2000]
c_list = [100000]
gamma_list = [900]
for c in c_list:
    for gamma in gamma_list:
        # Pick the back end; both are constructed with the same keywords.
        svc_class = thundersvm.SVC if use_thunder else svm.SVC
        model = svc_class(C=c, gamma=gamma)
        print("Regularization param {}, gamma {} :".format(c, gamma))
        model.fit(train_x[:train_num, :], train_y[:train_num])
        train_pred = model.predict(train_x)
        val_pred = model.predict(val_x)
        train_acc = metrics.accuracy_score(train_y, train_pred)
        val_acc = metrics.accuracy_score(val_y, val_pred)
        print("\tTrain acc: {}".format(train_acc))
        print("\tVal acc: {}".format(val_acc))
        train_acc_list.append(train_acc)
        val_acc_list.append(val_acc)
        csv_prefix = data_name + "_" + str(c) + "_" + str(gamma)
        results_to_csv(model.predict(test_data), csv_prefix)


# Train model for cifar10
Use color features, LBP features, and HOG features to train a linear classifier.

In [None]:
import sys
if sys.version_info[0] < 3:
    raise Exception("Python 3 not detected.")
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, metrics
from scipy import io

    
if __name__ == "__main__":
    # Train a linear SVM on hand-crafted CIFAR-10 features and write one
    # submission file per regularization value.
    data_name = "cifar10"
    data = np.load(f"/kaggle/input/hw1-dataset/{data_name}-data.npz")
    print("\n*********************************************************")
    print("\nloaded %s data!" % data_name)
    # Raw images are replaced by extract_features descriptors for both splits.
    test_data = extract_features(data["test_data"])
    train_data = extract_features(data['training_data'])
    train_set, val_set = do_partition(data_name, train_data, data['training_labels'])

    # The label sits in the last column of every partitioned row.
    train_x, train_y = train_set[:, :-1], train_set[:, -1]
    print("Train data:{}, Train label:{}".format(len(train_x), len(train_y)))
    val_x, val_y = val_set[:, :-1], val_set[:, -1]

    print("Training model for {}".format(data_name))
    train_acc_list, val_acc_list = [], []
    # Size of the labelled set before the validation rows were split off.
    train_num = len(data['training_data'])
    print(train_num)
#     c_list = [0.1, 1, 10, 100]
    c_list = [1]
    for c in c_list:
        model = svm.LinearSVC(C=c)
        print("Regularization param {} :".format(c))
        model.fit(train_x, train_y)
        train_pred = model.predict(train_x)
        val_pred = model.predict(val_x)
        train_acc = metrics.accuracy_score(train_y, train_pred)
        val_acc = metrics.accuracy_score(val_y, val_pred)
        print("\tTrain acc: {}".format(train_acc))
        print("\tVal acc: {}".format(val_acc))
        train_acc_list.append(train_acc)
        val_acc_list.append(val_acc)
        csv_prefix = "col_hog_lbp_" + data_name + "_" + str(c)
        results_to_csv(model.predict(test_data), csv_prefix)