In [1]:
from svmutil import *
import csv
import math
import random
import subprocess
import os
import numpy as np

In [2]:
def read_from_file(path):
    """Load a comma-separated data file into ``[features, label]`` pairs.

    Every column except the last is treated as a numeric feature: it is
    stripped of surrounding whitespace and converted to ``float``.  The last
    column is the label and is returned exactly as read (a string, neither
    stripped nor converted).

    Blank lines are skipped and an empty file yields an empty list.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A list with one ``[features, label]`` entry per non-blank row, where
        ``features`` is a list of floats and ``label`` is a string.

    Raises:
        ValueError: If a feature cell cannot be parsed as a float.
    """
    data = []
    with open(path) as f:
        for row in csv.reader(f, delimiter=","):
            if not row:
                # csv.reader yields [] for blank lines; they carry no sample.
                continue
            # Convert each row in one pass so rows are handled independently
            # of the width of the first row.
            features = [float(cell.strip()) for cell in row[:-1]]
            data.append([features, row[-1]])
    return data


def write_data_for_scale(file_name, data):
    """Write samples to ``file_name`` as ``label index:value ...`` lines.

    One line is written per entry of ``data``.  The line starts with the
    entry's last element (the label), followed by one ``index:value`` field
    for each item of the entry's first element (the feature list).  Feature
    indices start at 1 and fields are separated by single spaces.

    Args:
        file_name: Path of the file to create or overwrite.
        data: Iterable of rows whose first element is a sequence of feature
            values and whose last element is the label.
    """
    # The context manager guarantees the handle is closed even if formatting
    # or writing a row raises.
    with open(file_name, 'w') as f:
        for row in data:
            fields = [str(row[-1])]
            fields.extend(
                str(index) + ':' + str(value)
                for index, value in enumerate(row[0], 1)
            )
            f.write(' '.join(fields) + '\n')


def get_standard_deviation(y_test, p_labels):
    """Return the standard deviation of the element-wise prediction errors.

    The error for position ``i`` is ``y_test[i] - p_labels[i]``; the result is
    ``np.std`` of those errors.

    Args:
        y_test: Sequence of reference values.
        p_labels: Sequence of predicted values, same length as ``y_test``.

    Returns:
        The standard deviation of the errors as computed by ``np.std``.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    if len(y_test) != len(p_labels):
        # Pairing sequences of different lengths would either fail midway or
        # silently ignore trailing predictions.
        raise ValueError(
            "y_test and p_labels must have the same length (got %d and %d)"
            % (len(y_test), len(p_labels)))
    errors = [actual - predicted for actual, predicted in zip(y_test, p_labels)]
    return np.std(errors)


def kfold_cross_validation(params, features, k, seed=None):
    """Run k-fold cross-validation and summarise the per-fold error values.

    ``features`` is split into ``k`` folds by ``partition``.  Each fold in
    turn is held out as the test set while a model is trained on the other
    folds with ``svm_train`` and evaluated with ``svm_predict``.

    Args:
        params: Training parameters handed to ``svm_train`` unchanged.
        features: Sequence of samples; element 0 of each sample is used as
            the label and element 1 as the feature vector.
        k: Number of folds.
        seed: Forwarded to ``partition``.

    Returns:
        ``(mean, std)``: ``np.mean`` and ``np.std`` of the values collected
        from index 1 of the second result of ``svm_predict``, one per fold.
    """
    folds = partition(features, k, seed)
    fold_errors = []

    for held_out in range(k):
        # Every sample outside the held-out fold, flattened into one list.
        train_rows = [
            sample
            for fold_index in range(k)
            if fold_index != held_out
            for sample in folds[fold_index]
        ]
        test_rows = folds[held_out]

        # Train on the k-1 remaining folds.
        train_labels = [sample[0] for sample in train_rows]
        train_vectors = [sample[1] for sample in train_rows]
        model = svm_train(svm_problem(train_labels, train_vectors), params)

        # Evaluate on the held-out fold.
        test_labels = [sample[0] for sample in test_rows]
        test_vectors = [sample[1] for sample in test_rows]
        _, evaluation, _ = svm_predict(test_labels, test_vectors, model)
        # NOTE(review): index 1 is presumably libsvm's mean squared error
        # (index 0 being accuracy) - confirm this is the intended metric.
        fold_errors.append(evaluation[1])

    return np.mean(fold_errors), np.std(fold_errors)


def partition(dataSet, k, seed=None):
    """Randomly split ``dataSet`` into ``k`` partitions of bounded size.

    Each entry is placed in a randomly drawn partition; a partition that
    already holds ``ceil(len(dataSet) / k)`` entries is rejected and another
    index is drawn.

    Args:
        dataSet: Sequence of entries to distribute.
        k: Number of partitions to create (must be at least 1).
        seed: Optional seed.  When given, a private ``random.Random(seed)``
            generator is used so the same seed and data always produce the
            same split, without touching the global ``random`` state.  When
            ``None``, the module-level ``random`` generator is used.

    Returns:
        A list of ``k`` lists that together contain every entry of
        ``dataSet`` exactly once.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))

    # One generator for the whole split: re-creating it per entry would
    # restart the random sequence on every draw.
    rng = random.Random(seed) if seed is not None else random
    size = math.ceil(len(dataSet) / float(k))
    partitions = [[] for _ in range(k)]

    for entry in dataSet:
        # Rejection sampling; k * size >= len(dataSet), so a partition with
        # free capacity always exists and the loop terminates.
        x = rng.randint(0, k - 1)
        while len(partitions[x]) >= size:
            x = rng.randint(0, k - 1)
        partitions[x].append(entry)

    return partitions


def assign(partitions, k, size, seed=None):
    """Pick at random the index of a partition that still has free capacity.

    Indices are drawn uniformly from ``0 .. k-1`` and redrawn while the
    chosen partition already holds ``size`` or more entries.

    Args:
        partitions: Sequence of at least ``k`` lists being filled.
        k: Number of partitions to choose among.
        size: Maximum number of entries allowed per partition.
        seed: Optional seed.  When ``None`` the module-level ``random``
            generator is used.  Otherwise the draw is made reproducible: a
            private generator is derived from the seed and from the number
            of entries already placed, so identical inputs always give the
            same index while successive calls during a fill still vary.

    Returns:
        The index of a partition whose length is below ``size``.

    Raises:
        ValueError: If every one of the ``k`` partitions is already full
            (the redraw loop could otherwise never finish).
    """
    if all(len(partitions[i]) >= size for i in range(k)):
        raise ValueError("all %d partitions already hold %r entries" % (k, size))

    if seed is None:
        rng = random
    else:
        # Building random.Random(seed) and discarding it seeds nothing; the
        # generator has to be the one actually used for the draw.  Mixing in
        # the fill count keeps repeated calls from restarting the same
        # sequence on every entry.
        placed = sum(len(partitions[i]) for i in range(k))
        rng = random.Random("{0!r}:{1}".format(seed, placed))

    x = rng.randint(0, k - 1)
    while len(partitions[x]) >= size:
        x = rng.randint(0, k - 1)
    return x


def scale_data(file_path, train_size=3450, scale_exe="svm-scale.exe"):
    """Split a data file into train/test sets and scale both with svm-scale.

    The samples read from ``file_path`` are split at ``train_size``: the
    first ``train_size`` rows are written to the file ``train`` and the rest
    to ``test``.  The scaling tool is then run twice:

    * on ``train`` with ``-l 0 -u 1 -s scale_params``, output captured in
      ``train.scaled``;
    * on ``test`` with ``-l 0 -u 1 -r scale_params``, output captured in
      ``test.scaled``.

    The tool's standard error is discarded.

    Args:
        file_path: CSV file understood by ``read_from_file``.
        train_size: Number of leading rows that form the training set.
        scale_exe: Name or path of the svm-scale executable.

    Raises:
        subprocess.CalledProcessError: If the scaling tool exits with a
            non-zero status, instead of silently leaving an incomplete
            ``.scaled`` file behind.
        OSError: If the executable cannot be started.
    """
    data = read_from_file(file_path)
    train_file_name = "train"
    test_file_name = "test"
    write_data_for_scale(train_file_name, data[:train_size])
    write_data_for_scale(test_file_name, data[train_size:])

    # Argument lists (rather than one command string) work with shell=False
    # on every platform and survive file names containing spaces.
    common_args = [scale_exe, "-l", "0", "-u", "1"]
    with open(os.devnull, 'w') as fnull:
        # First pass: derive the scaling parameters from the training data
        # and save them to "scale_params".
        with open(train_file_name + '.scaled', 'w') as train_file:
            subprocess.check_call(
                common_args + ["-s", "scale_params", train_file_name],
                stdout=train_file, stderr=fnull, shell=False)
        # Second pass: restore those parameters for the test data.
        with open(test_file_name + ".scaled", 'w') as test_file:
            subprocess.check_call(
                common_args + ["-r", "scale_params", test_file_name],
                stdout=test_file, stderr=fnull, shell=False)

In [None]:
# Scale the raw samples, then sweep the SVM cost parameter C over powers of
# two and record the cross-validated error for each value.
scale_data("sample.txt")
y_train, x_train = svm_read_problem("train.scaled")
y_test, x_test = svm_read_problem("test.scaled")
# kfold_cross_validation expects [label, feature-vector] pairs.
train_data = [[y_train[i], x_train[i]] for i in range(len(x_train))]
k_fold_param = 10
d = 1  # polynomial kernel degree passed via -d
# Each list holds [value, C] pairs: mean + std, mean, and mean - std.
top = []
middle = []
bottom = []
# range (not the Python-2-only xrange) so the cell also runs on Python 3.
for k in range(-20, 21):
    cost = 2**k
    svm_params = svm_parameter('-s 0 -t 1 -d ' + str(d) + ' -c ' + str(cost))
    error, std = kfold_cross_validation(svm_params, train_data, k_fold_param)
    top.append([error + std, cost])
    middle.append([error, cost])
    bottom.append([error - std, cost])