In [174]:
# сравнение методов SGD и Batch GD:
# SGD позволяет проводить online изменение значений параметров, т.е. на каждой итерации, после вычисления одного из параметров,
# он сразу участвует в вычислении других. Также SGD на каждой итерации требует только одного значения из выборки, что делает его
# гораздо более быстрым по сравнению с Batch GD
# минусом SGD является то, что выбирается случайный вектор из выборки и таким образом это может увеличить количество итераций,
# необходимых для того, чтобы метод сошелся.
# Batch GD имеет большую вычислительную сложность, т.к. на каждой итерации ему необходимо просчитывать значения отступов 
# всей выборки. Также, все значения параметров в Batch вычисляются независимо друг от друга.

In [8]:
import numpy as np
import csv
import random
import itertools
import operator

In [11]:
def read_from_file(path):
    data = []
    with open(path) as f:
        reader = csv.reader(f, delimiter=",")
        for row in reader:
            data.append([row[:-1], row[-1]])
    for column in range(len(data[0][0])):
        for row in data:
            row[0][column] = float(row[0][column].strip())
    return data


def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0][0])):
        col_values = [row[0][i] for row in dataset]
        value_min = min(col_values)
        value_max = max(col_values)
        minmax.append([value_min, value_max])
    return minmax


def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row[0])):
            row[0][i] = (row[0][i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])
            
            
def generate_folds(data, k, models):
    classes = []
    for model in models:
        classes.append([elem for elem in data if elem[-1] == model])
    folds = []
    for i in xrange(k):
        fold = []
        for cl in classes:
            for i in xrange(len(cl) / k):
                elem = random.choice(cl)
                fold.append(elem)
        random.shuffle(fold)
        folds.append(fold)
    return folds


In [35]:
file_path = "irisdata.csv"
data = read_from_file(file_path)
minmax = dataset_minmax(data)
normalize_dataset(data, minmax)
k = 5
models = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
folds = generate_folds(data, k, models)
print folds
#нормированные данные разбитые на foldы

[[[[0.4999999999999999, 0.3333333333333332, 0.5084745762711864, 0.5], 'Iris-versicolor'], [[0.5833333333333334, 0.5, 0.5932203389830508, 0.5833333333333334], 'Iris-versicolor'], [[0.19444444444444448, 0.5833333333333333, 0.0847457627118644, 0.04166666666666667], 'Iris-setosa'], [[0.1666666666666668, 0.4583333333333333, 0.0847457627118644, 0.04166666666666667], 'Iris-setosa'], [[0.6666666666666666, 0.20833333333333331, 0.8135593220338982, 0.7083333333333334], 'Iris-virginica'], [[0.44444444444444453, 0.41666666666666663, 0.6949152542372881, 0.7083333333333334], 'Iris-virginica'], [[0.361111111111111, 0.3333333333333332, 0.6610169491525424, 0.7916666666666666], 'Iris-virginica'], [[0.5555555555555555, 0.20833333333333331, 0.6610169491525424, 0.5833333333333334], 'Iris-versicolor'], [[0.19444444444444448, 0.5, 0.033898305084745756, 0.04166666666666667], 'Iris-setosa'], [[0.38888888888888895, 0.7499999999999998, 0.11864406779661016, 0.08333333333333333], 'Iris-setosa'], [[0.666666666666666

In [47]:
def generate_actual(folds, model):
    new_folds = []
    for fold in folds:
        new_fold = []
        for row in fold:
            new_row = list(row)
            new_row[-1] = 1 if row[-1] == model else 0
            new_fold.append(new_row)
        new_folds.append(new_fold)
    return new_folds


def get_actual_data(data, model):
    new_data = []
    for row in data:
        new_row = list(row)
        new_row[-1] = 1 if row[-1] == model else 0
        new_data.append(new_row)
    return new_data


def get_training_and_test_set(data):
    np.random.shuffle(data)
    count_parts = 10
    part_len = len(data) / count_parts
    training_set = data[:9 * part_len]
    test_set = data[-part_len:]
    return training_set, test_set


def get_cross_validation_split(data_set, count_folds):
    training_set_split = []
    training_set_copy = list(data_set)
    fold_size = int(len(data_set) / count_folds)
    for i in xrange(count_folds):
        fold = []
        while len(fold) < fold_size:
            index = random.randrange(len(training_set_copy))
            fold.append(training_set_copy.pop(index))
        training_set_split.append(fold)
    return training_set_split


def get_accuracy(actual, predicted):
    correct = 0
    for i in xrange(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100


def get_best_params(folds, model, algorithm, l_rates, reg_coefs, iter_counts, is_sgd):
    model_folds = generate_actual(folds, model)
    params = itertools.product(l_rates, reg_coefs, iter_counts)
    best_scores = []
    for params_set in params:
        scores = []
        for fold in model_folds:
            training_set = list(model_folds)
            training_set.remove(fold)
            training_set = sum(training_set, [])
            predicted = algorithm(training_set, fold, params_set[0], params_set[1], params_set[2], is_sgd)
            actual = [row[-1] for row in fold]
            accuracy = get_accuracy(actual, predicted)
            scores.append([accuracy, params_set])
        max_score = max(scores, key=operator.itemgetter(0))
        best_scores += [score for score in scores if score[0] == max_score[0]]
    max_best = max(best_scores, key=operator.itemgetter(0))
    return [score for score in best_scores if score[0] == max_best[0]]


def predict(row, coefficients):
    yhat = coefficients[0]
    for i in xrange(len(row[0])):
        yhat += coefficients[i + 1] * row[0][i]
    if yhat > 300:
        return 0
    if yhat < -300:
        return 1
    return 1.0 / (1.0 + np.exp(-yhat))


def get_batch_coefficients(training_set, l_rate, reg_coef, iteration_count):
    batch_coefs = [0.0 for i in xrange(len(training_set[0][0]) + 1)]
    m = len(training_set)
    for iter in xrange(iteration_count):
        error = [predict(row, batch_coefs) - row[-1] for row in training_set]
        batch_coefs[0] = batch_coefs[0] - l_rate * (
            sum(error) / m + 2 * reg_coef * batch_coefs[0])
        for i in xrange(len(training_set[0][0])):
            batch_coefs[i + 1] = batch_coefs[i + 1] - l_rate * (
                sum(error[i] * row[0][i] for row in training_set) / m + 2 * reg_coef * batch_coefs[i + 1])
    return batch_coefs


def get_sgd_coefficients(training_set, l_rate, reg_coef, iteration_count):
    sgd_coefs = [0.0 for i in xrange(len(training_set[0][0]) + 1)]
    for iter in xrange(iteration_count):
        z = random.choice(training_set)
        sgd_coefs[0] = sgd_coefs[0] - l_rate * (
            predict(z, sgd_coefs) - z[-1] + 2 * reg_coef * sgd_coefs[0])
        for i in xrange(len(z[0])):
            sgd_coefs[i + 1] = sgd_coefs[i + 1] - l_rate * (
                (predict(z, sgd_coefs) - z[-1]) * z[0][i] + 2 * reg_coef *
                sgd_coefs[i + 1])
    return sgd_coefs


def logistic_regression(training_set, test_set, l_rate, reg_coef, count_iterations, is_sgd):
    predictions = []
    if (is_sgd):
        coefs = get_sgd_coefficients(training_set, l_rate, reg_coef, count_iterations)
    else:
        coefs = get_batch_coefficients(training_set, l_rate, reg_coef, count_iterations)
    for row in test_set:
        yhat = predict(row, coefs)
        yhat = round(yhat)
        predictions.append(yhat)
    return predictions

In [87]:
l_rates = [0.001 * 10 ** i for i in xrange(4)]
l_rates += [0.003 * 10 ** i for i in xrange(3)]
reg_coefs = [0.1 * (i + 1) for i in xrange(10)]
iter_counts = [100 * (i + 1) for i in xrange(10)]

In [None]:
#SGD
is_sgd = True

In [43]:
# первая модель
best = get_best_params(folds, models[0], logistic_regression, l_rates, reg_coefs, iter_counts, is_sgd)
print best
# список параметров, для которых k-fold отработал наилучшим образом

[[100.0, (0.1, 0.1, 100)], [100.0, (0.1, 0.1, 200)], [100.0, (0.1, 0.1, 300)], [100.0, (0.1, 0.1, 300)], [100.0, (0.1, 0.1, 400)], [100.0, (0.1, 0.1, 400)], [100.0, (0.1, 0.1, 600)], [100.0, (0.1, 0.1, 800)], [100.0, (0.1, 0.1, 800)], [100.0, (0.1, 0.1, 900)], [100.0, (0.1, 0.1, 1000)], [100.0, (0.1, 0.1, 1000)], [100.0, (0.1, 0.2, 800)], [100.0, (0.1, 0.2, 900)], [100.0, (0.1, 0.2, 900)], [100.0, (0.1, 0.30000000000000004, 200)], [100.0, (0.1, 0.30000000000000004, 600)], [100.0, (0.1, 0.30000000000000004, 700)], [100.0, (0.1, 0.30000000000000004, 800)], [100.0, (0.1, 0.4, 100)], [100.0, (0.1, 0.4, 200)], [100.0, (0.1, 0.4, 600)], [100.0, (0.1, 0.4, 700)], [100.0, (0.1, 0.4, 800)], [100.0, (0.1, 0.4, 900)], [100.0, (0.1, 0.4, 1000)], [100.0, (0.1, 0.5, 100)], [100.0, (0.1, 0.5, 300)], [100.0, (0.1, 0.5, 600)], [100.0, (0.1, 0.5, 700)], [100.0, (0.1, 0.5, 700)], [100.0, (0.1, 0.5, 800)], [100.0, (0.1, 0.6000000000000001, 600)], [100.0, (0.1, 0.6000000000000001, 600)], [100.0, (0.1, 0.70

In [50]:
def get_training_and_test_set(data):
    np.random.shuffle(data)
    count_parts = 10
    part_len = len(data) / count_parts
    training_set = data[:9 * part_len]
    test_set = data[-part_len:]
    return training_set, test_set

In [55]:
# выберем параметры модели 0.1 0.1 300, т.к. они являются оптимальными сразу для нескольких foldов.
actual_data = get_actual_data(data, models[0])
train, test = get_training_and_test_set(actual_data)
actual = [row[-1] for row in test]
predictions = logistic_regression(train, test, 0.1, 0.1, 300, is_sgd)
print get_accuracy(actual, predictions)

100.0


In [56]:
# получение оптимальных параметров для второй модели
best = get_best_params(folds, models[1], logistic_regression, l_rates, reg_coefs, iter_counts, is_sgd)
print best

[[90.0, (0.03, 0.8, 900)]]


In [59]:
# тестирование второй модели
actual_data = get_actual_data(data, models[1])
train, test = get_training_and_test_set(actual_data)
actual = [row[-1] for row in test]
predictions = logistic_regression(train, test, 0.03, 0.8, 900, is_sgd)
print get_accuracy(actual, predictions)

86.6666666667


In [60]:
# получение оптимальных параметров для третьей модели
best = get_best_params(folds, models[2], logistic_regression, l_rates, reg_coefs, iter_counts, is_sgd)
print best

[[100.0, (0.01, 0.1, 100)], [100.0, (0.01, 0.2, 500)], [100.0, (0.01, 0.2, 1000)], [100.0, (0.01, 0.4, 600)], [100.0, (0.01, 0.5, 300)], [100.0, (0.01, 0.5, 500)], [100.0, (0.01, 0.7000000000000001, 200)], [100.0, (0.01, 0.8, 600)], [100.0, (0.01, 1.0, 800)], [100.0, (0.01, 1.0, 900)], [100.0, (0.01, 1.0, 1000)], [100.0, (0.1, 0.1, 300)], [100.0, (0.1, 0.1, 500)], [100.0, (0.1, 0.2, 100)], [100.0, (0.1, 0.2, 300)], [100.0, (0.1, 0.2, 1000)], [100.0, (0.1, 0.4, 100)], [100.0, (0.1, 0.4, 1000)], [100.0, (0.1, 0.6000000000000001, 500)], [100.0, (0.1, 1.0, 600)], [100.0, (0.1, 1.0, 600)], [100.0, (1.0, 0.9, 300)], [100.0, (1.0, 1.0, 200)], [100.0, (0.003, 0.8, 100)], [100.0, (0.003, 0.8, 700)], [100.0, (0.003, 1.0, 900)], [100.0, (0.03, 0.1, 500)], [100.0, (0.03, 0.1, 600)], [100.0, (0.03, 0.1, 700)], [100.0, (0.03, 0.1, 1000)], [100.0, (0.03, 0.2, 500)], [100.0, (0.03, 0.30000000000000004, 800)], [100.0, (0.3, 0.1, 200)], [100.0, (0.3, 0.1, 500)], [100.0, (0.3, 0.5, 300)]]


In [85]:
# тестирование третьей модели
actual_data = get_actual_data(data, models[2])
train, test = get_training_and_test_set(actual_data)
actual = [row[-1] for row in test]
predictions = logistic_regression(train, test, 0.03, 0.1, 600, is_sgd)
print get_accuracy(actual, predictions)

93.3333333333


In [122]:
l_rates = [0.001 * 10 ** i for i in xrange(4)]
l_rates += [0.003 * 10 ** i for i in xrange(3)]
reg_coefs = [0.1 * (i + 1) for i in xrange(10)]
iter_counts = [10 * (i + 1) for i in xrange(20)]

In [123]:
# Batch gradient descending
is_sgd=False

In [124]:
# получение оптимальных параметров для первой модели
best = get_best_params(folds, models[0], logistic_regression, l_rates, reg_coefs, iter_counts, is_sgd)
print best

[[73.33333333333333, (0.1, 0.1, 50)], [73.33333333333333, (0.1, 0.1, 60)], [73.33333333333333, (0.1, 0.1, 70)], [73.33333333333333, (0.1, 0.1, 80)], [73.33333333333333, (0.1, 0.1, 90)], [73.33333333333333, (0.1, 0.1, 100)], [73.33333333333333, (0.1, 0.1, 110)], [73.33333333333333, (0.1, 0.1, 120)], [73.33333333333333, (0.1, 0.1, 130)], [73.33333333333333, (0.1, 0.1, 140)], [73.33333333333333, (0.1, 0.1, 150)], [73.33333333333333, (0.1, 0.1, 160)], [73.33333333333333, (0.1, 0.1, 170)], [73.33333333333333, (0.1, 0.1, 180)], [73.33333333333333, (0.1, 0.2, 110)], [73.33333333333333, (0.1, 0.2, 120)], [73.33333333333333, (0.1, 0.2, 130)], [73.33333333333333, (0.1, 0.2, 140)], [73.33333333333333, (0.1, 0.2, 150)], [73.33333333333333, (0.1, 0.2, 160)], [73.33333333333333, (0.1, 0.2, 170)], [73.33333333333333, (0.1, 0.2, 180)], [73.33333333333333, (0.1, 0.2, 190)], [73.33333333333333, (0.1, 0.2, 200)], [73.33333333333333, (1.0, 0.1, 10)], [73.33333333333333, (1.0, 0.2, 10)], [73.33333333333333

In [136]:
# тестирование первой модели
actual_data = get_actual_data(data, models[0])
train, test = get_training_and_test_set(actual_data)
actual = [row[-1] for row in test]
predictions = logistic_regression(train, test, 0.1, 0.1, 100, is_sgd)
print get_accuracy(actual, predictions)

80.0


In [137]:
# получение оптимальных параметров для второй модели
best = get_best_params(folds, models[1], logistic_regression, l_rates, reg_coefs, iter_counts, is_sgd)
print best

[[66.66666666666666, (0.001, 0.1, 10)], [66.66666666666666, (0.001, 0.1, 20)], [66.66666666666666, (0.001, 0.1, 30)], [66.66666666666666, (0.001, 0.1, 40)], [66.66666666666666, (0.001, 0.1, 50)], [66.66666666666666, (0.001, 0.1, 60)], [66.66666666666666, (0.001, 0.1, 70)], [66.66666666666666, (0.001, 0.1, 80)], [66.66666666666666, (0.001, 0.1, 90)], [66.66666666666666, (0.001, 0.1, 100)], [66.66666666666666, (0.001, 0.1, 110)], [66.66666666666666, (0.001, 0.1, 120)], [66.66666666666666, (0.001, 0.1, 130)], [66.66666666666666, (0.001, 0.1, 140)], [66.66666666666666, (0.001, 0.1, 150)], [66.66666666666666, (0.001, 0.1, 160)], [66.66666666666666, (0.001, 0.1, 170)], [66.66666666666666, (0.001, 0.1, 180)], [66.66666666666666, (0.001, 0.1, 190)], [66.66666666666666, (0.001, 0.1, 200)], [66.66666666666666, (0.001, 0.2, 10)], [66.66666666666666, (0.001, 0.2, 20)], [66.66666666666666, (0.001, 0.2, 30)], [66.66666666666666, (0.001, 0.2, 40)], [66.66666666666666, (0.001, 0.2, 50)], [66.666666666

In [138]:
# тестирование второй модели
actual_data = get_actual_data(data, models[1])
train, test = get_training_and_test_set(actual_data)
actual = [row[-1] for row in test]
predictions = logistic_regression(train, test, 1, 0.7, 400, is_sgd)
print get_accuracy(actual, predictions)

73.3333333333


In [139]:
# получение оптимальных параметров для третьей модели
best = get_best_params(folds, models[2], logistic_regression, l_rates, reg_coefs, iter_counts, is_sgd)
print best

[[80.0, (0.01, 0.1, 130)], [80.0, (0.01, 0.1, 140)], [80.0, (0.01, 0.1, 150)], [80.0, (0.01, 0.1, 160)], [80.0, (0.01, 0.1, 170)], [80.0, (0.01, 0.1, 180)], [80.0, (0.01, 0.1, 190)], [80.0, (0.01, 0.1, 200)], [80.0, (0.01, 0.2, 140)], [80.0, (0.01, 0.2, 150)], [80.0, (0.01, 0.2, 160)], [80.0, (0.01, 0.2, 170)], [80.0, (0.01, 0.2, 180)], [80.0, (0.01, 0.2, 190)], [80.0, (0.01, 0.2, 200)], [80.0, (0.01, 0.30000000000000004, 150)], [80.0, (0.01, 0.30000000000000004, 160)], [80.0, (0.01, 0.30000000000000004, 170)], [80.0, (0.01, 0.30000000000000004, 180)], [80.0, (0.01, 0.30000000000000004, 190)], [80.0, (0.01, 0.30000000000000004, 200)], [80.0, (0.01, 0.4, 160)], [80.0, (0.01, 0.4, 170)], [80.0, (0.01, 0.4, 180)], [80.0, (0.01, 0.4, 190)], [80.0, (0.01, 0.4, 200)], [80.0, (0.01, 0.5, 180)], [80.0, (0.01, 0.5, 190)], [80.0, (0.01, 0.5, 200)], [80.0, (0.1, 0.1, 20)], [80.0, (0.1, 0.2, 20)], [80.0, (0.1, 0.30000000000000004, 20)], [80.0, (0.1, 0.30000000000000004, 30)], [80.0, (0.1, 0.4, 20)

In [168]:
# тестирование второй модели
actual_data = get_actual_data(data, models[1])
train, test = get_training_and_test_set(actual_data)
actual = [row[-1] for row in test]
predictions = logistic_regression(train, test, 0.01, 0.1, 100, is_sgd)
print get_accuracy(actual, predictions)

86.6666666667
