In [1]:
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
from math import log, exp, sqrt

In [2]:
data = pd.read_csv('data/data-logistic.csv', header=None)
data.columns = ['Class', 'x1', 'x2']

In [3]:
data.head()

Unnamed: 0,Class,x1,x2
0,-1,-0.663827,-0.138526
1,1,1.994596,2.468025
2,-1,-1.247395,0.749425
3,1,2.309374,1.899836
4,1,0.849143,2.40775


**Функция ошибки.**

In [4]:
def log_error(df, w1, w2, reg_param):
    margins = df['Class']*(w1*df['x1'] + w2*df['x2'])
    return margins.apply(lambda x: log(1 + exp(-x))).sum() / float(df.shape[0]) +\
            0.5 * reg_param * sqrt(w1 ** 2 + w2 ** 2)

In [5]:
print(log_error(data, 0, 0, 0.1))

0.69314718056


**Ответ алгоритма.**

In [6]:
def a(df, w1, w2):
    return (w1*df['x1'] + w2*df['x2']).apply(lambda x: 1. / (1 + exp(-x)))

**Градиентный спуск.**

In [7]:
def grad_step(df, w1, w2, reg_param, step, regularize=True):
    margins = data['Class']*(w1*data['x1'] + w2*data['x2'])
    margins = margins.apply(lambda x: 1 - 1./(1 + exp(-x)))
    x1_grad = (data['Class'] * data['x1'] * margins).sum() / float(df.shape[0])
    x2_grad = (data['Class'] * data['x2'] * margins).sum() / float(df.shape[0])
    new_w1 = w1 + step * x1_grad
    new_w2 = w2 + step * x2_grad
    if regularize:
        new_w1 -= step * reg_param * w1
        new_w2 -= step * reg_param * w2
    return new_w1, new_w2

In [8]:
def grad_descent(df, error_func, 
                 init_point=(0,0), step=0.1, regularize=True,
                max_iter=10000, max_error=1e-5, reg_param=0.1):
    w1, w2 = init_point
    for i in range(max_iter):
        w1, w2 = grad_step(df, w1, w2, reg_param, step,
                          regularize=regularize)
        new_error = error_func(df, w1, w2, reg_param)
        if new_error <= max_error:
            return w1, w2, new_error
    return w1, w2, new_error

**Запуск без регуляризации.**

In [10]:
w1, w2, err = grad_descent(data, log_error, step=0.1, regularize=False,
                           init_point=(0, 0), max_iter=10000)

In [11]:
w1, w2, err

(0.2881081945770693, 0.09170910047596165, 0.6536875325355475)

In [12]:
roc_auc_score(data['Class'], a(data[['x1', 'x2']], w1, w2))

0.92676190476190468

**Запуск с регуляризацией.**

In [13]:
w1, w2, err = grad_descent(data, log_error, step=0.1,
                           init_point=(0, 0), 
                           regularize=True, max_iter=10000)

In [14]:
w1, w2, err

(0.24122089712619665, 0.1051394906743666, 0.6522166471591115)

In [15]:
roc_auc_score(data['Class'], a(data[['x1', 'x2']], w1, w2))

0.93104761904761901