In [592]:
import math
import numpy as np
import random

def sigmoid(x):
    return float(1) / (1 + math.exp(-x))        


def logit(w, x, y, lambd):
    return math.log(1 + math.exp(-1*y*np.dot(w, x))) + lambd * math.sqrt(sum([x ** 2 for x in w]))
                                                                         
                                                                         
def logit_loss_partial_deriv(w, samples, j, lambd):
    norm = 1
    tikhonov_deriv = (lambd * w[j] / norm) if norm > 0 else 0
    return np.average([s[-1] * s[j] * (sigmoid(s[-1] * np.dot(w, s[:-1])) - 1) for s in samples]) + tikhonov_deriv


def logit_loss_gradient(w, samples, lambd):
    d = len(samples[0]) - 1
    return [logit_loss_partial_deriv(w, samples, j, lambd) for j in xrange(d)]

def logit_loss_one_var_gradient(w, samples, lambd, j):
    one_sample = [samples[j]]
    return logit_loss_gradient(w, one_sample, lambd)


In [593]:
YES_LABEL = 1
NO_LABEL = -1

def read_data_from_file(filename):
    samples = []
    classes = set()
    with open(filename) as f:
        for line in f:
            line_splitted = line.strip().split(",")
            samples.append(([float(x) for x in line_splitted[:-1]], line_splitted[-1]))
            classes.add(line_splitted[-1])
    return samples, classes


def mark_set(samples, class_label):
    marked = []
    for s in samples:
        ts = [1] + list(s[0]) # make homogenous
        ts.extend([YES_LABEL if s[1] == class_label else NO_LABEL])
        marked.append(ts)
    return marked


In [594]:
def batch_gd(samples, T, nu, lambd):
    d = len(samples[0]) -1
    w = np.zeros(d)
    w_history = [w]
    
    for t in xrange(T):
        vt = logit_loss_gradient(w, samples, lambd)
        w = np.subtract(w, np.dot(nu, vt))
        w_history.append(w)
        
    w_res = np.zeros(d)
    for w_h in w_history:
        w_res = np.add(w_res, w_h)
        
    return w_history[-1]

def stochastic_gd(samples, T, nu, lambd):
    d = len(samples[0]) - 1
    w = np.zeros(d)
    w_history = [w]
    
    for t in xrange(T):
        vt = logit_loss_one_var_gradient(w, samples, lambd, random.randint(0, len(samples) - 1))
        w = np.subtract(w, np.dot(nu, vt))
        w_history.append(w)
        
    # w_res = np.zeros(d)
    # for w_h in w_history:
    #     w_res = np.add(w_res, w_h)
        
    return w_history[-1]


def test_sample(x, y, w):
    prediction = sigmoid(np.dot(w,x))
#     print "x=", x, ",y=", y, ",prediction=", prediction
    if (prediction >= 0.5 and y == YES_LABEL) or (prediction < 0.5 and y == NO_LABEL):
        return 1
    return 0


def test_samples(samples_for_test, w):
    predicted = sum( [test_sample(x[:-1], x[-1], w) for x in samples_for_test])
    return float(predicted) / len(samples_for_test)
    

In [595]:
T_RANGE = xrange(100, 101, 3000)
LAMBDA_RANGE = np.arange(0.1, 1, 2)
NU_RANGE = np.arange(0.1, 1, 2)
K = 10

def k_fold(samples, k, t_range, lambda_range, nu_range):
    block_size = int(math.floor(float(len(samples)) / k))
    
    T_best = 0
    lambda_best = 0
    nu_best = 0
    predicted_best = 0
    T_best_st = 0
    lambda_best_st = 0
    nu_best_st = 0
    predicted_best_st = 0
    
    for T in t_range:
        for l in lambda_range:
            for nu in nu_range:
                 for ind in xrange(0, len(samples), block_size):
                    fold_test = samples[ind:min(ind+block_size, len(samples))]
                    fold_train = [x for x in samples if x not in fold_test]
                    
                    w_fold_trained = batch_gd(fold_train, T, nu, l)
                    predicted = test_samples(fold_test, w_fold_trained)
                    if predicted > predicted_best:
                        T_best = T
                        lambda_best = l
                        nu_best = nu
                        predicted_best = predicted
                       
                    w_fold_trained_st = stochastic_gd(fold_train, T, nu, l)
                    predicted_st = test_samples(fold_test, w_fold_trained)
                    if predicted_st > predicted_best_st:
                        T_best_st = T
                        lambda_best_st = l
                        nu_best_st = nu
                        predicted_best_st = predicted
                    
                        
                    print "T=", T, ",l=", l, ",nu=", nu, ",ind=", ind, ",predicted=", predicted

    
    return T_best, lambda_best, nu_best, T_best_st, lambda_best_st, nu_best_st


In [598]:
data, classes = read_data_from_file("iris.data")

for c in classes:
    marked_data = mark_set(data, c)

    np.random.shuffle(marked_data)
    train_length = int(math.floor(len(marked_data) * 0.9))
    train_data = marked_data[:train_length]
    test_data = marked_data[train_length:]

    T, lambd, nu, T_st, lambd_st, nu_st = k_fold(train_data, K, T_RANGE, LAMBDA_RANGE, NU_RANGE)
    
    print "batch: c=", c, ", T=", T, ", l=", lambd, ", nu=", nu
    
    w_trained = batch_gd(train_data, T, nu, lambd)
    predict = test_samples(test_data, w_trained)
    print "batch: ", w_trained, predict
    
    print "stochastic: c=", c, ", T=", T_st, ", l=", lambd_st, ", nu=", nu_st
    
    w_trained_st = stochastic_gd(train_data, T, nu, lambd)
    predict_st = test_samples(test_data, w_trained_st)
    print "stochastic: ", w_trained_st, predict_st



T= 100 ,l= 0.1 ,nu= 0.1 ,ind= 0 ,predicted= 0.846153846154
T= 100 ,l= 0.1 ,nu= 0.1 ,ind= 13 ,predicted= 1.0
T= 100 ,l= 0.1 ,nu= 0.1 ,ind= 26 ,predicted= 0.769230769231
T= 100 ,l= 0.1 ,nu= 0.1 ,ind= 39 ,predicted= 1.0
T= 100 ,l= 0.1 ,nu= 0.1 ,ind= 52 ,predicted= 1.0
T= 100 ,l= 0.1 ,nu= 0.1 ,ind= 65 ,predicted= 1.0
T= 100 ,l= 0.1 ,nu= 0.1 ,ind= 78 ,predicted= 1.0
T= 100 ,l= 0.1 ,nu= 0.1 ,ind= 91 ,predicted= 0.846153846154
T= 100 ,l= 0.1 ,nu= 0.1 ,ind= 104 ,predicted= 1.0
T= 100 ,l= 0.1 ,nu= 0.1 ,ind= 117 ,predicted= 1.0
T= 100 ,l= 0.1 ,nu= 0.1 ,ind= 130 ,predicted= 1.0
batch: c= Iris-virginica , T= 100 , l= 0.1 , nu= 0.1
batch:  [-0.21062496 -0.46112565 -0.48946341  0.74471786  0.53492473] 0.97037037037
stochastic: c= Iris-virginica , T= 100 , l= 0.1 , nu= 0.1
stochastic:  [-0.22182248 -0.28550741 -0.60407128  1.5137761   1.00224186] 0.659259259259
T= 100 ,l= 0.1 ,nu= 0.1 ,ind= 0 ,predicted= 1.0
T= 100 ,l= 0.1 ,nu= 0.1 ,ind= 13 ,predicted= 1.0
T= 100 ,l= 0.1 ,nu= 0.1 ,ind= 26 ,predicted=

In [597]:
from sklearn.linear_model import SGDClassifier

X = [x[:-1] for x in train_data]
X_test = [x[:-1] for x in test_data]
y = [x[-1] for x in train_data]
y_test = [x[-1] for x in test_data]
clf = SGDClassifier(loss="log", penalty="l2")
clf.fit(X, y)
prediction = clf.predict(X_test)
original = y_test
print prediction
print np.array(y_test)
print clf

predicted = sum([1 if (x[0]*x[1] > 0) else 0 for x in zip(prediction, original)])
print float(predicted) / len(prediction)


[-1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
[ 1  1  1  1 -1  1 -1 -1 -1 -1  1 -1 -1  1 -1]
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
0.6
