# Series 3, Online Convex Programming

In [85]:
import math

import numpy as np
import pandas as pd

from scipy.linalg import norm
from sklearn.base import BaseEstimator, ClassifierMixin

In [87]:
# Seed NumPy's global random generator so that code drawing from np.random
# (e.g. the permutation in permute_data) gives the same result on every run,
# provided the cells are executed top to bottom.
np.random.seed(1337)

In [79]:
# Feature matrices: comma-separated files read without an explicit dtype.
Xtrain, Xtest = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in ('data/Xtrain.csv', 'data/Xtest.csv')
)
# Label vectors: comma-separated files read as 8-bit integers.
Ytrain, Ytest = (
    np.genfromtxt(csv_path, delimiter=',', dtype='int8')
    for csv_path in ('data/Ytrain.csv', 'data/Ytest.csv')
)

In [84]:
def permute_data(x, y):
    """Return `x` and `y` reordered by one shared random permutation.

    A single permutation of ``range(x.shape[0])`` is drawn from NumPy's
    global random generator and applied to the rows of `x` and to the
    entries of `y`, so row ``i`` of the result still pairs with label ``i``.
    """
    n_rows = x.shape[0]
    order = np.random.permutation(n_rows)
    shuffled_x = x[order, :]
    shuffled_y = y[order]
    return shuffled_x, shuffled_y

# Shuffle each split on its own; the training split is permuted first, then
# the test split (the right-hand side is evaluated left to right).
(Xtrain, Ytrain), (Xtest, Ytest) = (
    permute_data(Xtrain, Ytrain),
    permute_data(Xtest, Ytest),
)

In [89]:
from sklearn.utils.estimator_checks import check_estimator

class OnlineSVMClassifier(BaseEstimator, ClassifierMixin):
    """Linear classifier trained by one pass of projected online subgradient descent.

    Samples are visited one at a time.  Whenever a sample violates the margin
    (``label * <w, x> < 1``) the weight vector is moved by ``eta * label * x``
    with ``eta = 1 / sqrt(t + 1)`` and then rescaled so that its Euclidean
    norm does not exceed ``1 / sqrt(lbd)``.

    Labels are expected to be -1 / +1: ``predict`` only ever returns those
    two values.

    Parameters
    ----------
    **params
        Hyper-parameters, forwarded to ``set_params``.  The only one the
        algorithm reads is ``lbd`` (default ``1.0``).

    Attributes
    ----------
    w : ndarray of shape (n_features,) or None
        Weight vector; ``None`` until ``fit`` has been called.
    lbd : float
        Determines the radius ``1 / sqrt(lbd)`` of the ball the weights are
        projected onto.
    """

    def __init__(self, **params):
        self.w = None
        self.lbd = 1.0
        self.set_params(**params)

    def fit(self, X, y):
        """Learn the weight vector from `X` and `y` in a single pass.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self
        """
        # Convert once so that lists / DataFrames are iterated row by row.
        X = np.asarray(X, dtype='float64')
        y = np.asarray(y)
        self.w = np.zeros(X.shape[1], dtype='float64')

        for t, (x, label) in enumerate(zip(X, y)):
            eta = 1.0 / np.sqrt(t + 1)
            margin = label * np.inner(self.w, x)
            if margin < 1:
                # Margin violated: step along label * x, then project back.
                self.w = self.w + eta * label * x
                self.project()

        return self

    def project(self):
        """Rescale ``self.w`` in place so that ``||w|| <= 1 / sqrt(lbd)``."""
        # ||w|| * sqrt(lbd) > 1 is equivalent to ||w|| > 1 / sqrt(lbd) but needs
        # no division, so a zero weight vector or lbd == 0 is handled cleanly.
        scale = np.sqrt(self.lbd) * norm(self.w)
        if scale > 1.0:
            self.w *= 1.0 / scale

    def predict(self, X):
        """Return the predicted label (-1 or +1, as int8) for each row of `X`.

        A score of exactly zero is mapped to -1.  A single 1-D sample is
        accepted and yields a length-1 array.
        """
        scores = np.atleast_1d(np.inner(self.w, X))
        return np.where(scores > 0, 1, -1).astype('int8')

    def get_params(self, deep=True):
        """Return the hyper-parameters as a dict (``deep`` is accepted but unused)."""
        return {"lbd": self.lbd}

    def set_params(self, **parameters):
        """Set each keyword argument as an attribute of the same name; return self."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
    
# check_estimator(OnlineSVMClassifier)

In [90]:
# Classifier with default hyper-parameters; it is the base estimator handed to
# the grid search and the randomized search below.
cls = OnlineSVMClassifier()

In [97]:
# The search classes live in sklearn.model_selection since scikit-learn 0.18;
# sklearn.grid_search was removed in 0.20.  Fall back to the legacy module
# only when the modern one is unavailable.
try:
    from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

# Candidate values for the classifier's `lbd` hyper-parameter.
parameters = {
    'lbd': [0.001, 0.005, 0.0075, 0.01, 0.0125, 0.05, 0.1]
}
gs = GridSearchCV(cls, parameters)
gs_result = gs.fit(Xtrain, Ytrain)

print("Best score: %f" % gs_result.best_score_)
print("Best score params: %s" % gs_result.best_params_)

Best score: 0.948321
Best score params: {'lbd': 0.01}


In [None]:
import scipy.stats as stats

# Draw `lbd` uniformly from [0.001, 0.1] (loc=0.001, scale=0.099).
rs_params = {
    "lbd": stats.uniform(loc=0.001, scale=0.099)
}
rs_n_iter = 100
# n_iter is passed by keyword: current scikit-learn releases accept only the
# estimator and the parameter distributions positionally.
rs = RandomizedSearchCV(cls, rs_params, n_iter=rs_n_iter, n_jobs=2)
rs_result = rs.fit(Xtrain, Ytrain)

print("Best score: %f" % rs_result.best_score_)
print("Best score params: %s" % rs_result.best_params_)