In [1]:
import os
import numpy as np

In [57]:
import urllib
from sklearn.datasets.mldata import fetch_mldata
import tempfile

class Dataset():
    """Loaders for small classification data sets used in this notebook.

    Every ``load_*`` method returns a pair ``(X, y)`` and prints a short
    summary of it through ``_print``.
    """

    def __init__(self):
        pass

    def _print(self, X, y):
        """Print the shapes of ``X`` and ``y``, the distinct labels and the
        number of samples labelled -1 and 1."""
        print("shape X", X.shape)
        print("shape y", y.shape)
        unique = np.unique(y)
        print("label", unique)
        print("-1:", (y == -1).sum())
        print("1:", (y == 1).sum())

    def load_iris(self):
        """Fetch the 'iris' data set through ``fetch_mldata``.

        Labels 2 and 3 are both rewritten to -1 so that the problem becomes
        binary (presumably the remaining label is 1 -- confirm against the
        fetched data).

        Returns:
            (X, y): ``iris.data`` and the relabelled ``iris.target``.
        """
        # NOTE(review): a new temporary directory is created on every call
        # and never removed, so downloads are not cached between calls.
        test_data_home = tempfile.mkdtemp()
        iris = fetch_mldata('iris', data_home=test_data_home)
        X = iris.data
        y = iris.target
        y[y == 2] = -1  # collapse to two labels, -1 or 1
        y[y == 3] = -1
        self._print(X, y)
        return X, y

    def load_leuk(self):
        """Fetch the 'leukemia' data set through ``fetch_mldata``.

        Returns:
            (X, y): ``leuk.data`` and ``leuk.target``, unmodified.
        """
        # NOTE(review): same uncached temporary directory as in load_iris.
        test_data_home = tempfile.mkdtemp()
        leuk = fetch_mldata('leukemia', data_home=test_data_home, transpose_data=True)
        X = leuk.data
        y = leuk.target
        self._print(X, y)
        return X, y

    def load_diab(self, url="http://goo.gl/j0Rvxq"):
        """Load the Pima Indians Diabetes data set (UCI Machine Learning Rep).

        The resource at ``url`` is parsed as comma-separated numbers.
        Columns 0 to 7 are the features and column 8 is the label; label 0
        is rewritten to -1.

        Args:
            url: location of the CSV file. Any scheme accepted by
                ``urlopen`` works. NOTE(review): the default is a shortened
                link; pass an explicit URL if it no longer resolves.

        Returns:
            (X, y): feature matrix with 8 columns and the label vector.
        """
        from contextlib import closing
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2

        # closing() releases the handle even if parsing fails.
        with closing(urlopen(url)) as raw_data:
            # ndmin=2 keeps a single-row file two-dimensional.
            dataset = np.loadtxt(raw_data, delimiter=",", ndmin=2)
        # The label sits in column 8, so the features are columns 0..7;
        # the slice end is exclusive and therefore has to be 8.
        X = dataset[:, 0:8]
        y = dataset[:, 8]
        y[y == 0] = -1
        self._print(X, y)
        return X, y

In [59]:
# Instantiate the loader and fetch the leukemia data; the globals X and y
# bound here are the inputs of the training cells further down.
dataset = Dataset()
X, y = dataset.load_leuk()

('shape X', (72, 7129))
('shape y', (72,))
('label', array([-1,  1]))
('-1:', 25)
('1:', 47)


In [56]:
class Linear():
    """Linear classifier for labels in {-1, 1} trained by stochastic
    gradient descent on the squared error with an L1 penalty.

    Args:
        iter: number of passes over the training set.
        eps: step size of each update.
        reg: regularisation coefficient; the data term is weighted by
            ``1 - reg`` and the L1 term by ``reg``.
        debug: when true, ``fit`` prints progress every 50 passes.
    """

    def __init__(self, iter=100, eps=1e-2, reg=0, debug=False):
        self.iter = iter
        self.eps = eps
        self.reg = reg  # regularisation coefficient
        self.debug = debug

    def fit(self, X, y, Xtest=None, ytest=None):
        """Learn ``self.w`` from ``X`` (samples x features) and ``y``.

        Args:
            X: training samples, indexed as ``X[idx]`` per sample.
            y: training targets, one per row of ``X``.
            Xtest, ytest: optional held-out data. They are only used to
                report test accuracy when ``debug`` is on and may be
                omitted.
        """
        l = X.shape[0]
        n = X.shape[1]
        self.w = np.random.rand(n)
        for i in range(self.iter):
            shuffle = np.random.permutation(l)
            for idx in shuffle:
                # Derivative of (y - w.x)^2 with respect to the prediction.
                err = -2 * (y[idx] - self.w.dot(X[idx]))
                grad = (1 - self.reg) * err * X[idx] + self.grad_L1()
                w_old = np.copy(self.w)
                self.w = self.w - self.eps * grad
                # A weight whose sign flipped during this step is clipped
                # to exactly zero.
                tmp = self.w * w_old
                self.w[tmp < 0] = 0
            if self.debug and i % 50 == 0:
                # Single-argument print calls give the same text on
                # Python 2 and Python 3.
                print("#iter %s" % (i,))
                print("err_global %s" % (self.err_global(X, y),))
                print("accuracy train %s" % (self.accuracy(X, y),))
                if Xtest is not None and ytest is not None:
                    print("accuracy test %s" % (self.accuracy(Xtest, ytest),))

    def err_global(self, X, y):
        """Return the mean squared error of ``X.dot(w)`` against ``y``."""
        return ((y - X.dot(self.w)) ** 2).sum() / X.shape[0]

    def accuracy(self, X, y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``."""
        pred = self.predict(X)
        return np.sum(y == pred)*1. / X.shape[0]

    def predict(self, X):
        """Return 1 where ``X.dot(w)`` is strictly positive and -1 elsewhere."""
        return np.where(X.dot(self.w) > 0, 1, -1)

    def L0(self):
        """Return the number of non-zero weights."""
        return int(np.count_nonzero(self.w))

    def L1(self):
        """Return the sum of the absolute values of the weights."""
        return np.absolute(self.w).sum()

    def grad_L1(self):
        """Return ``reg * sign(w)``, the L1 term added to the gradient."""
        return self.reg * np.sign(self.w)

    def L2(self):
        """Return the sum of the squared weights."""
        return (self.w ** 2).sum()

    def grad_L2(self):
        """Return ``2 * reg * w``; not used by ``fit``."""
        return 2 * self.reg * self.w  # to be checked

from sklearn.cross_validation import KFold

# Train one model on the first of five folds only; the remaining folds are
# never visited.
cv = KFold(n=X.shape[0], n_folds=5)
id_train, id_test = next(iter(cv))
X_train, y_train = X[id_train], y[id_train]
X_test, y_test = X[id_test], y[id_test]
clf = Linear(iter=10000, eps=1e-6, reg=0, debug=True)
clf.fit(X_train, y_train, X_test, y_test)

#iter 0
err_global 86.4777673221
accuracy train 0.371335504886
accuracy test 0.344155844156
#iter 50
err_global 0.91345726923
accuracy train 0.656351791531
accuracy test 0.668831168831
#iter 100
err_global 0.967822068518
accuracy train 0.659609120521
accuracy test 0.655844155844
#iter 150
err_global 0.887617016815
accuracy train 0.657980456026
accuracy test 0.62987012987
#iter 200
err_global 0.892256030827
accuracy train 0.669381107492
accuracy test 0.62987012987
#iter 250
err_global 0.910655852925
accuracy train 0.674267100977
accuracy test 0.649350649351
#iter 300
err_global 0.955785039326
accuracy train 0.661237785016
accuracy test 0.662337662338
#iter 350
err_global 0.87458243634
accuracy train 0.682410423453
accuracy test 0.62987012987
#iter 400
err_global 0.90948039543
accuracy train 0.669381107492
accuracy test 0.655844155844
#iter 450
err_global 0.859080788849
accuracy train 0.679153094463
accuracy test 0.675324675325
#iter 500
err_global 0.85671069691
accuracy train 0.68729641

KeyboardInterrupt: 

In [35]:
from sklearn.cross_validation import KFold

# Manual grid search over the regularisation coefficient: for every value
# in grid_reg, run 5-fold cross-validation nb_iter times and record the
# mean and the standard deviation of the test accuracy.
# NOTE(review): KFold(n=..., n_folds=...) is the sklearn.cross_validation
# signature imported above.
grid_reg = [.1*i for i in range(11)]
# Size the result arrays from the grid so the two cannot drift apart.
grid_acc_mean = np.zeros(len(grid_reg))
grid_acc_std = np.zeros(len(grid_reg))

nb_iter = 1   # repetitions of the whole cross-validation per grid value
n_folds = 5

for g, reg in enumerate(grid_reg):

    acc_mean = np.zeros(nb_iter)
    acc_std = np.zeros(nb_iter)

    for i in range(nb_iter):
        cv = KFold(n=X.shape[0], n_folds=n_folds)
        acc = np.zeros(n_folds)
        for j, (id_train, id_test) in enumerate(cv):
            X_train = X[id_train]
            y_train = y[id_train]
            X_test = X[id_test]
            y_test = y[id_test]
            clf = Linear(iter=200, eps=1e-5, reg=reg, debug=True)
            clf.fit(X_train, y_train, X_test, y_test)
            acc[j] = clf.accuracy(X_test, y_test)
        acc_mean[i] = acc.mean()
        acc_std[i] = acc.std()

    grid_acc_mean[g] = acc_mean.mean()
    grid_acc_std[g] = acc_std.mean()
    # Single-argument print calls give the same text on Python 2 and 3.
    print("reg %s" % (reg,))
    print("total mean: %s" % (grid_acc_mean[g],))
    print("total std: %s" % (grid_acc_std[g],))

#iter 0
err_global 28148.2871197
accuracy train 0.578947368421
accuracy test 0.4
#iter 1
err_global 10086.8484051
accuracy train 0.526315789474
accuracy test 0.466666666667
#iter 2
err_global 5957.02679322
accuracy train 0.526315789474
accuracy test 0.466666666667
#iter 3
err_global 3793.06544164
accuracy train 0.561403508772
accuracy test 0.466666666667
#iter 4
err_global 2835.49258799
accuracy train 0.543859649123
accuracy test 0.466666666667
#iter 5
err_global 2213.81593863
accuracy train 0.508771929825
accuracy test 0.466666666667
#iter 6
err_global 1822.16031747
accuracy train 0.508771929825
accuracy test 0.466666666667
#iter 7
err_global 1474.75534794
accuracy train 0.456140350877
accuracy test 0.466666666667
#iter 8
err_global 1213.46311489
accuracy train 0.456140350877
accuracy test 0.466666666667
#iter 9
err_global 1017.99577306
accuracy train 0.438596491228
accuracy test 0.466666666667
#iter 10
err_global 876.731527383
accuracy train 0.491228070175
accuracy test 0.46666666666

KeyboardInterrupt: 

In [111]:
from sklearn.grid_search import GridSearchCV

# Candidate values for Linear's `reg` argument: 0.0, 0.1, ..., 1.0.
parameter_grid = dict(
    reg=[.1 * i for i in range(11)],
)

# grid_search = GridSearchCV(Linear(iter=100, eps=1e-2), parameter_grid, cv=5, verbose=3)
# grid_search.fit(X, y)

In [10]:
# Shape of the weight vector of the last fitted model; Linear.fit creates
# it with X.shape[1] entries.
clf.w.shape

(150,)

In [12]:
# Shape of the feature matrix currently bound to X.
X.shape

(150, 4)