In [1]:
import os
import numpy as np

In [38]:
import urllib
from sklearn.datasets.mldata import fetch_mldata
import tempfile
from mnist import MNIST

class Dataset():
    
    def __init__(self):
        pass
        
    def _print(self, X, y):
        print("shape X", X.shape)
        print("shape y", y.shape)
        unique = np.unique(y)
        print("label", unique)
        print("-1:", (y == -1).sum())
        print("1:", (y == 1).sum())
    
    def load_iris(self):
        test_data_home = tempfile.mkdtemp()
        iris = fetch_mldata('iris', data_home=test_data_home)
        X = iris.data
        y = iris.target
        y[y == 2] = -1 # transformer en 2 labels -1 ou 1
        y[y == 3] = -1
        self._print(X, y)
        return X, y
    
    def load_leuk(self):
        test_data_home = tempfile.mkdtemp()
        leuk = fetch_mldata('leukemia', data_home=test_data_home, transpose_data=True)
        X = leuk.data
        y = leuk.target
        self._print(X, y)
        return X, y
    
    def load_diab(self): # Pirma Indians Diabetes dataset (UCI Machine Learning Rep)
        raw_data = urllib.urlopen("http://goo.gl/j0Rvxq")
        dataset = np.loadtxt(raw_data, delimiter=",")
        X = dataset[:, 0:7]
        y = dataset[:, 8]
        y[y == 0] = -1
        self._print(X, y)
        return X, y
        #os.execute('wget -t inf ' + self.url +)
        
    def load_mnist(self):
        mndata = MNIST('./data')
        X, y = mndata.load_training()
        X = np.array(X)
        y = np.array(y)
        id_sup1 = np.where(y > 1)[0]
        X = np.delete(X, id_sup1, axis=0)
        y = np.delete(y, id_sup1, axis=0)
        y[y == 0] = -1
        self._print(X, y)
        return X, y

    def load_mnist_test(self):
        mndata = MNIST('./data')
        X, y = mndata.load_testing()


In [41]:
dataset = Dataset()
X, y = dataset.load_mnist()
#X, y = dataset.load_leuk()

('shape X', (12665, 784))
('shape y', (12665,))
('label', array([-1,  1]))
('-1:', 5923)
('1:', 6742)


2
60000
60000
784


In [46]:
class Linear():
    
    def __init__(self, iter=100, eps=1e-2, reg=0, debug=False):
        self.iter = iter
        self.eps = eps
        self.reg = reg # coef de régularisation
        self.debug = debug
    
    def fit(self, X, y, Xtest, ytest):
        l = X.shape[0]
        n = X.shape[1]
        self.w = np.random.normal(0, .1, n)
        for i in range(self.iter):
            shuffle = np.random.permutation(l)
            for idx in shuffle:
                err = -2 * (y[idx] - self.w.dot(X[idx]))
                grad = (1 - self.reg) * err * X[idx] + self.grad_L1()
                w_old = np.copy(self.w)
                self.w = self.w - self.eps * grad
                tmp = self.w * w_old
                print(tmp)
                self.w[tmp < 0] = 0
            if self.debug and i%50==0:
                print "#iter", i
                print "err_global", self.err_global(X, y)
                print "accuracy train", self.accuracy(X, y)
                print "accuracy test", self.accuracy(Xtest, ytest)
            break
                
            
    def err_global(self, X, y):
        print(y.shape)
        print(X.shape)
        print(self.w.shape)
        return ((y - X.dot(self.w)) ** 2).sum() / X.shape[0]
    
    def accuracy(self, X, y):
        pred = self.predict(X)
        return np.sum(y == pred)*1. / X.shape[0]
        
    def predict(self, X):
        return np.where( X.dot(self.w) > 0, 1, -1 )
        
    def L0(self):
        return self.w[self.w != 0].shape[0]
    
    def L1(self):
        return np.absolute(self.w).sum()
    
    def grad_L1(self):
        return self.reg * np.sign(self.w)
    
    def L2(self):
        return (self.w ** 2).sum()
    
    def grad_L2(self):
        return 2 * self.reg * self.w # à vérifier

clf = Linear(iter=1, eps=1e-4, reg=.1, debug=True)
clf.fit(X_train, y_train, X_test, y_test)

[  3.46482979e-03   1.66176390e-06   6.79761793e-02   4.08985090e-04
   1.23024825e-02   5.80435622e-04   1.04756982e-02   2.61856537e-03
   1.47289026e-02   8.42593263e-03   5.48639292e-04   1.64401484e-03
   1.00939855e-02   2.09295924e-03   8.74823925e-03   1.20557630e-02
   4.62813573e-03   2.53026248e-03   3.80831830e-02   3.04201177e-03
   5.52954225e-03   4.52965374e-03   1.73366553e-02   4.20122184e-03
   1.28834589e-03   1.94288312e-05   1.99001155e-02   8.90779135e-03
   9.35275707e-04   3.68234297e-03   9.86423366e-04   4.12523520e-03
   1.33750465e-02   6.28056376e-03   1.17934227e-02   1.81499325e-02
   2.24197438e-03   1.81412114e-04   5.23011604e-02   4.25740926e-02
   6.62562547e-03   6.09476760e-02   9.88834476e-04   3.65944248e-03
   1.76819169e-02   1.18746237e-02   6.91915066e-04   6.00926568e-02
   1.34608611e-02   4.83514485e-03   1.40806586e-03   7.89157153e-04
   5.03047165e-04   3.59068003e-03   3.68934136e-02   3.74861465e-05
   1.97439870e-03   6.90639746e-03




[  3.29852707e-003   0.00000000e+000   6.72325581e-002   3.53191122e-004
   1.19873060e-002   5.13576738e-004   1.01850197e-002   2.47425858e-003
   1.43838502e-002   8.16544990e-003   4.83694266e-004   1.53009688e-003
   9.80868959e-003   1.96416237e-003   8.48278256e-003   1.17437835e-002
   4.43561368e-003   2.38844448e-003   3.75271013e-002   2.88631498e-003
   5.31891500e-003   4.33921291e-003   1.69621277e-002   4.01789060e-003
   1.18773527e-003   8.86737283e-006   1.94987066e-002   8.63990631e-003
   8.49855291e-004   3.51083645e-003   8.98643165e-004   3.94358804e-003
   1.30463308e-002   6.05595364e-003   1.14848787e-002   1.77666734e-002
   2.10859973e-003   1.44935868e-004   5.16491391e-002   0.00000000e+000
   0.00000000e+000               inf   9.00944563e-004   3.48847646e-003
   1.73036580e-002   1.15650122e-002   6.18729728e-004   5.93936069e-002
   1.31310861e-002   4.63831906e-003   1.30279154e-003   7.10859100e-004
   4.40945944e-004   3.42134720e-003   3.63461195e



KeyboardInterrupt: 

In [None]:
from sklearn.cross_validation import KFold

cv = KFold(n=X.shape[0], n_folds=5)
for id_train, id_test in cv:
    X_train = X[id_train]
    y_train = y[id_train]
    X_test = X[id_test]
    y_test = y[id_test]
    clf = Linear(iter=100, eps=1e-4, reg=.1, debug=True)
    clf.fit(X_train, y_train, X_test, y_test)
    break

In [35]:
from sklearn.cross_validation import KFold

grid_reg = [.1*i for i in range(11)]
grid_acc_mean = np.zeros(11)
grid_acc_std = np.zeros(11)

for g, reg in enumerate(grid_reg):

    nb_iter = 1
    acc_mean = np.zeros(nb_iter)
    acc_std = np.zeros(nb_iter) 

    for i in range(nb_iter):
        # print "iter:", i
        n_folds = 5
        cv = KFold(n=X.shape[0], n_folds=n_folds)
        acc = np.zeros(n_folds)
        j = 0
        for id_train, id_test in cv:
            X_train = X[id_train]
            y_train = y[id_train]
            X_test = X[id_test]
            y_test = y[id_test]
            clf = Linear(iter=200, eps=1e-5, reg=reg, debug=True)
            clf.fit(X_train, y_train, X_test, y_test)
            acc[j] = clf.accuracy(X_test, y_test)
            # print "prediction accuracy:", acc[j], "\t", "L0:", clf.L0()#, clf.L1(), clf.L2()
            j += 1
        acc_mean[i] = acc.mean()
        acc_std[i] = acc.std()

    grid_acc_mean[g] = acc_mean.mean()
    grid_acc_std[g] = acc_std.mean()
    print "reg", reg
    print "total mean:", grid_acc_mean[g]
    print "total std:", grid_acc_std[g]

#iter 0
err_global 28148.2871197
accuracy train 0.578947368421
accuracy test 0.4
#iter 1
err_global 10086.8484051
accuracy train 0.526315789474
accuracy test 0.466666666667
#iter 2
err_global 5957.02679322
accuracy train 0.526315789474
accuracy test 0.466666666667
#iter 3
err_global 3793.06544164
accuracy train 0.561403508772
accuracy test 0.466666666667
#iter 4
err_global 2835.49258799
accuracy train 0.543859649123
accuracy test 0.466666666667
#iter 5
err_global 2213.81593863
accuracy train 0.508771929825
accuracy test 0.466666666667
#iter 6
err_global 1822.16031747
accuracy train 0.508771929825
accuracy test 0.466666666667
#iter 7
err_global 1474.75534794
accuracy train 0.456140350877
accuracy test 0.466666666667
#iter 8
err_global 1213.46311489
accuracy train 0.456140350877
accuracy test 0.466666666667
#iter 9
err_global 1017.99577306
accuracy train 0.438596491228
accuracy test 0.466666666667
#iter 10
err_global 876.731527383
accuracy train 0.491228070175
accuracy test 0.46666666666

KeyboardInterrupt: 

In [111]:
from sklearn.grid_search import GridSearchCV

parameter_grid = {
    'reg': [.1*i for i in range(11)]
}

# grid_search = GridSearchCV(Linear(iter=100, eps=1e-2), parameter_grid, cv=5, verbose=3)
# grid_search.fit(X, y)

In [10]:
clf.w.shape

(150,)

In [12]:
X.shape

(150, 4)