In [1]:
import os
import numpy as np

In [38]:
import urllib
from sklearn.datasets.mldata import fetch_mldata
import tempfile
from mnist import MNIST

class Dataset():
    
    def __init__(self):
        pass
        
    def _print(self, X, y):
        print("shape X", X.shape)
        print("shape y", y.shape)
        unique = np.unique(y)
        print("label", unique)
        print("-1:", (y == -1).sum())
        print("1:", (y == 1).sum())
    
    def load_iris(self):
        test_data_home = tempfile.mkdtemp()
        iris = fetch_mldata('iris', data_home=test_data_home)
        X = iris.data
        y = iris.target
        y[y == 2] = -1 # transformer en 2 labels -1 ou 1
        y[y == 3] = -1
        self._print(X, y)
        return X, y
    
    def load_leuk(self):
        test_data_home = tempfile.mkdtemp()
        leuk = fetch_mldata('leukemia', data_home=test_data_home, transpose_data=True)
        X = leuk.data
        y = leuk.target
        self._print(X, y)
        return X, y
    
    def load_diab(self): # Pirma Indians Diabetes dataset (UCI Machine Learning Rep)
        raw_data = urllib.urlopen("http://goo.gl/j0Rvxq")
        dataset = np.loadtxt(raw_data, delimiter=",")
        X = dataset[:, 0:7]
        y = dataset[:, 8]
        y[y == 0] = -1
        self._print(X, y)
        return X, y
        #os.execute('wget -t inf ' + self.url +)
        
    def load_mnist(self):
        mndata = MNIST('./data')
        X, y = mndata.load_training()
        X = np.array(X)
        y = np.array(y)
        id_sup1 = np.where(y > 1)[0]
        X = np.delete(X, id_sup1, axis=0)
        y = np.delete(y, id_sup1, axis=0)
        y[y == 0] = -1
        self._print(X, y)
        return X, y

    def load_mnist_test(self):
        mndata = MNIST('./data')
        X, y = mndata.load_testing()


In [41]:
dataset = Dataset()
X, y = dataset.load_mnist()
#X, y = dataset.load_leuk()

('shape X', (12665, 784))
('shape y', (12665,))
('label', array([-1,  1]))
('-1:', 5923)
('1:', 6742)


2
60000
60000
784


In [45]:
class Linear():
    
    def __init__(self, iter=100, eps=1e-2, reg=0, debug=False):
        self.iter = iter
        self.eps = eps
        self.reg = reg # coef de régularisation
        self.debug = debug
    
    def fit(self, X, y, Xtest, ytest):
        l = X.shape[0]
        n = X.shape[1]
        self.w = np.random.normal(0, .1, n)
        for i in range(self.iter):
            shuffle = np.random.permutation(l)
            for idx in shuffle:
                err = -2 * (y[idx] - self.w.dot(X[idx]))
                grad = (1 - self.reg) * err * X[idx] + self.grad_L1()
                w_old = np.copy(self.w)
                self.w = self.w - self.eps * grad
                tmp = self.w * w_old
                print(tmp)
                self.w[tmp < 0] = 0
            if self.debug and i%50==0:
                print "#iter", i
                print "err_global", self.err_global(X, y)
                print "accuracy train", self.accuracy(X, y)
                print "accuracy test", self.accuracy(Xtest, ytest)
            break
                
            
    def err_global(self, X, y):
        print(y.shape)
        print(X.shape)
        print(self.w.shape)
        return ((y - X.dot(self.w)) ** 2).sum() / X.shape[0]
    
    def accuracy(self, X, y):
        pred = self.predict(X)
        return np.sum(y == pred)*1. / X.shape[0]
        
    def predict(self, X):
        return np.where( X.dot(self.w) > 0, 1, -1 )
        
    def L0(self):
        return self.w[self.w != 0].shape[0]
    
    def L1(self):
        return np.absolute(self.w).sum()
    
    def grad_L1(self):
        return self.reg * np.sign(self.w)
    
    def L2(self):
        return (self.w ** 2).sum()
    
    def grad_L2(self):
        return 2 * self.reg * self.w # à vérifier

clf = Linear(iter=100, eps=1e-4, reg=.1, debug=True)
clf.fit(X_train, y_train, X_test, y_test)

[  3.39236745e-03   2.38113339e-05   8.91213791e-04   1.95215486e-03
   1.24412347e-03   1.93779104e-03   4.06351494e-02   2.85344205e-03
   5.54924014e-04   2.36855269e-02   5.39851064e-04   4.63895420e-03
   5.78209712e-04   2.62923263e-02   2.00082385e-02   4.99396033e-03
   6.62991689e-03   4.22553231e-02   2.17253052e-03   1.17182295e-02
   3.86540590e-03   2.83858743e-02   2.50657749e-02   5.99777529e-03
   4.20936044e-04   4.74854261e-03   5.25387863e-03   1.68817475e-03
   2.46202917e-02   1.36566760e-02   4.03996200e-03   4.53410655e-03
   1.14464699e-03   6.98193796e-02   1.43046743e-02   9.38317780e-03
   1.81936432e-02   1.35204587e-03   1.44686329e-03   4.51576418e-04
   1.36790263e-02   1.56841496e-02   2.17115663e-02   9.54907531e-04
   2.47073824e-02   1.09093109e-03   4.01427622e-02   5.22622923e-02
   4.25279063e-03   1.19229837e-03   4.53403958e-03   1.27730492e-04
   2.39129514e-03   7.42402592e-03   7.73721441e-03   5.77052166e-02
   1.22302160e-03   4.00797979e-02




[  3.23351832e-03   1.22477939e-05   8.10723347e-04   1.83211363e-03
   1.14867678e-03   1.81819928e-03   4.00806886e-02   2.70791382e-03
   4.91811575e-04   2.32626644e-02   4.77627703e-04   4.45287536e-03
   5.13747173e-04   2.58466992e-02   1.96197395e-02   4.80082117e-03
   6.40709022e-03   4.16898792e-02   2.04579018e-03   1.14213618e-02
   3.69571451e-03   2.79227709e-02   2.46307113e-02   5.78593065e-03
   3.66214279e-04   4.56025632e-03   5.05572817e-03   1.57667791e-03
   2.41891285e-02   1.33360419e-02   3.86643888e-03   4.35016421e-03
   1.05317332e-03   6.90919994e-02   1.39764768e-02   9.11772982e-03
   1.78232683e-02   1.25246459e-03   1.34378375e-03   3.94829912e-04
   1.33581284e-02   1.53404016e-02   2.13067890e-02   8.71523574e-04
   2.42754539e-02   1.00167477e-03   3.95916824e-02   5.16332345e-02
   4.07470598e-03   1.09890087e-03   4.35009861e-03   9.84419447e-05
   2.25823313e-03   7.18812103e-03   7.49634524e-03   5.70441162e-02
   1.12840404e-03   3.95291519e-0



KeyboardInterrupt: 

In [None]:
from sklearn.cross_validation import KFold

cv = KFold(n=X.shape[0], n_folds=5)
for id_train, id_test in cv:
    X_train = X[id_train]
    y_train = y[id_train]
    X_test = X[id_test]
    y_test = y[id_test]
    clf = Linear(iter=100, eps=1e-4, reg=.1, debug=True)
    clf.fit(X_train, y_train, X_test, y_test)
    break

In [35]:
from sklearn.cross_validation import KFold

grid_reg = [.1*i for i in range(11)]
grid_acc_mean = np.zeros(11)
grid_acc_std = np.zeros(11)

for g, reg in enumerate(grid_reg):

    nb_iter = 1
    acc_mean = np.zeros(nb_iter)
    acc_std = np.zeros(nb_iter) 

    for i in range(nb_iter):
        # print "iter:", i
        n_folds = 5
        cv = KFold(n=X.shape[0], n_folds=n_folds)
        acc = np.zeros(n_folds)
        j = 0
        for id_train, id_test in cv:
            X_train = X[id_train]
            y_train = y[id_train]
            X_test = X[id_test]
            y_test = y[id_test]
            clf = Linear(iter=200, eps=1e-5, reg=reg, debug=True)
            clf.fit(X_train, y_train, X_test, y_test)
            acc[j] = clf.accuracy(X_test, y_test)
            # print "prediction accuracy:", acc[j], "\t", "L0:", clf.L0()#, clf.L1(), clf.L2()
            j += 1
        acc_mean[i] = acc.mean()
        acc_std[i] = acc.std()

    grid_acc_mean[g] = acc_mean.mean()
    grid_acc_std[g] = acc_std.mean()
    print "reg", reg
    print "total mean:", grid_acc_mean[g]
    print "total std:", grid_acc_std[g]

#iter 0
err_global 28148.2871197
accuracy train 0.578947368421
accuracy test 0.4
#iter 1
err_global 10086.8484051
accuracy train 0.526315789474
accuracy test 0.466666666667
#iter 2
err_global 5957.02679322
accuracy train 0.526315789474
accuracy test 0.466666666667
#iter 3
err_global 3793.06544164
accuracy train 0.561403508772
accuracy test 0.466666666667
#iter 4
err_global 2835.49258799
accuracy train 0.543859649123
accuracy test 0.466666666667
#iter 5
err_global 2213.81593863
accuracy train 0.508771929825
accuracy test 0.466666666667
#iter 6
err_global 1822.16031747
accuracy train 0.508771929825
accuracy test 0.466666666667
#iter 7
err_global 1474.75534794
accuracy train 0.456140350877
accuracy test 0.466666666667
#iter 8
err_global 1213.46311489
accuracy train 0.456140350877
accuracy test 0.466666666667
#iter 9
err_global 1017.99577306
accuracy train 0.438596491228
accuracy test 0.466666666667
#iter 10
err_global 876.731527383
accuracy train 0.491228070175
accuracy test 0.46666666666

KeyboardInterrupt: 

In [111]:
from sklearn.grid_search import GridSearchCV

parameter_grid = {
    'reg': [.1*i for i in range(11)]
}

# grid_search = GridSearchCV(Linear(iter=100, eps=1e-2), parameter_grid, cv=5, verbose=3)
# grid_search.fit(X, y)

In [10]:
clf.w.shape

(150,)

In [12]:
X.shape

(150, 4)