In [248]:
import numpy as np
import random
from sklearn.datasets.mldata import fetch_mldata
from sklearn import metrics

In [249]:
class LinearL1(object):
    """Linear model with an L1 penalty, trained by stochastic gradient descent.

    A sample ``x`` is scored with ``x.dot(teta)`` and classified as ``1`` when
    the score is strictly positive, ``-1`` otherwise. Training minimises the
    mean squared error plus ``lamb * ||teta||_1``. A weight whose sign would
    flip during an update is clipped to exactly zero instead, which is what
    produces sparse solutions.

    Parameters
    ----------
    nb_iter : int
        Number of epochs; each epoch performs one update per training row.
    eps : float
        Learning rate (step size) of each update.
    lamb : float
        Weight of the L1 penalty; ``0`` removes the penalty entirely.
    """

    def __init__(self, nb_iter=10000, eps=1e-3, lamb=0):
        self.nb_iter = nb_iter
        self.eps = eps
        self.lamb = lamb

    def fit(self, X, Y, Xtest, Ytest):
        """Train the weights on ``(X, Y)`` and print progress after each epoch.

        Parameters
        ----------
        X : 2-D array, one row per example.
        Y : 1-D array of targets aligned with the rows of ``X``.
        Xtest, Ytest : held-out data, used only for the printed test accuracy.

        The learned weights are stored in ``self.teta``.
        """
        l = X.shape[0]  # number of examples
        n = X.shape[1]  # number of features
        # Weights start from a zero-mean normal distribution with std 0.1.
        self.teta = np.random.normal(0.0, 0.1, n)
        print(self.teta)
        for it in range(self.nb_iter):
            for _ in range(l):
                # One stochastic step on a row drawn uniformly, with replacement.
                idx = random.randint(0, l - 1)
                teta_prim = self.teta - self.eps * self.grad_loss(X[idx], Y[idx])
                # A weight that would change sign is clipped to zero; the
                # others take the updated value.
                crossed_zero = (teta_prim * self.teta) < 0
                self.teta = np.where(crossed_zero, 0., teta_prim)

            # Single-argument print calls behave the same on Python 2 and 3.
            print("iteration %s" % it)
            print("erreur : %s" % self.loss(X, Y))
            print("accuracy en train %s" % self.accuracy(X, Y))
            print("accuracy en test %s" % self.accuracy(Xtest, Ytest))
            print("-------------")

    def loss(self, X, Y):
        """Return the mean squared error on ``(X, Y)`` plus ``lamb * ||teta||_1``."""
        l = X.shape[0]
        res = ((Y - X.dot(self.teta)) ** 2).sum()
        # ord=1 selects the L1 norm; the default of np.linalg.norm is L2.
        return (1. / l) * res + self.lamb * np.linalg.norm(self.teta, 1)

    def grad_loss(self, x, y):
        """Return the (sub)gradient of the penalised squared error at one sample.

        The data term is ``-2 * (y - x.teta) * x``. The penalty contributes
        ``lamb * sign(teta)`` coordinate by coordinate, i.e. nothing for a
        weight that is exactly zero.
        """
        res = -2 * (y - x.dot(self.teta)) * x
        return res + self.lamb * np.sign(self.teta)

    def accuracy(self, X, Y):
        """Return the fraction of rows of ``X`` whose prediction equals ``Y``."""
        pred = self.predict(X)
        return np.sum(Y == pred) * 1. / X.shape[0]

    def predict(self, X):
        """Return ``1`` where the linear score is strictly positive, else ``-1``."""
        return np.where(X.dot(self.teta) > 0, 1, -1)


In [250]:
def load_data(nom):
    """Fetch the dataset called ``nom`` and return ``(data, target)``.

    NOTE(review): ``fetch_mldata`` was deprecated in scikit-learn 0.20 and
    removed in later releases -- confirm the installed version still has it.
    """
    dataset = fetch_mldata(nom)
    return dataset.data, dataset.target

def detail_data(X, Y):
    """Print the shapes of ``X`` and ``Y``, the distinct labels and class counts.

    ``X`` and ``Y`` are expected to be numpy arrays; ``Y`` holds the labels,
    of which the ``-1`` and ``1`` entries are counted.

    Every line is built with a single formatted string so the output is the
    same on Python 2 and Python 3 (a parenthesised multi-argument ``print``
    shows a tuple repr on Python 2).
    """
    print("X %s" % (X.shape,))
    print("Y %s" % (Y.shape,))
    print("labels %s" % (np.unique(Y),))
    print("-1: %s" % (Y == -1).sum())
    print("1: %s" % (Y == 1).sum())
    # Leaves two blank lines after the summary.
    print('\n')

def split_data(X, Y, n = 0.80):
    """Split ``(X, Y)`` in order into a leading train part and a trailing test part.

    ``n`` is the fraction of rows kept for training (80% train / 20% test by
    default). Rows are not shuffled.

    Returns ``(Xtrain, Ytrain, Xtest, Ytest)``.
    """
    cut = int(X.shape[0] * n)  # index of the first test row
    return X[:cut], Y[:cut], X[cut:], Y[cut:]

# Fetch the 'leukemia' dataset and print a summary of its shapes and labels.
X,Y = load_data('leukemia')
detail_data(X, Y)

# Keep the leading rows for training and the trailing rows for testing,
# using split_data's default train fraction.
Xtrain, Ytrain, Xtest, Ytest = split_data(X, Y)
#detail_data(Xtrain, Ytrain)
#detail_data(Xtest, Ytest)


('X', (72, 7129))
('Y', (72,))
('labels', array([-1,  1]))
('-1:', 25)
('1:', 47)




In [251]:
# Train for 5 epochs with a step size of 1e-8; lamb=0 leaves the penalty at zero.
# fit() prints the loss and the train/test accuracy after every epoch.
clf = LinearL1(nb_iter=5, eps=1e-8, lamb=0)
clf.fit(Xtrain, Ytrain, Xtest, Ytest)

[ 0.34301424 -0.07184514  0.35645215 ...,  0.52470111 -0.25628301
 -0.03616321]
iteration 0
erreur : 2105.34883526
accuracy en train 0.473684210526
accuracy en test 0.666666666667
-------------
iteration 1
erreur : 2102.87202695
accuracy en train 0.473684210526
accuracy en test 0.666666666667
-------------
iteration 2
erreur : 2099.64630432
accuracy en train 0.473684210526
accuracy en test 0.666666666667
-------------
iteration 3
erreur : 2097.45926594
accuracy en train 0.473684210526
accuracy en test 0.666666666667
-------------
iteration 4
erreur : 2095.81433147
accuracy en train 0.473684210526
accuracy en test 0.666666666667
-------------
