# TP4 - Filtrage Collaboratif

## Imports

In [4]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator,ClassifierMixin
from sklearn import metrics
from sklearn.svm import SVC
from sklearn import cross_validation

## Chargement des données

In [5]:
def loadMovieLens(path='./data100k'):
  # Get movie titles
  movies={}
  for line in open(path+'/u.item'):
    (id,title)=line.split('|')[0:2]
    movies[id]=title
  # Load data
  prefs={}
  for line in open(path+'/u.data'):
    (user,movieid,rating,ts)=line.split('\t')
    prefs.setdefault(user,{})
    prefs[user][movies[movieid]]=float(rating)
  return prefs

In [6]:
data = loadMovieLens()

## Séparation en données de Train et de Test

Pour pouvoir séparer les données en ensembles de Train et de Test, on construit la liste des couples (Utilisateurs, Objets) dont on connait les scores.
On extrait ensuite aléatoirement une portion (20%) de ces couples pour les données de test, et le reste sera utilisé en apprentissage.

Comme on ne souhaite ne pas évaluer les objets et les utilisateurs qui n'ont jamais été rencontré en apprentissage, on retire les couples correspondants de l'ensemble de test.

Il faut ensuite a partir de ces couples reconstruire le dictionnaire qui pour chaque utilisateur donne les objets qu'il a noté, ainsi que le dictionnaire qui pour chaque objet donne les utilisateurs qui l'ont noté.

In [7]:
def getCouplesUsersItems(data):
    couples = []
    for u in data.keys():
        for i in data[u].keys():
            couples.append([u,i])
    return couples

def splitTrainTest(couples,testProp):
    perm = np.random.permutation(couples)
    splitIndex = int(testProp * len(couples))
    return perm[splitIndex:], perm[:splitIndex]

def buildUsersDict(data,couples=None):
    if (couples == None):
        couples = getCouplesUsersItems(data)
    dicUsers = {}
    for c in couples:
        if not c[0] in dicUsers.keys():
            dicUsers[c[0]] = {}
        dicUsers[c[0]][c[1]] = data[c[0]][c[1]]
    return dicUsers

def buildItemsDict(data,couples=None):
    if (couples == None):
        couples = getCouplesUsersItems(data)
    dicItems = {}
    for c in couples:
        if not c[1] in dicItems:
            dicItems[c[1]] = {}
        dicItems[c[1]][c[0]] = data[c[0]][c[1]]
    return dicItems

In [8]:
couples = getCouplesUsersItems(data)

trainCouples, testCouples = splitTrainTest(couples,.20)

trainUsers = buildUsersDict(data, trainCouples)
trainItems = buildItemsDict(data, trainCouples)

toDel = []

for i,c in enumerate(testCouples):
    if not c[0] in trainUsers:
        toDel.append(i)
    elif not c[1] in trainItems:
        toDel.append(i)

testCouples = np.delete(testCouples, toDel, 0)

testUsers  = buildUsersDict(data, testCouples)
testItems  = buildItemsDict(data, testCouples)

#print len(trainUsers), len(testUsers)
#print len(trainItems), len(testItems)

## Baseline 1 : Moyenne par utilisateur

Ce modèle 

In [9]:
class baselineMeanUsers(BaseEstimator,ClassifierMixin):
    def __init__(self):            
        self.mean = {}
    def fit(self, dataUsers):
        self.mean = {}
        for u in dataUsers.keys():
            self.mean[u] = 0
            for i in dataUsers[u].keys():
                self.mean[u] = self.mean[u] + dataUsers[u][i]
            self.mean[u] = self.mean[u] / len(dataUsers[u])
    def predict(self, couplesTest):
        pred = np.zeros(len(couplesTest))
        for ind,c in enumerate(couplesTest):
            pred[ind] = self.mean[c[0]]
        return pred

In [10]:
model1 = baselineMeanUsers()
model1.fit(trainUsers)
pred = model1.predict(testCouples)
print pred

[ 2.87795276  3.95505618  3.95950156 ...,  3.9375      3.75824176
  3.57894737]


## Baseline 2 : Moyenne par item



In [11]:
class baselineMeanItems(BaseEstimator,ClassifierMixin):
    def __init__(self):            
        self.mean = {}
    def fit(self, dataItems):
        self.mean = {}
        for i in dataItems.keys():
            self.mean[i] = 0
            for u in dataItems[i].keys():
                self.mean[i] = self.mean[i] + dataItems[i][u]
            self.mean[i] = self.mean[i] / len(dataItems[i])
    def predict(self, couplesTest):
        pred = np.zeros(len(couplesTest))
        for ind,c in enumerate(couplesTest):
            pred[ind] = self.mean[c[1]]
        return pred

In [12]:
model2 = baselineMeanItems()
model2.fit(trainItems)
pred = model2.predict(testCouples)
print pred

[ 3.81108312  3.26950355  2.84375    ...,  3.92405063  4.1         3.54491018]


## Factorisation de Matrices

Faire varier N la taille de l'espace

In [30]:
class matrixFactorisation():
    def __init__(self, k, lambd=0.1, eps=1e-5, maxIter=1000):
        self.k = k
        self.lambd = lambd
        self.eps = eps
        self.maxIter = maxIter
    def fit(self, dataUsers, dataItems):
        self.p = {}
        self.q = {}
        self.loss = []
        for i in xrange(1000):
            loss = 0
            for user in dataUsers.keys():
                if not user in self.p:
                    self.p[user] = np.random.rand(1,self.k)
                for item in dataUsers[user].keys():
                    if not item in self.q:
                        self.q[item] = np.random.rand(self.k,1)
                    tmp = dataUsers[user][item] - self.p[user].dot(self.q[item])[0][0]
                    self.p[user] = (1 - self.lambd * self.eps) * self.p[user] + self.eps * 2 * tmp * self.q[item]
                    self.q[item] = (1 - self.lambd * self.eps) * self.q[item] + self.eps * 2 * tmp * self.p[user]
                    #self.p[user] = (1 - self.lambd * self.eps) * self.p[user] - self.eps * 2 * tmp * self.q[item]
                    #self.q[item] = (1 - self.lambd * self.eps) * self.q[item] - self.eps * 2 * tmp * self.p[user]
                    loss = loss + tmp*tmp #Sans régularisation
            self.loss.append(loss)
            if (i % 10 == 0):
                print i, loss
    def predict(self, couplesTest):
        pred = np.zeros(len(couplesTest))
        for ind,c in enumerate(couplesTest):
            pred[ind] = self.p[c[0]].dot(self.q[c[1]])

In [31]:
model3 = matrixFactorisation(10)
model3.fit(trainUsers, trainItems)
pred = model3.predict(testCouples)
print pred

0 234222.924798
10 203548.029487
20 179483.066694
30 160742.073762
40 146182.272231
50 134845.056737
60 125961.10847
70 118932.949467
80 113307.176138
90 108744.836121
100 104994.595042
110 101870.552871
120 99234.9227778
130 96984.9995591
140 95043.587144
150 93352.068615
160 91865.4272423
170 90548.6774822
180 89374.3021428
190 88320.403088
200 87369.3572141
210 86506.8309932
220 85721.0507314
230 85002.2564653
240 84342.2888575
250 83734.2733285
260 83172.3759976
270 82651.6131999
280 82167.7013922
290 81716.9378135
300 81296.1047931
310 80902.3924133


KeyboardInterrupt: 

## Factorisation de Matrices avec biais

In [44]:
class matrixFactorisation():
    def __init__(self, k, lambd=0.1, eps=1e-5, maxIter=1000):
        self.k = k
        self.lambd = lambd
        self.eps = eps
        self.maxIter = maxIter
    def fit(self, dataUsers, dataItems):
        self.p = {}
        self.q = {}
        self.bu = {}
        self.bi = {}
        self.mu = np.random.random() * 2 - 1
        self.loss = []
        for i in xrange(1000):
            loss = 0
            for user in dataUsers.keys():
                if not user in self.p:
                    self.p[user] = np.random.rand(1,self.k) * 2 - 1
                    self.bu[user] = np.random.rand() * 2 - 1
                for item in dataUsers[user].keys():
                    if not item in self.q:
                        self.q[item] = np.random.rand(self.k,1) * 2 - 1
                        self.bi[item] = np.random.rand() * 2 - 1
                    tmp = dataUsers[user][item] - (self.mu + self.bi[item] + self.bu[user] + self.p[user].dot(self.q[item])[0][0])
                    self.p[user] = (1 - self.lambd * self.eps) * self.p[user] + self.eps * 2 * tmp * self.q[item]
                    self.q[item] = (1 - self.lambd * self.eps) * self.q[item] + self.eps * 2 * tmp * self.p[user]
                    self.bu[user] = (1 - self.lambd * self.eps) * self.bu[user] + self.eps * 2 * tmp
                    self.bi[item] = (1 - self.lambd * self.eps) * self.bi[item] + self.eps * 2 * tmp
                    self.mu = (1 - self.lambd * self.eps) * self.mu + self.eps * 2 * tmp
                    #self.p[user] = (1 - self.lambd * self.eps) * self.p[user] - self.eps * 2 * tmp * self.q[item]
                    #self.q[item] = (1 - self.lambd * self.eps) * self.q[item] - self.eps * 2 * tmp * self.p[user]
                    loss = loss + tmp*tmp #Sans régularisation
            self.loss.append(loss)
            if (i % 10 == 0):
                print i, loss
    def predict(self, couplesTest):
        pred = np.zeros(len(couplesTest))
        for ind,c in enumerate(couplesTest):
            pred[ind] = self.mu + self.bi[c[0]] + self.bi[c[1]] + self.p[c[0]].dot(self.q[c[1]])

In [45]:
model4 = matrixFactorisation(10)
model4.fit(trainUsers, trainItems)
pred = model4.predict(testCouples)
print pred

0 547034.284413
10 244212.961125
20 238170.674278
30 232796.285584
40 228012.594049
50 223755.878866
60 219973.524692
70 216622.185908
80 213666.369797
90 211077.353156
100 208832.370795
110 206914.03281
120 205309.942387
130 204012.498662
140 203018.881106


KeyboardInterrupt: 

In [None]:
i = 2
u = 1
if items[i] in data[users[u]]:
    print data[users[u]][items[i]]
else:
    print 0
print couples

In [43]:
import numpy as np
print np.random.rand(1,10).dot(np.random.rand(10,1))[0][0]
print np.random.random() * 2 - 1

1.72955982213
0.505221885261


## 