# TP4 - Filtrage Collaboratif

## Imports

In [14]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator,ClassifierMixin
from sklearn import metrics
from sklearn.svm import SVC
from sklearn import cross_validation

## Chargement des données

In [15]:
def loadMovieLens(path='./data100k'):
  # Get movie titles
  movies={}
  for line in open(path+'/u.item'):
    (id,title)=line.split('|')[0:2]
    movies[id]=title
  # Load data
  prefs={}
  for line in open(path+'/u.data'):
    (user,movieid,rating,ts)=line.split('\t')
    prefs.setdefault(user,{})
    prefs[user][movies[movieid]]=float(rating)
  return prefs

In [16]:
data = loadMovieLens()

## Séparation en données de Train et de Test

Pour pouvoir séparer les données en ensembles de Train et de Test, on construit la liste des couples (Utilisateurs, Objets) dont on connait les scores.
On extrait ensuite aléatoirement une portion (20%) de ces couples pour les données de test, et le reste sera utilisé en apprentissage.

Comme on ne souhaite ne pas évaluer les objets et les utilisateurs qui n'ont jamais été rencontré en apprentissage, on retire les couples correspondants de l'ensemble de test.

Il faut ensuite a partir de ces couples reconstruire le dictionnaire qui pour chaque utilisateur donne les objets qu'il a noté, ainsi que le dictionnaire qui pour chaque objet donne les utilisateurs qui l'ont noté.

In [17]:
# Recupère une représentation des données sous la forme triplets [user, item, note]
def getCouplesUsersItems(data):
    couples = []
    for u in data.keys():
        for i in data[u].keys():
            couples.append([u,i,data[u][i]])
    return couples

# Split l'ensemble des triplets [user, item, note] en testProp% données de test et (1 - testProp) données de train
def splitTrainTest(couples,testProp):
    perm = np.random.permutation(couples)
    splitIndex = int(testProp * len(couples))
    return perm[splitIndex:], perm[:splitIndex]

# Construit le dictionnaire des utilisateurs a partir des triplets [user, item, note]
def buildUsersDict(couples):
    dicUsers = {}
    for c in couples:
        if not c[0] in dicUsers.keys():
            dicUsers[c[0]] = {}
        dicUsers[c[0]][c[1]] = float(c[2])
    return dicUsers

# Construit le dictionnaire des objets a partir des triplets [user, item, note]
def buildItemsDict(couples):
    dicItems = {}
    for c in couples:
        if not c[1] in dicItems:
            dicItems[c[1]] = {}
        dicItems[c[1]][c[0]] = float(c[2])
    return dicItems

In [19]:
couples = getCouplesUsersItems(data)

trainCouples, testCouples = splitTrainTest(couples,.20)

trainUsers = buildUsersDict(trainCouples)
trainItems = buildItemsDict(trainCouples)

toDel = []

for i,c in enumerate(testCouples):
    if not c[0] in trainUsers:
        toDel.append(i)
    elif not c[1] in trainItems:
        toDel.append(i)

testCouples = np.delete(testCouples, toDel, 0)

testUsers  = buildUsersDict(testCouples)
testItems  = buildItemsDict(testCouples)

#print len(trainUsers), len(testUsers)
#print len(trainItems), len(testItems)

## Baseline 1 : Moyenne par utilisateur

Ce modèle 

In [20]:
class baselineMeanUsers(BaseEstimator,ClassifierMixin):
    def __init__(self):            
        self.mean = {}
    def fit(self, dataUsers):
        self.mean = {}
        for u in dataUsers.keys():
            self.mean[u] = 0
            for i in dataUsers[u].keys():
                self.mean[u] = self.mean[u] + dataUsers[u][i]
            self.mean[u] = self.mean[u] / len(dataUsers[u])
    def predict(self, couplesTest):
        pred = np.zeros(len(couplesTest))
        for ind,c in enumerate(couplesTest):
            pred[ind] = self.mean[c[0]]
        return pred

In [21]:
model1 = baselineMeanUsers()
model1.fit(trainUsers)
pred = model1.predict(testCouples)
print ((pred - np.array(testCouples[:,2], float)) ** 2).mean()

1.09842596027


## Baseline 2 : Moyenne par item



In [22]:
class baselineMeanItems(BaseEstimator,ClassifierMixin):
    def __init__(self):            
        self.mean = {}
    def fit(self, dataItems):
        self.mean = {}
        for i in dataItems.keys():
            self.mean[i] = 0
            for u in dataItems[i].keys():
                self.mean[i] = self.mean[i] + dataItems[i][u]
            self.mean[i] = self.mean[i] / len(dataItems[i])
    def predict(self, couplesTest):
        pred = np.zeros(len(couplesTest))
        for ind,c in enumerate(couplesTest):
            pred[ind] = self.mean[c[1]]
        return pred

In [23]:
model2 = baselineMeanItems()
model2.fit(trainItems)
pred = model2.predict(testCouples)
print ((pred - np.array(testCouples[:,2], float)) ** 2).mean()

1.0500907081


## Factorisation de Matrices

Faire varier N la taille de l'espace

In [82]:
class matrixFactorisation():
    def __init__(self, k, lambd=0.2, eps=1e-5, maxIter=2000):
        self.k = k
        self.lambd = lambd
        self.eps = eps
        self.maxIter = maxIter
    def fit(self, dataUsers, dataItems, couples):
        self.p = {}
        self.q = {}
        self.couples = couples
        self.loss = []
        for i in xrange(self.maxIter):
            loss = 0
            for j in xrange(len(couples)):
                r = np.random.randint(len(couples))
                user = couples[r][0]
                item = couples[r][1]
                if not user in self.p:
                    self.p[user] = np.random.rand(1,self.k)
                if not item in self.q:
                    self.q[item] = np.random.rand(self.k,1)
                tmp = dataUsers[user][item] - self.p[user].dot(self.q[item])[0][0]
                self.p[user] = (1 - self.lambd * self.eps) * self.p[user] + self.eps * 2 * tmp * self.q[item].transpose()
                self.q[item] = (1 - self.lambd * self.eps) * self.q[item] + self.eps * 2 * tmp * self.p[user].transpose()
                loss = loss + tmp*tmp #Sans régularisation
            self.loss.append(loss)
            if (i % 10 == 0):
                print i, loss / len(couples)
    def predict(self, couplesTest):
        pred = np.zeros(len(couplesTest))
        for ind,c in enumerate(couplesTest):
            pred[ind] = self.p[c[0]].dot(self.q[c[1]])[0][0]
        return pred

In [83]:
model3 = matrixFactorisation(10)
model3.fit(trainUsers, trainItems, trainCouples)
pred = model3.predict(testCouples)
print ((pred - np.array(testCouples[:,2], float)) ** 2).mean()

0 2.7611026792
10 2.40908351508
20 2.14855326722
30 1.93994333565


KeyboardInterrupt: 

## Factorisation de Matrices avec biais

In [80]:
class matrixFactorisation():
    def __init__(self, k, lambd=0.2, eps=1e-6, maxIter=1000):
        self.k = k
        self.lambd = lambd
        self.eps = eps
        self.maxIter = maxIter
    def fit(self, dataUsers, dataItems, couples):
        self.p = {}
        self.q = {}
        self.bu = {}
        self.bi = {}
        self.mu = np.random.random() * 2 - 1
        self.loss = []
        for i in xrange(self.maxIter):
            loss = 0
            for j in xrange(len(couples)):
                r = np.random.randint(len(couples))
                user = couples[r][0]
                item = couples[r][1]
                if not user in self.p:
                    self.p[user] = np.random.rand(1,self.k) * 2 - 1
                    self.bu[user] = np.random.rand() * 2 - 1
                if not item in self.q:
                    self.q[item] = np.random.rand(self.k,1) * 2 - 1
                    self.bi[item] = np.random.rand() * 2 - 1
                tmp = dataUsers[user][item] - (self.mu + self.bi[item] + self.bu[user] + self.p[user].dot(self.q[item])[0][0])
                self.p[user] = (1 - self.lambd * self.eps) * self.p[user] + self.eps * 2 * tmp * self.q[item].transpose()
                self.q[item] = (1 - self.lambd * self.eps) * self.q[item] + self.eps * 2 * tmp * self.p[user].transpose()
                self.bu[user] = (1 - self.lambd * self.eps) * self.bu[user] + self.eps * 2 * tmp
                self.bi[item] = (1 - self.lambd * self.eps) * self.bi[item] + self.eps * 2 * tmp
                self.mu = (1 - self.lambd * self.eps) * self.mu + self.eps * 2 * tmp
                loss = loss + tmp*tmp #Sans régularisation
            self.loss.append(loss)
            if (i % 10 == 0):
                print i, loss / len(couples)
    def predict(self, couplesTest):
        pred = np.zeros(len(couplesTest))
        for ind,c in enumerate(couplesTest):
            pred[ind] = self.mu + self.bu[c[0]] + self.bi[c[1]] + self.p[c[0]].dot(self.q[c[1]])[0][0]
        return pred

In [81]:
model4 = matrixFactorisation(10)
model4.fit(trainUsers, trainItems, trainCouples)
pred = model4.predict(testCouples)
print ((pred - np.array(testCouples[:,2], float)) ** 2).mean()

1.65863463885
4.65375534569
4.05076313516
5.5234256361
5.10065586414
4.39173908362
4.16207955206
2.01246869523
4.9614363919
1.73651026598
2.21358215161
1.75174113854
5.17293900921
1.30396477477
4.57207546125
1.43442185644
1.54222141152
0.931866750541
6.71328906734
3.35844544975
4.44742663929
2.2981444106
3.30734117482
2.15259495236
3.89773366839
1.77231739445
1.92238973706
4.88250451499
1.54649728779
5.01543803317
6.83022376791
3.44169296817
3.77858444736
5.67267296738
4.73629434347
2.93741926204
3.5352234559
3.25382781794
5.35648408833
5.4843311148
-1.19112916328
2.02024345826
2.98739809271
2.97912504219
3.18348425524
2.67910403923
3.85625766314
3.95917312603
-0.472365451045
5.20167152188
3.64879540205
3.1056457362
5.95271159107
3.56892755377
3.99061619953
6.54586929288
4.58948149419
2.90384280624
3.28842003059
3.01039684518
1.14590262964
2.86824027043
1.32671575384
3.26586538831
2.91254711201
4.15040115731
3.82991390735
5.92471165523
4.29937285199
4.11197370196
2.51080127883
3.242459

KeyboardInterrupt: 

In [62]:
self = model4
pred = np.zeros(len(testCouples))
for ind,c in enumerate(testCouples):
    print self.mu
    print self.bu[c[0]]
    print self.bi[c[1]]
    
    print self.p[c[0]]
    #print self.p[c[0]].dot(self.q[c[1]])
    print self.mu + self.bu[c[0]] + self.bi[c[1]] + self.p[c[0]].dot(self.q[c[1]])[0][0]
    
    pred[ind] = self.mu + self.bu[c[0]] + self.bi[c[1]] + self.p[c[0]].dot(self.q[c[1]])
return pred

3.01457009654
0.159500656008
-0.0261422277277
[[-0.11500515 -0.1952281  -0.66609346 -0.45923279 -0.81687093 -0.05832129
  -0.85131032 -0.2891258   0.89148142 -0.55480397]
 [-0.07831055 -0.15853351 -0.62939886 -0.42253819 -0.78017633 -0.02162669
  -0.81461572 -0.2524312   0.92817602 -0.51810937]
 [ 0.08352307  0.00330012 -0.46756524 -0.26070457 -0.61834271  0.14020693
  -0.6527821  -0.09059757  1.09000965 -0.35627574]
 [-0.05471341 -0.13493636 -0.60580172 -0.39894105 -0.75657919  0.00197045
  -0.79101858 -0.22883405  0.95177316 -0.49451222]
 [ 0.03092152 -0.04930143 -0.52016678 -0.31330611 -0.67094425  0.08760539
  -0.70538364 -0.14319912  1.0374081  -0.40887729]
 [-0.11535707 -0.19558002 -0.66644538 -0.45958471 -0.81722285 -0.05867321
  -0.85166224 -0.28947772  0.8911295  -0.55515589]
 [ 0.01150373 -0.06871922 -0.53958457 -0.33272391 -0.69036205  0.06818759
  -0.72480144 -0.16261691  1.01799031 -0.42829508]
 [-0.10324    -0.18346295 -0.6543283  -0.44746764 -0.80510578 -0.04655614
  -0.

ValueError: setting an array element with a sequence.

In [None]:
i = 2
u = 1
if items[i] in data[users[u]]:
    print data[users[u]][items[i]]
else:
    print 0
print couples

In [None]:
import numpy as np
print np.random.rand(1,10).dot(np.random.rand(10,1))[0][0]
print np.random.random() * 2 - 1

## 