# TP4 - Filtrage Collaboratif

## Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt

## Chargement des données

In [2]:
def loadMovieLens(path='./data100k'):
    # Get movie titles
    movies={}
    for line in open(path+'/u.item'):
        (id,title)=line.split('|')[0:2]
        movies[id]=title
    # Load data
    prefs={}
    times={}
    for line in open(path+'/u.data'):
        (user,movieid,rating,ts)=line.split('\t')
        prefs.setdefault(user,{})
        prefs[user][movies[movieid]]=float(rating)
        times.setdefault(user,{})
        times[user][movies[movieid]]=float(ts)
    return prefs, times

In [3]:
data, timestamps = loadMovieLens()

## Séparation en données de Train et de Test

Pour pouvoir séparer les données en ensembles de Train et de Test, on construit la liste des couples (Utilisateurs, Objets) dont on connait les scores.
On extrait ensuite aléatoirement une portion (20%) de ces couples pour les données de test, et le reste sera utilisé en apprentissage.

Comme on ne souhaite ne pas évaluer les objets et les utilisateurs qui n'ont jamais été rencontré en apprentissage, on retire les couples correspondants de l'ensemble de test.

Il faut ensuite a partir de ces couples reconstruire le dictionnaire qui pour chaque utilisateur donne les objets qu'il a noté, ainsi que le dictionnaire qui pour chaque objet donne les utilisateurs qui l'ont noté.

In [4]:
# Recupère une représentation des données sous la forme triplets [user, item, note]
def getCouplesUsersItems(data):
    couples = []
    for u in data.keys():
        for i in data[u].keys():
            couples.append([u,i,data[u][i]])
    return couples

# Split l'ensemble des triplets [user, item, note] en testProp% données de test et (1 - testProp) données de train
def splitTrainTest(couples,testProp):
    perm = np.random.permutation(couples)
    splitIndex = int(testProp * len(couples))
    return perm[splitIndex:], perm[:splitIndex]

# Construit le dictionnaire des utilisateurs a partir des triplets [user, item, note]
def buildUsersDict(couples):
    dicUsers = {}
    for c in couples:
        if not c[0] in dicUsers.keys():
            dicUsers[c[0]] = {}
        dicUsers[c[0]][c[1]] = float(c[2])
    return dicUsers

# Construit le dictionnaire des objets a partir des triplets [user, item, note]
def buildItemsDict(couples):
    dicItems = {}
    for c in couples:
        if not c[1] in dicItems:
            dicItems[c[1]] = {}
        dicItems[c[1]][c[0]] = float(c[2])
    return dicItems

In [5]:
couples = getCouplesUsersItems(data)

trainCouples, testCouples = splitTrainTest(couples,.20)

trainUsers = buildUsersDict(trainCouples)
trainItems = buildItemsDict(trainCouples)

toDel = []

for i,c in enumerate(testCouples):
    if not c[0] in trainUsers:
        toDel.append(i)
    elif not c[1] in trainItems:
        toDel.append(i)

testCouples = np.delete(testCouples, toDel, 0)

testUsers  = buildUsersDict(testCouples)
testItems  = buildItemsDict(testCouples)

#print len(trainUsers), len(testUsers)
#print len(trainItems), len(testItems)

## Baseline 1 : Moyenne par utilisateur

Ce modèle 

In [6]:
class baselineMeanUsers():
    def __init__(self):            
        self.mean = {}
    def fit(self, dataUsers):
        self.mean = {}
        for u in dataUsers.keys():
            self.mean[u] = 0
            for i in dataUsers[u].keys():
                self.mean[u] = self.mean[u] + dataUsers[u][i]
            self.mean[u] = self.mean[u] / len(dataUsers[u])
    def predict(self, couplesTest):
        pred = np.zeros(len(couplesTest))
        for ind,c in enumerate(couplesTest):
            pred[ind] = self.mean[c[0]]
        return pred

In [7]:
model1 = baselineMeanUsers()
model1.fit(trainUsers)
pred = model1.predict(testCouples)
print "erreur en test:", ((pred - np.array(testCouples[:,2], float)) ** 2).mean()

erreur en test: 1.08572773402


## Baseline 2 : Moyenne par item



In [8]:
class baselineMeanItems():
    def __init__(self):            
        self.mean = {}
    def fit(self, dataItems):
        self.mean = {}
        for i in dataItems.keys():
            self.mean[i] = 0
            for u in dataItems[i].keys():
                self.mean[i] = self.mean[i] + dataItems[i][u]
            self.mean[i] = self.mean[i] / len(dataItems[i])
    def predict(self, couplesTest):
        pred = np.zeros(len(couplesTest))
        for ind,c in enumerate(couplesTest):
            pred[ind] = self.mean[c[1]]
        return pred

In [9]:
model2 = baselineMeanItems()
model2.fit(trainItems)
pred = model2.predict(testCouples)
print "erreur en test:", ((pred - np.array(testCouples[:,2], float)) ** 2).mean()

erreur en test: 1.03931944836


## Factorisation de Matrices

Faire varier N la taille de l'espace

In [10]:
class matrixFactorisation():
    def __init__(self, k, lambd=0.2, eps=1e-5, maxIter=2000, alternate=0):
        self.k = k
        self.lambd = lambd
        self.eps = eps
        self.maxIter = maxIter
        self.alternate = alternate
    def fit(self, dataUsers, dataItems, couples):
        self.p = {}
        self.q = {}
        self.couples = couples
        self.loss = []
        optimP = True
        optimQ = (self.alternate == 0)
        for i in xrange(self.maxIter):
            loss = 0
            for j in xrange(len(couples)):
                r = np.random.randint(len(couples))
                user = couples[r][0]
                item = couples[r][1]
                if not user in self.p:
                    self.p[user] = np.random.rand(1,self.k)
                if not item in self.q:
                    self.q[item] = np.random.rand(self.k,1)
                tmp = dataUsers[user][item] - self.p[user].dot(self.q[item])[0][0]
                if (optimP):
                    self.p[user] = (1 - self.lambd * self.eps) * self.p[user] + self.eps * 2 * tmp * self.q[item].transpose()
                if (optimQ):
                    self.q[item] = (1 - self.lambd * self.eps) * self.q[item] + self.eps * 2 * tmp * self.p[user].transpose()
                loss = loss + tmp*tmp #Sans régularisation
            self.loss.append(loss)
            if (self.alternate != 0):
                if (i % self.alternate == 0):
                    optimP = optimQ
                    optimQ = 1 - optimQ
                    print i, loss / len(couples)
            else:
                if (i % 100 == 0):
                    print i, loss / len(couples)
    def predict(self, couplesTest):
        pred = np.zeros(len(couplesTest))
        for ind,c in enumerate(couplesTest):
            pred[ind] = self.p[c[0]].dot(self.q[c[1]])[0][0]
        return pred

In [18]:
model3 = matrixFactorisation(10, alternate=0)
model3.fit(trainUsers, trainItems, trainCouples)

0 2.829537706
100 1.30712001798
200 1.07948970774
300 0.997287439085
400 0.952657605705
500 0.917643772482
600 0.89365902144
700 0.888028692502
800 0.869048170694
900 0.868529460105
1000 0.858853615837
1100 0.863836741598
1200 0.852722256483
1300 0.842192524637
1400 0.841983968252
1500 0.844243257703
1600 0.837896616117
1700 0.837744282998
1800 0.832008944357
1900 0.83920613112


In [29]:
plt.figure()
plt.plot(model3.loss)
plt.show()

In [20]:
pred = model3.predict(testCouples)
print "Erreur de test:", ((pred - np.array(testCouples[:,2], float)) ** 2).mean()

Erreur de test: 0.909580818424


## Factorisation de Matrices avec biais

In [11]:
class matrixFactorisationBiais():
    def __init__(self, k, lambd=0.2, eps=1e-5, maxIter=10000, alternate=0):
        self.k = k
        self.lambd = lambd
        self.eps = eps
        self.maxIter = maxIter
        self.alternate = alternate
    def fit(self, dataUsers, dataItems, couples):
        self.p = {}
        self.q = {}
        self.bu = {}
        self.bi = {}
        self.mu = np.random.random() * 2 - 1
        self.loss = []
        optimP = True
        optimQ = (self.alternate == 0)
        for i in xrange(self.maxIter):
            loss = 0
            for j in xrange(len(couples)):
                r = np.random.randint(len(couples))
                user = couples[r][0]
                item = couples[r][1]
                if not user in self.p:
                    self.p[user] = np.random.rand(1,self.k) * 2 - 1
                    self.bu[user] = np.random.rand() * 2 - 1
                if not item in self.q:
                    self.q[item] = np.random.rand(self.k,1) * 2 - 1
                    self.bi[item] = np.random.rand() * 2 - 1
                tmp = dataUsers[user][item] - (self.mu + self.bi[item] + self.bu[user] + self.p[user].dot(self.q[item])[0][0])
                if (optimP):
                    self.p[user] = (1 - self.lambd * self.eps) * self.p[user] + self.eps * 2 * tmp * self.q[item].transpose()
                    self.bu[user] = (1 - self.lambd * self.eps) * self.bu[user] + self.eps * 2 * tmp
                if (optimQ):
                    self.q[item] = (1 - self.lambd * self.eps) * self.q[item] + self.eps * 2 * tmp * self.p[user].transpose()
                    self.bi[item] = (1 - self.lambd * self.eps) * self.bi[item] + self.eps * 2 * tmp
                self.mu = (1 - self.lambd * self.eps) * self.mu + self.eps * 2 * tmp
                loss = loss + tmp*tmp #Sans régularisation
            self.loss.append(loss)
            if (self.alternate != 0):
                if (i % self.alternate == 0):
                    optimP = optimQ
                    optimQ = 1 - optimQ
                    print i, loss / len(couples)
            else:
                if (i % 100 == 0):
                    print i, loss / len(couples)
    def predict(self, couplesTest):
        pred = np.zeros(len(couplesTest))
        for ind,c in enumerate(couplesTest):
            pred[ind] = self.mu + self.bu[c[0]] + self.bi[c[1]] + self.p[c[0]].dot(self.q[c[1]])[0][0]
        return pred

In [27]:
model4 = matrixFactorisationBiais(10, alternate=0)
model4.fit(trainUsers, trainItems, trainCouples)

0 7.84879153329
100 2.20841845373
200 1.78135161261
300 1.50665884754
400 1.36179780473
500 1.27271620043
600 1.18725411492
700 1.13627534893
800 1.08727524392
900 1.05073590425
1000 1.01654162391
1100 1.00560542752
1200 0.97646470343
1300 0.967124816417
1400 0.950623023452
1500 0.94110060945
1600 0.928282231649
1700 0.913576022519
1800 0.90368654631
1900 0.889919122869
2000 0.886632345112
2100 0.87637080938
2200 0.879517708617
2300 0.872767670014
2400 0.863631577612
2500 0.853651927641
2600 0.849095197608
2700 0.855620811393
2800 0.843541206755
2900 0.844477373661
3000 0.841569307323
3100 0.842851965166
3200 0.833419184501
3300 0.824305787357
3400 0.832219602661
3500 0.825525993355
3600 0.824484381541
3700 0.816336008193
3800 0.815159277787
3900 0.814879026143
4000 0.813612289994
4100 0.813574241683
4200 0.802972958589
4300 0.811059457145
4400 0.804668565967
4500 0.800480942454
4600 0.803039122674
4700 0.798049443498
4800 0.797483731818
4900 0.796329706711
5000 0.789174918466
5100 0.7

In [30]:
plt.figure()
plt.plot(model4.loss)
plt.show()

In [31]:
pred = model4.predict(testCouples)
print ((pred - np.array(testCouples[:,2], float)) ** 2).mean()

0.90047122114


# Biais Temporel

## Visualisition des notes en fonction du temps

In [18]:
def getTimeBins(couples, timedic, nbins):
    timestamps = np.zeros(len(couples))
    for i,c in enumerate(couples):
        timestamps[i] = timedic[c[0]][c[1]]
    time_bins = np.linspace(np.min(timestamps), np.max(timestamps), nbins+1)
    times = np.zeros(len(couples))
    for i in xrange(1,len(time_bins)):
        times = times + (timestamps > time_bins[i])
    return times

In [20]:
nbins = 5

times = getTimeBins(couples, timestamps, nbins)
ratings = np.array(np.array(couples)[:,2], float)
plt.figure()
for i in xrange(nbins):
    histi = np.bincount(np.array(ratings[times==i], int))
    plt.plot(1.* histi / histi.sum() , 'o-')
plt.show()
plt.close()

In [26]:
class matrixFactorisationBiaisTemporel():
    def __init__(self, k=10, ntimes=5, lambd=0.2, eps=1e-5, maxIter=10000, alternate=0):
        self.k = k
        self.ntimes = ntimes
        self.lambd = lambd
        self.eps = eps
        self.maxIter = maxIter
        self.alternate = alternate
    def fit(self, dataUsers, dataItems, couples, times):
        self.p = {}
        self.q = {}
        self.bu = {}
        self.bi = {}
        self.mu = np.random.rand(self.ntimes) * 2 - 1
        self.loss = []
        optimP = True
        optimQ = (self.alternate == 0)
        for i in xrange(self.maxIter):
            loss = 0
            for j in xrange(len(couples)):
                r = np.random.randint(len(couples))
                user = couples[r][0]
                item = couples[r][1]
                time = times[r]
                if not user in self.p:
                    self.p[user] = np.random.rand(1,self.k) * 2 - 1
                    self.bu[user] = np.random.rand(self.ntimes) * 2 - 1
                if not item in self.q:
                    self.q[item] = np.random.rand(self.k,1) * 2 - 1
                    self.bi[item] = np.random.rand(self.ntimes) * 2 - 1
                tmp = dataUsers[user][item] - (self.mu[time] + self.bi[item][time] + self.bu[user][time] + self.p[user].dot(self.q[item])[0][0])
                if (optimP):
                    self.p[user] = (1 - self.lambd * self.eps) * self.p[user] + self.eps * 2 * tmp * self.q[item].transpose()
                    self.bu[user] = (1 - self.lambd * self.eps) * self.bu[user] + self.eps * 2 * tmp
                if (optimQ):
                    self.q[item] = (1 - self.lambd * self.eps) * self.q[item] + self.eps * 2 * tmp * self.p[user].transpose()
                    self.bi[item] = (1 - self.lambd * self.eps) * self.bi[item] + self.eps * 2 * tmp
                self.mu = (1 - self.lambd * self.eps) * self.mu + self.eps * 2 * tmp
                loss = loss + tmp*tmp #Sans régularisation
            self.loss.append(loss)
            if (self.alternate != 0):
                if (i % self.alternate == 0):
                    optimP = optimQ
                    optimQ = 1 - optimQ
                    print i, loss / len(couples)
            else:
                if (i % 100 == 0):
                    print i, loss / len(couples)
    def predict(self, couplesTest, times):
        pred = np.zeros(len(couplesTest))
        for ind,c in enumerate(couplesTest):
            pred[ind] = self.mu[times[ind]] + self.bu[c[0]][times[ind]] + self.bi[c[1]][times[ind]] + self.p[c[0]].dot(self.q[c[1]])[0][0]
        return pred


In [None]:
model5 = matrixFactorisationBiaisTemporel(10, alternate=0)
model5.fit(trainUsers, trainItems, trainCouples, times)

0 7.38218694634
100 2.40772634594
200 2.04236542863
300 1.82604969996
400 1.66328178972
500 1.54301808579
600 1.46656009831
700 1.41664113998
800 1.34577848033
900 1.31440313527
1000 1.27721477175
1100 1.23938703736
1200

# Base 1M

In [None]:
def loadMovieLens1M(path='./data1m'):
    # Get movie titles
    movies={}
    for line in open(path+'/movies.dat'):
        id,title=line.split('::')[0:2]
        movies[id]=title
    # Load data
    prefs={}
    times={}
    for line in open(path+'/ratings.dat'):
        (user,movieid,rating,ts)=line.split('::')
        prefs.setdefault(user,{})
        prefs[user][movies[movieid]]=float(rating)
        times.setdefault(user,{})
        times[user][movies[movieid]]=float(ts)
    return prefs, times

In [None]:
data, timestamps = loadMovieLens1M()

In [None]:
couples = getCouplesUsersItems(data)

trainCouples, testCouples = splitTrainTest(couples,.20)

trainUsers = buildUsersDict(trainCouples)
trainItems = buildItemsDict(trainCouples)

toDel = []

for i,c in enumerate(testCouples):
    if not c[0] in trainUsers:
        toDel.append(i)
    elif not c[1] in trainItems:
        toDel.append(i)

testCouples = np.delete(testCouples, toDel, 0)

testUsers  = buildUsersDict(testCouples)
testItems  = buildItemsDict(testCouples)

#print len(trainUsers), len(testUsers)
#print len(trainItems), len(testItems)

In [None]:
nbins = 5

times = getTimeBins(couples, timestamps, nbins)
ratings = np.array(np.array(couples)[:,2], float)
plt.figure()
for i in xrange(nbins):
    histi = np.bincount(np.array(ratings[times==i], int))
    plt.plot(1.* histi / histi.sum() , 'o-')
plt.show()
plt.close()

In [None]:
model6 = baselineMeanUsers()
model6.fit(trainUsers)
pred = model6.predict(testCouples)
print "erreur en test:", ((pred - np.array(testCouples[:,2], float)) ** 2).mean()        

In [None]:
model7 = baselineMeanItems()
model7.fit(trainItems)
pred = model7.predict(testCouples)
print "erreur en test:", ((pred - np.array(testCouples[:,2], float)) ** 2).mean()        

In [None]:
model8 = matrixFactorisation(10, alternate=0, maxIter=1000)
model8.fit(trainUsers, trainItems, trainCouples)

In [None]:
plt.figure()
plt.plot(model8.loss)
plt.show()

In [None]:
pred = model8.predict(testCouples)
print ((pred - np.array(testCouples[:,2], float)) ** 2).mean()

In [None]:
model9 = matrixFactorisationBiais(10, alternate=0)
model9.fit(trainUsers, trainItems, trainCouples)

In [None]:
plt.figure()
plt.plot(model9.loss)
plt.show()

In [None]:
pred = model9.predict(testCouples)
print ((pred - np.array(testCouples[:,2], float)) ** 2).mean()