# TME1 - Recommendation Sociale

## Imports

In [1]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import pickle as pkl

## Chargement des données

La fonctions pour charger la base Movie Lens 100k.
On récupère:
- Les scores: Une liste de triplet [user, movie, rating]
- Les liens: Une liste de triplet [source, target, weight]

In [2]:
def loadMovieLens(path='./data', dataFile='/u.data', dataFileTest=None):
    ratings = []
    ratings_test = []
    links = []
    users = set()
    movies = set()
    for line in open(path+dataFile):
        (userId,movieId,rating,ts)=line.split('\t')
        ratings.append([userId, movieId, float(rating)/5])
        users.add(userId)
        movies.add(movieId)
    for line in open(path+'/u.links'):
        l = line[:-1].split('\t')
        source = l[0]
        if source in users:
            for target in l[1:]:
                if target in users:
                    links.append([source, target, 1])
    if (dataFileTest):
        for line in open(path+dataFileTest):
            (userId,movieId,rating,ts)=line.split('\t')
            if (userId in users) and (movieId in movies):
                ratings_test.append([userId, movieId, float(rating)/5])
    return links, ratings, ratings_test

## Factorisation matricielle

In [3]:
class matrixFactorisation():
    def __init__(self, k, lambdaC=0.2, lambdaU=0.2, lambdaV=0.2, lambdaZ=0.2, eps=1e-5, maxIter=2000):
        self.k = k
        self.lambdaC = lambdaC
        self.lambdaU = lambdaU
        self.lambdaV = lambdaV
        self.lambdaZ = lambdaZ
        self.eps = eps
        self.maxIter = maxIter
    def fit(self, tripletsUsersItems, tripletsLinks):
        self.u = {}
        self.v = {}
        self.z = {}
        self.loss = []
        #Choix du paramètre a optimisé en cas d'optimisation alternée
        for i in xrange(self.maxIter):
            lossUV = 0
            lossUZ = 0
            lossReg = 0
            for j in xrange(len(tripletsUsersItems)):
                # Ratings --------------------------------------------------------------------------------------------
                r = np.random.randint(len(tripletsUsersItems)) 
                user =   tripletsUsersItems[r][0]
                item =   tripletsUsersItems[r][1]
                rating = tripletsUsersItems[r][2]
                if not user in self.u:
                    self.u[user] = np.random.rand(1,self.k)
                if not item in self.v:
                    self.v[item] = np.random.rand(self.k,1)
                expUV = np.exp(self.u[user].dot(self.v[item])[0][0])
                logistiqueUV = (1.0/(1 + expUV))
                tmp = logistiqueUV - rating
                self.u[user] = self.u[user] - self.eps * tmp * expUV * (logistiqueUV **2) * self.v[item].transpose()
                self.v[item] = self.v[item] - self.eps * tmp * expUV * (logistiqueUV **2) * self.u[user].transpose()
                lossUV = lossUV + tmp*tmp/2. 
                # Links ---------------------------------------------------------------------------------------------
                r = np.random.randint(len(tripletsLinks))
                userSource = tripletsLinks[r][0]
                userTarget = tripletsLinks[r][1]
                linkScore  = tripletsLinks[r][2]
                if not userSource in self.u:
                    self.u[userSource] = np.random.rand(1,self.k)
                if not userTarget in self.z:
                    self.z[userTarget] = np.random.rand(self.k,1)
                expUZ = np.exp(self.u[userSource].dot(self.z[userTarget])[0][0])
                logistiqueUZ = (1.0/(1 + expUZ))
                tmp = logistiqueUZ - linkScore
                self.u[userSource] = self.u[userSource] - self.eps * tmp * expUZ * (logistiqueUZ **2) * self.z[userTarget].transpose()
                self.z[userTarget] = self.z[userTarget] - self.eps * tmp * expUZ * (logistiqueUZ **2) * self.u[userSource].transpose()
                lossUZ = lossUZ + tmp*tmp/2. 
                # Regularize  --------------------------------------------------------------------------------------
                ru = np.random.choice(self.u.keys());
                rv = np.random.choice(self.v.keys());
                rz = np.random.choice(self.z.keys());
                self.u[ru] = self.u[ru] * (1 - self.lambdaU * self.eps)
                self.v[rv] = self.v[rv] * (1 - self.lambdaV * self.eps)
                self.z[rz] = self.z[rz] * (1 - self.lambdaZ * self.eps)
                lossReg = lossReg + np.sqrt((self.u[ru]**2).sum()) + np.sqrt((self.v[rv]**2).sum()) + np.sqrt((self.z[rz]**2).sum())
            self.loss.append([lossUV, lossUZ, lossReg])
            if (i % 1 == 0):
                print i, (lossUV + lossUZ + lossReg) / len(tripletsUsersItems)
    def predict(self, tripletsUsersItems):
        pred = np.zeros(len(tripletsUsersItems))
        for ind,c in enumerate(tripletsUsersItems):
            pred[ind] = self.u[c[0]].dot(self.v[c[1]])[0][0]
        return pred

# Tests les données Movie Lens 100k

Les données Movie Lens 100k comprennent 100 000 scores données par 1000 utilisateurs sur 1700 films.

## Préparation des données

On extrait aléatoirement une portion (20%) des données pour constituer la base de test, et le reste sera utilisé en apprentissage.

Comme on ne souhaite ne pas évaluer les objets et les utilisateurs qui n'ont jamais été rencontré en apprentissage, on retire les couples correspondants de l'ensemble de test.

Reste ensuite à reconstruire les deux dictionnaires a partir de ces liste de couples.

In [4]:
# Chargement
links, ratings_train, ratings_test = loadMovieLens(dataFile="/u1.train", dataFileTest="/u1.test")

In [5]:
model = matrixFactorisation(5, eps=1e-2, maxIter=50)
model.fit(ratings_train, links)

0 4.13368725017
1 3.95543584301
2 3.80071924069
3 3.65883941656
4 3.53270103102
5 3.42076618068
6 3.30667844923
7 3.22176187356
8 3.1391312072
9 3.06442500201
10 2.99704559005
11 2.94184376321
12 2.88950101126
13 2.8412886297
14 2.79295786529
15 2.7596006539
16 2.72443339659
17 2.69199660864
18 2.66311779102
19 2.6402213719
20 2.61699512739
21 2.59184691913
22 2.57058396232
23 2.55288723649
24 2.53631473833
25 2.52100451029
26 2.50895046003
27 2.50073438692
28 2.48680951522
29 2.47118760602
30 2.4600269524
31 2.45950511349
32 2.45165461334
33 2.44605642771
34 2.43963941205
35 2.43245932583
36 2.42224783115
37 2.42499054187
38 2.42305965261
39 2.41358347574
40 2.40989835035
41 2.40572543299
42 2.40156826487
43 2.39336146403
44 2.39260156864
45 2.39841615518
46 2.38834056621
47 2.38736924199
48 2.38839322096
49 2.38818461937


In [6]:
pred = model.predict(ratings_test)
print "Erreur de test:", ((pred - np.array(np.array(ratings_test)[:,2], float)) ** 2).mean()

Erreur de test: 0.602970871829
