In [None]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from datetime import datetime
import os
from collections import defaultdict
import heapq
from operator import itemgetter
from math import sqrt
import pickle
import time

<h3 style = "color:blue"> Analyse exploratoire </h3>

<p> 
    Les données que nous allons utiliser dans cette étude proviennent du site web GroupLens de l'université du Minnesota, <a href="https://grouplens.org/datasets/movielens/">que vous pouvez visiter pour plus d'informations</a>. Dans cette étude, on a utilisé le jeu de données ml-100k.zip ; juste à côté se trouve le README, qui donne plus d'informations sur les données. Lisez-le pour comprendre les fichiers. Les deux fichiers qui nous intéressent ici sont : <span style="color:red"> u.data </span>, qui contient les notes attribuées aux films par les utilisateurs, et <span style="color:red"> u.item </span>, qui contient les informations sur les films.

</p>

In [None]:
# Sanity check: render a raw Unix timestamp as a local-time datetime.
print(datetime.fromtimestamp(388474259))

In [None]:
def load_reviews(path, **kwargs):
    """
    Lazily read a MovieLens ratings file and yield one dict per review.

    Each yielded row has 'userid', 'movieid' and 'rating' converted to int
    and 'timestamp' converted from a Unix timestamp to a local-time
    ``datetime``.

    Parameters:
        path:     path of the ratings file (tab-separated by default).
        **kwargs: extra ``csv.DictReader`` options; they override the
                  default 'fieldnames' and 'delimiter'.

    Yields:
        dict: one parsed review per line of the file.

    Raises:
        ValueError: if a numeric column cannot be converted.
    """
    # Default reader options; callers may override any of them.
    options = {
        'fieldnames': ('userid', 'movieid', 'rating', 'timestamp'),
        'delimiter': '\t',
    }
    options.update(kwargs)

    # newline='' lets the csv module do its own line-ending handling, so
    # newlines embedded in quoted fields are preserved untouched.
    with open(path, 'r', newline='') as reviews:
        reader = csv.DictReader(reviews, **options)
        for row in reader:
            # Integer columns arrive as strings.
            for key in ('userid', 'movieid', 'rating'):
                row[key] = int(row[key])
            # Unix timestamp -> datetime.
            row['timestamp'] = datetime.fromtimestamp(float(row['timestamp']))
            yield row
    
def relative_path(path):
    """
    Resolve ``path`` against the parent of the current working directory.

    A notebook has no ``__file__``, so the working directory stands in for
    "the directory of this code"; data files are expected one level above it.

    Parameters:
        path: a path relative to the parent of the working directory.
              An absolute path is returned normalised and otherwise unchanged.

    Returns:
        str: the normalised resulting path.
    """
    here = os.path.realpath(os.getcwd())
    # os.path.dirname strips the last component with the platform separator,
    # unlike splitting on a hard-coded '/'.
    parent = os.path.dirname(here)
    return os.path.normpath(os.path.join(parent, path))

def load_movied(path, **kwargs):
    """
    Lazily read a MovieLens movie file and yield one dict per movie.

    Each yielded row has 'movieid' converted to int, and 'release' and
    'video' parsed from ``DD-Mon-YYYY`` into ``datetime`` (``None`` when the
    field is empty). Columns beyond the five named ones are collected as a
    list of strings under the 'genre' key.

    Parameters:
        path:     path of the movie file ('|'-separated, latin-1 encoded).
        **kwargs: extra ``csv.DictReader`` options; they override the
                  defaults.

    Yields:
        dict: one parsed movie per line of the file.

    Raises:
        ValueError: if the id or a non-empty date cannot be parsed.
    """
    options = {
        'fieldnames': ('movieid', 'title', 'release', 'video', 'url'),
        'delimiter': '|',
        'restkey': 'genre',
    }
    options.update(kwargs)

    def parse_date(text):
        # Empty date columns are mapped to None instead of failing.
        return datetime.strptime(text, '%d-%b-%Y') if text else None

    # newline='' lets the csv module do its own line-ending handling, so
    # newlines embedded in quoted fields are preserved untouched.
    with open(path, 'r', encoding='latin-1', newline='') as movies:
        reader = csv.DictReader(movies, **options)
        for row in reader:
            row['movieid'] = int(row['movieid'])
            row['release'] = parse_date(row['release'])
            row['video'] = parse_date(row['video'])
            yield row
            
class MovieLens(object):
    """
    In-memory view of a MovieLens dataset with neighbourhood-based
    (user-user and item-item) similarity and rating-prediction helpers.

    Attributes:
        udata:   path handed to ``load_reviews`` (the ratings file).
        uitem:   path handed to ``load_movied`` (the movie file).
        movies:  dict mapping movieid -> movie row.
        reviews: defaultdict mapping userid -> {movieid -> review row}.
    """

    def __init__(self, udata, uitem):
        """Remember both file paths and load them into memory immediately."""
        self.udata = udata
        self.uitem = uitem
        self.movies = {}
        self.reviews = defaultdict(dict)
        self.load_dataset()

    def load_dataset(self):
        """Load every movie and every review into ``movies`` / ``reviews``."""
        for movie in load_movied(self.uitem):
            self.movies[movie['movieid']] = movie
        for review in load_reviews(self.udata):
            self.reviews[review['userid']][review['movieid']] = review

    def reviews_for_movie(self, movieid):
        """Yield every stored review row for ``movieid`` (one per user who rated it)."""
        for review in self.reviews.values():
            if movieid in review:
                yield review[movieid]

    def average_reviews(self):
        """
        Yield ``(movieid, mean rating, number of reviews)`` for every movie.

        A movie without any review yields a mean of 0.0 instead of raising
        ZeroDivisionError.
        """
        for movieid in self.movies:
            ratings = [r["rating"] for r in self.reviews_for_movie(movieid)]
            average = sum(ratings) / float(len(ratings)) if ratings else 0.0
            yield (movieid, average, len(ratings))

    def top_rated(self, n=10):
        """
        Return the ``n`` movies with the highest Bayesian average, as a list
        of ``(movieid, average, number of reviews)`` tuples, best first.
        """
        return heapq.nlargest(n, self.bayesian_average(), key=itemgetter(1))

    def bayesian_average(self, c=59, m=3):
        """
        Yield ``(movieid, Bayesian average, number of reviews)`` for every movie.

        The average is ``(c*m + sum(ratings)) / (c + len(ratings))``: ``m`` is
        the prior rating and ``c`` the weight (in pseudo-reviews) given to it,
        so movies with few reviews are pulled towards ``m``.
        """
        for movieid in self.movies:
            ratings = [r["rating"] for r in self.reviews_for_movie(movieid)]
            average = (c * m + sum(ratings)) / float(c + len(ratings))
            yield (movieid, average, len(ratings))

    def shared_preference(self, criticsA, criticsB):
        """
        Return ``{movieid: (rating by A, rating by B)}`` for every movie that
        both users rated.

        Raises:
            KeyError: if either user id is unknown.
        """
        if criticsA not in self.reviews:
            raise KeyError("Couldn't find critics {} in data".format(criticsA))
        if criticsB not in self.reviews:
            raise KeyError("Couldn't find critics {} in data".format(criticsB))

        reviewsA = self.reviews[criticsA]
        reviewsB = self.reviews[criticsB]
        shared = set(reviewsA) & set(reviewsB)

        return {
            movieid: (reviewsA[movieid]['rating'], reviewsB[movieid]['rating'])
            for movieid in shared
        }

    def shared_critics(self, movieA, movieB):
        """
        Return ``{userid: (rating of A, rating of B)}`` for every user who
        rated both movies.

        Raises:
            KeyError: if either movie id is unknown.
        """
        if movieA not in self.movies:
            raise KeyError("could not find {} in data".format(movieA))
        if movieB not in self.movies:
            raise KeyError("could not find {} in data".format(movieB))

        return {
            critic: (rated[movieA]["rating"], rated[movieB]["rating"])
            for critic, rated in self.reviews.items()
            if movieA in rated and movieB in rated
        }

    def _shared_ratings(self, idA, idB, prefs):
        """Return the paired ratings of two users ('user') or two movies ('movies')."""
        if prefs == "user":
            return self.shared_preference(idA, idB)
        if prefs == "movies":
            return self.shared_critics(idA, idB)
        raise KeyError("Unknown preference type {}".format(prefs))

    def euclidean_distance(self, criticsA, criticsB, prefs="user"):
        """
        Return a similarity in (0, 1] derived from the Euclidean distance
        between the paired ratings: ``1 / (1 + distance)``.

        Parameters:
            criticsA, criticsB: two user ids (``prefs='user'``) or two movie
                                ids (``prefs='movies'``).
            prefs:              which kind of ids is being compared.

        Returns:
            0 when the two have no rating in common, otherwise the similarity.

        Raises:
            KeyError: for an unknown id or an unknown ``prefs`` value.
        """
        preferences = self._shared_ratings(criticsA, criticsB, prefs)
        if len(preferences) == 0:
            return 0
        sum_of_squares = sum(pow(a - b, 2) for a, b in preferences.values())
        return 1 / (1 + sqrt(sum_of_squares))

    def pearson_correlation(self, criticsA, criticsB, prefs="user"):
        """
        Return the absolute value of the Pearson correlation between the
        paired ratings of two users or two movies.

        Parameters:
            criticsA, criticsB: two user ids (``prefs='user'``) or two movie
                                ids (``prefs='movies'``).
            prefs:              which kind of ids is being compared.

        Returns:
            0 when there is no shared rating or when either side has zero
            variance, otherwise a value in [0, 1].

        Raises:
            KeyError: for an unknown id or an unknown ``prefs`` value.
        """
        preferences = self._shared_ratings(criticsA, criticsB, prefs)

        length = len(preferences)
        if length == 0:
            return 0

        sumA = sumB = sumSquaresA = sumSquaresB = sumProducts = 0
        for a, b in preferences.values():
            sumA += a
            sumB += b
            sumSquaresA += pow(a, 2)
            sumSquaresB += pow(b, 2)
            sumProducts += a * b

        numerator = (sumProducts * length) - (sumA * sumB)
        denominator = sqrt(
            ((sumSquaresA * length) - pow(sumA, 2))
            * ((sumSquaresB * length) - pow(sumB, 2))
        )

        if denominator == 0:
            return 0
        # NOTE(review): abs() discards the sign, so strongly opposite tastes
        # score as strongly similar. Kept as-is because the prediction methods
        # use this value as a non-negative weight - confirm this is intended.
        return abs(numerator / denominator)

    def similar_critics(self, user, metric='euclidean', n=None):
        """
        Score every other user by similarity to ``user``.

        Parameters:
            user:   the reference user id.
            metric: 'euclidean' or 'pearson'.
            n:      if given, return only the ``n`` most similar users.

        Returns:
            ``{userid: similarity}`` when ``n`` is falsy, otherwise a list of
            ``(userid, similarity)`` tuples, most similar first.

        Raises:
            KeyError: for an unknown user or an unknown metric.
        """
        metrics = {
            'euclidean': self.euclidean_distance,
            'pearson': self.pearson_correlation,
        }
        distance = metrics.get(metric, None)

        if user not in self.reviews:
            raise KeyError("Unknown user, {}".format(user))
        if not distance or not callable(distance):
            raise KeyError("Unknown or unprogrammed distance metric {}".format(metric))

        critics = {
            critic: distance(user, critic)
            for critic in self.reviews
            if critic != user
        }

        if n:
            return heapq.nlargest(n, critics.items(), key=itemgetter(1))
        return critics

    def predict_rating(self, user, movie, metric="euclidean", critics=None):
        """
        Predict ``user``'s rating of ``movie`` as the similarity-weighted mean
        of the ratings given to that movie by the other users.

        Parameters:
            user:    the user to predict for.
            movie:   the movie id.
            metric:  similarity metric used when ``critics`` is not supplied.
            critics: optional precomputed ``{userid: similarity}`` mapping.

        Returns:
            The predicted rating, or 0.0 when no similar user rated the movie.
        """
        critics = critics or self.similar_critics(user, metric=metric)
        total = 0
        simsum = 0

        for critic, similarity in critics.items():
            # .get avoids inserting unknown ids into the defaultdict.
            rated = self.reviews.get(critic, {})
            if movie in rated:
                total += similarity * rated[movie]['rating']
                simsum += similarity
        if simsum == 0.0:
            return 0.0
        return total / simsum

    # Original (misspelled) name, kept so existing callers keep working.
    predict_raking = predict_rating

    def predict_all_rankings(self, user, metric="euclidean", n=None):
        """
        Predict ``user``'s rating for every movie.

        Returns:
            ``{movieid: predicted rating}`` when ``n`` is falsy, otherwise a
            list of the ``n`` best ``(movieid, predicted rating)`` tuples.
        """
        # Compute the similarities once and reuse them for every movie.
        critics = self.similar_critics(user, metric=metric)
        movies = {
            movie: self.predict_rating(user, movie, metric, critics)
            for movie in self.movies
        }

        if n:
            return heapq.nlargest(n, movies.items(), key=itemgetter(1))
        return movies

    def similar_items(self, movie, metric="euclidean", n=None):
        """
        Score every other movie by similarity to ``movie``, comparing the
        ratings given by users who rated both.

        Parameters:
            movie:  the reference movie id.
            metric: 'euclidean' or 'pearson'.
            n:      if given, return only the ``n`` most similar movies.

        Returns:
            ``{movieid: similarity}`` when ``n`` is falsy, otherwise a list of
            ``(movieid, similarity)`` tuples, most similar first.

        Raises:
            KeyError: for an unknown movie or an unknown metric.
        """
        metrics = {
            'euclidean': self.euclidean_distance,
            'pearson': self.pearson_correlation,
        }
        distance = metrics.get(metric, None)

        # Movie ids live in self.movies; self.reviews is keyed by user id.
        if movie not in self.movies:
            raise KeyError("Unknown movie {}".format(movie))
        if not distance or not callable(distance):
            raise KeyError("Unknow or not programmed distance {}".format(metric))

        items = {
            item: distance(item, movie, prefs='movies')
            for item in self.movies
            if item != movie
        }
        if n:
            return heapq.nlargest(n, items.items(), key=itemgetter(1))
        return items

    def predict_ranking_(self, user, movie, metric="euclidean"):
        """
        Item-based prediction: ``user``'s rating of ``movie`` as the
        similarity-weighted mean of the user's own ratings of other movies.

        Returns:
            The predicted rating, or 0.0 when the user rated no similar movie
            (including when the user is unknown).
        """
        movies = self.similar_items(movie, metric=metric)
        # .get keeps an unknown user from being inserted into the defaultdict.
        rated = self.reviews.get(user, {})
        total = 0
        simsum = 0

        for relmovie, similarity in movies.items():
            if relmovie in rated:
                total += similarity * rated[relmovie]['rating']
                simsum += similarity
        if simsum == 0.0:
            return 0.0
        return total / simsum
        
class Recommender(object):
    """
    Matrix-factorisation recommender built on a dense user x movie matrix.

    Attributes:
        udata:        path of the ratings file handed to ``load_reviews``.
        description:  optional free-text description of the model.
        users:        sorted list of user ids (row order of ``reviews``).
        movies:       sorted list of movie ids (column order of ``reviews``).
        reviews:      numpy array of ratings, 0 meaning "not rated".
        model:        reconstructed rating matrix, set by ``build()``.
        build_start, build_finish: ``time.time()`` stamps around ``build()``.
        features, steps, alpha, beta: hyper-parameters passed to ``factor``.
    """

    @classmethod
    def load(k_class, pickle_path):
        """Return the object previously written to ``pickle_path`` by ``dump``."""
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that this notebook produced itself.
        with open(pickle_path, 'rb') as pkl:
            return pickle.load(pkl)

    def __init__(self, udata, description = None):
        """Store the settings and load the ratings matrix from ``udata``."""
        self.udata = udata
        self.users = None
        self.reviews = None
        self.movies = None

        self.build_start = None
        self.build_finish = None
        self.description = description

        self.model = None
        self.features = 2
        self.steps = 5000
        self.alpha = 0.0002
        self.beta = 0.02

        self.load_dataset()

    def dump(self, pickle_path):
        """Pickle this recommender to ``pickle_path`` (overwriting the file)."""
        # 'wb': pickle needs a writable binary file.
        with open(pickle_path, 'wb') as pkl:
            pickle.dump(self, pkl)

    def load_dataset(self):
        """Read the ratings once and fill ``users``, ``movies`` and ``reviews``."""
        ratings = [
            (review['userid'], review['movieid'], review['rating'])
            for review in load_reviews(self.udata)
        ]

        self.users = sorted({uid for uid, _, _ in ratings})
        self.movies = sorted({mid for _, mid, _ in ratings})

        # id -> matrix index lookups, instead of an O(n) list.index per review.
        user_row = {uid: row for row, uid in enumerate(self.users)}
        movie_col = {mid: col for col, mid in enumerate(self.movies)}

        self.reviews = np.zeros(shape = (len(self.users), len(self.movies)))
        for uid, mid, rating in ratings:
            self.reviews[user_row[uid], movie_col[mid]] = rating

    def sparsity(self):
        """Return the fraction of the ratings matrix that is zero (unrated)."""
        return 1 - self.density()

    def density(self):
        """Return the fraction of the ratings matrix that is non-zero (rated)."""
        nonzero = float(np.count_nonzero(self.reviews))
        return nonzero/self.reviews.size

    def build(self, output = None):
        """
        Factorise the ratings matrix and store the reconstructed matrix.

        Parameters:
            output: optional path; when given, the recommender is pickled
                    there once the model is built.
        """
        options = {
            'K':self.features,
            'steps':self.steps,
            'alpha':self.alpha,
            'beta':self.beta
        }

        self.build_start = time.time()
        self.P, self.Q = factor(self.reviews, **options)
        # predict_ranking/top_rated read ``self.model``.
        self.model = np.dot(self.P, self.Q.T)
        # Previous attribute name, kept as an alias for existing callers.
        self.mfModel = self.model
        self.build_finish = time.time()

        if output:
            self.dump(output)

    def _prediction_matrix(self):
        """Return the reconstructed rating matrix, or raise if none was built."""
        model = self.model
        if model is None:
            # Earlier versions of build() stored the matrix only as mfModel.
            model = getattr(self, 'mfModel', None)
        if model is None:
            raise RuntimeError("Model has not been built; call build() first")
        return model

    def predict_ranking(self, user, movie):
        """
        Return the predicted rating of ``movie`` for ``user``.

        Returns:
            ``None`` when the user already rated the movie, otherwise the
            corresponding entry of the reconstructed matrix.

        Raises:
            ValueError:   if the user or movie id is unknown.
            RuntimeError: if ``build()`` has not been run.
        """
        uidx  = self.users.index(user)
        midx  = self.movies.index(movie)

        if self.reviews[uidx, midx] > 0:
            return None
        return self._prediction_matrix()[uidx, midx]

    def top_rated(self, user, n = 12):
        """
        Return the ``n`` best predictions for ``user`` among unrated movies.

        Returns:
            list of ``(movieid, predicted rating)`` tuples, best first.

        Raises:
            ValueError:   if the user id is unknown.
            RuntimeError: if ``build()`` has not been run.
        """
        uidx = self.users.index(user)
        model = self._prediction_matrix()
        # Movies the user already rated have no prediction and are skipped.
        candidates = (
            (mid, float(model[uidx, midx]))
            for midx, mid in enumerate(self.movies)
            if not self.reviews[uidx, midx] > 0
        )
        return heapq.nlargest(n, candidates, key = itemgetter(1))

def initialize(R, K):
    """
    Draw random starting factors for a two-dimensional ratings matrix.

    Parameters:
        R: array whose ``shape`` is ``(N, M)``.
        K: number of latent features.

    Returns:
        tuple: an ``(N, K)`` array followed by an ``(M, K)`` array, both
        filled by ``np.random.rand``.
    """
    n_rows, n_cols = R.shape
    # Row factors are drawn first, then column factors.
    return tuple(np.random.rand(size, K) for size in (n_rows, n_cols))

def factor(R, P=None, Q=None, K = 2, steps = 5000, alpha = 0.0002, beta = 0.02):
    """
    Factorise ``R`` into ``P`` and ``Q`` with regularised stochastic gradient
    descent, so that ``np.dot(P, Q.T)`` approximates the non-zero cells of ``R``.

    Parameters:
        R:     2-D numpy array of ratings; cells <= 0 are treated as unknown
               and ignored.
        P:     optional ``(N, K)`` starting row factors.
        Q:     optional ``(M, K)`` starting column factors.
        K:     number of latent features.
        steps: maximum number of passes over the observed cells.
        alpha: learning rate.
        beta:  regularisation weight.

    Returns:
        tuple: ``(P, Q)`` as float arrays of shape ``(N, K)`` and ``(M, K)``.
        Arrays supplied by the caller are copied, not modified.
    """
    # ``not P`` is ambiguous for arrays, so test explicitly for None.
    if P is None or Q is None:
        init_P, init_Q = initialize(R, K)
        P = init_P if P is None else P
        Q = init_Q if Q is None else Q

    # Work on float copies: the caller's arrays stay untouched and integer
    # inputs cannot truncate the updates.
    P = np.array(P, dtype=float)
    Q = np.array(Q, dtype=float).T

    # The set of observed cells never changes, so locate it once instead of
    # rescanning the whole matrix twice per step (same row-major order).
    observed = [(int(i), int(j)) for i, j in np.argwhere(np.asarray(R) > 0)]

    for _ in range(steps):
        for i, j in observed:
            eij = R[i, j] - np.dot(P[i, :], Q[:, j])
            for k in range(K):
                P[i, k] = P[i, k] + alpha * (2 * eij * Q[k, j] - beta * P[i, k])
                # Uses the P[i, k] just updated on the previous line.
                Q[k, j] = Q[k, j] + alpha * (2 * eij * P[i, k] - beta * Q[k, j])

        # Regularised squared error over the observed cells.
        e = 0
        for i, j in observed:
            e = e + pow(R[i, j] - np.dot(P[i, :], Q[:, j]), 2)
            for k in range(K):
                e = e + (beta / 2) * (pow(P[i, k], 2) + pow(Q[k, j], 2))
        if e < 0.001:
            break
    return P, Q.T
    
# Locate the MovieLens files; relative_path resolves them against the parent
# of the working directory.
data  = relative_path('Data/ml-100k/u.data')
item  = relative_path('Data/ml-100k/u.item')
# Neighbourhood-based model: loads movies and reviews into memory.
model = MovieLens(data, item)
# Matrix-factorisation model: loads the ratings matrix, then factorises it.
# Passing a path to build() also pickles the recommender to that file.
mfModel = Recommender(data)         
mfModel.build('reccord.pickle')

<h3 style = "color:blue"> Recherche des films les mieux notés </h3>

In [None]:

#for mid, avg, num in model.top_rated(20):
    #title =model.movies[mid]["title"]
    #print("[%0.3f averagae rating (%i reviews)] %s" %( avg, num, title))

In [None]:
#for mid, avg, num in model.top_rated(20):
    #title =model.movies[mid]["title"]
    #print("[%0.3f averagae rating (%i reviews)] %s" %( avg, num, title))

In [None]:
# Similarity of users 532 and 232 over the movies both rated:
# 1 / (1 + Euclidean distance), or 0 when they share no movie.
print(model.euclidean_distance(532, 232))

In [None]:
# Absolute Pearson correlation of users 532 and 232 over the movies both rated.
print(model.pearson_correlation(532, 232))

In [None]:
#for item in model.similar_critics(232, metric="euclidean", n = 10):
    #print ( "%4i: %0.3f" % item)

In [None]:
#for item in model.similar_critics(232, metric="pearson", n = 10):
    #print ( "%4i: %0.3f" % item)

In [None]:
# User-based prediction of user 422's rating for movie 50, weighting the
# other users' ratings by Euclidean similarity.
print ( model.predict_rating(422, 50 , metric = "euclidean"))

In [None]:
# Same prediction, weighting the other users' ratings by Pearson similarity.
print( model.predict_rating(422, 50, metric = "pearson"))

In [None]:
#for mid, rating in model.predict_all_rankings(578, 'pearson', 10) :
    #print("%0.3f  %s" %(rating, model.movies[mid]['title']))

In [None]:
#model.shared_preference(232, 532)

In [None]:
#for movie, simillarity in model.similar_items(631, 'pearson').items():
    #print("%0.3f :%s" % (simillarity, model.movies[movie]["title"]))

In [None]:
# Item-based prediction of user 232's rating for movie 52: the user's own
# ratings of other movies, weighted by each movie's Pearson similarity to 52.
print(model.predict_ranking_(232, 52, 'pearson'))

In [None]:
# Share of the user x movie ratings matrix that is empty (sparsity) and
# filled (density); the two add up to 1.
print("{} spare".format(mfModel.sparsity()))
print("%0.3f dense" % mfModel.density())

In [None]:
# Reload the recommender pickled by mfModel.build('reccord.pickle') and print
# its best predictions for user 234. The file name must match the one
# passed to build().
rec = Recommender.load('reccord.pickle')
for item in rec.top_rated(234):
    print("%i: %0.3f" %item)