### Import des librairies et lecture des fichiers


Est-ce qu'on n'ajouterait pas les keywords ?

In [196]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import re
import time

In [197]:
# Read the three raw CSV files in one pass; the names are bound in the same
# order as the file names listed below.
movies, ratings, keywords = (
    pd.read_csv(csv_name)
    for csv_name in ("movies_metadata.csv", "ratings_small.csv", "keywords.csv")
)

### Nettoyage de la base de données et réduction de la matrice aux caractéristiques intéressantes

Suppression des id incorrects, des valeurs aberrantes et des lignes contenant des NaN, puis modification des valeurs pour les rendre plus faciles à traiter.

On sélectionne les attributs de films qui semblent pertinents pour différencier les films sur leur contenu.
Ces choix sont arbitraires et nous pourrons être amenés à y réfléchir et à les modifier. Nous aurions voulu compléter notre base de données pour obtenir plus d'attributs (en particulier réalisateur, casting, mots-clés), mais nous avons finalement dû nous résigner à ne pas garder trop d'attributs pour limiter le temps de calcul.

In [198]:
def filter_correct_id(word):
    """Return `word` unchanged when it is usable as an id, else "wrong_id".

    Non-string values pass through untouched. A string is accepted only when
    it is made exclusively of the digits 0-9 (at least one digit).
    """
    if isinstance(word, str) and re.fullmatch(r'[0-9]+', word) is None:
        return "wrong_id"
    return word

In [199]:
# Keep only the first row seen for each id, in both tables.
movies = movies.loc[~movies.id.duplicated()]
keywords = keywords.loc[~keywords.id.duplicated()]
# Flag malformed ids through filter_correct_id, discard the flagged rows,
# then store ids and budgets as 64-bit integers.
movies = movies.assign(id=movies.id.apply(filter_correct_id))
movies = movies.loc[movies.id != "wrong_id"]
movies = movies.assign(
    id=movies.id.astype('int64'),
    budget=movies.budget.astype('int64'),
)

Nous ne voulons garder que les films ayant reçu une note. C'est une manière de ne garder qu'un nombre limité de films (il est très compliqué pour nous d'effectuer des calculs sur 45 000 films) et cela pourrait être utile dans le cadre de la recommandation « user-based ».

In [200]:
# Use the same key name ('movieId') as the ratings file, and remove fully
# duplicated keyword rows.
keywords = keywords.rename(columns={'id': 'movieId'}).drop_duplicates()
# NOTE(review): from here on `ratings` is no longer the ratings table but a
# Series holding each rated movieId once, so this cell cannot be re-run as is.
ratings = ratings.movieId.drop_duplicates()
# Keep only the movies that were rated AND that have a keywords entry.
movies = (
    movies.rename(columns={'id': 'movieId'})
    .merge(ratings, how='inner')
    .join(keywords.set_index('movieId'), on='movieId', how='inner')
)

In [201]:
# Number of movies left after the two inner joins.
len(movies)

2830

In [202]:
# First rows of the keywords table (keywords are still raw strings here).
keywords.head()

Unnamed: 0,movieId,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [203]:
# Columns available after the joins; 'keywords' comes from the keywords table.
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'movieId', 'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'keywords'],
      dtype='object')

In [204]:
# Attributes used to compare movies; any row with a missing value in one of
# them is discarded.
cluster_features = movies[
    ['movieId', 'genres', 'keywords', 'release_date', 'production_countries',
     'original_language', 'runtime', 'budget']
].dropna()

In [205]:
# Selected features before any parsing: list-like columns are still raw strings.
cluster_features.head()

Unnamed: 0,movieId,genres,keywords,release_date,production_countries,original_language,runtime,budget
0,949,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","[{'id': 642, 'name': 'robbery'}, {'id': 703, '...",1995-12-15,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,170.0,60000000
1,710,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...","[{'id': 701, 'name': 'cuba'}, {'id': 769, 'nam...",1995-11-16,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",en,130.0,58000000
2,1408,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'id': 911, 'name': 'exotic island'}, {'id': ...",1995-12-22,"[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",en,119.0,98000000
3,524,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...","[{'id': 383, 'name': 'poker'}, {'id': 726, 'na...",1995-11-22,"[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",en,178.0,52000000
4,4584,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...","[{'id': 420, 'name': 'bowling'}, {'id': 818, '...",1995-12-13,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",en,136.0,16500000


In [206]:
# How many movies have a runtime recorded as 0 minutes (22 in the saved run).
print((cluster_features["runtime"] == 0).sum())
# A feature film lasts at least 40 minutes: keep only longer movies, which
# also removes the aberrant runtimes.
cluster_features = cluster_features.loc[cluster_features["runtime"] > 40]

22


In [207]:
print(cluster_features.shape[0])
# Many movies have no budget filled in: it is stored as 0.
print((cluster_features["budget"] == 0).sum())

# TODO: also count the movies without any genre or without any keyword.
# Presumably an empty list is still the literal string '[]' at this stage,
# so comparing the raw column to '[]' should work -- to be verified.

2765
1507


In [208]:
def vectorize_genres(genres):
    """Extract the numeric ids found in a raw genres string.

    Every ``'id': <digits>`` fragment of the string is located and its number
    is collected, in order of appearance, into a NumPy array. Anything that is
    not a string is returned unchanged.
    """
    if not isinstance(genres, str):
        return genres
    ids = []
    for fragment in re.findall(r"'id': [0-9]*", genres):
        # Skip the 6-character prefix "'id': " to keep only the digits.
        ids.append(int(fragment[6:]))
    return np.array(ids)

# Replace each raw genres string by its array of genre ids.
cluster_features["genres"] = cluster_features["genres"].apply(vectorize_genres)

In [209]:
def vectorize_keywords(keywords):
    """Turn a raw keywords string into a NumPy array of keyword ids.

    The ids are the numbers following each ``'id': `` marker, kept in their
    order of appearance. Non-string input is handed back as is.
    """
    if isinstance(keywords, str):
        id_marker = re.compile(r"'id': [0-9]*")
        # Each match starts with the 6 characters "'id': "; the rest is the id.
        return np.array([int(match.group()[6:]) for match in id_marker.finditer(keywords)])
    return keywords

# Replace each raw keywords string by its array of keyword ids.
cluster_features["keywords"] = cluster_features["keywords"].apply(vectorize_keywords)

In [210]:
def simplify_date(date):
    """Reduce a date string to the integer formed by its first four characters.

    For a string such as "1995-12-15" this is the year (1995). Values that are
    not strings are returned untouched.
    """
    if not isinstance(date, str):
        return date
    year_text = date[:4]
    return int(year_text)

# Keep only the release year of each movie.
cluster_features["release_date"] = cluster_features["release_date"].apply(simplify_date)

In [211]:
def simplify_countries(countries):
    """Extract the country codes from a raw production_countries string.

    Each ``'iso_3166_1': 'XX`` fragment is located and the two characters after
    the opening quote are kept, in order of appearance.

    Parameters
    ----------
    countries : str or any
        Raw string such as "[{'iso_3166_1': 'US', 'name': ...}]".

    Returns
    -------
    list of str for string input; otherwise the input itself, unchanged
    (same convention as the other parsing helpers of this notebook).
    """
    if isinstance(countries, str):
        pattern = re.compile(r"'iso_3166_1': ...")
        # 15 = 14 characters of "'iso_3166_1': " + the opening quote of the code.
        return [w[15:] for w in pattern.findall(countries)]
    # The original returned the undefined name `genres` here, which raised a
    # NameError for any non-string value.
    return countries

# Replace each raw production_countries string by its list of country codes.
cluster_features["production_countries"] = cluster_features["production_countries"].apply(simplify_countries)

In [212]:
# Same rows once genres, keywords, release date and countries have been parsed.
cluster_features.head()

Unnamed: 0,movieId,genres,keywords,release_date,production_countries,original_language,runtime,budget
0,949,"[28, 80, 18, 53]","[642, 703, 974, 1523, 3713, 7281, 9727, 9812, ...",1995,[US],en,170.0,60000000
1,710,"[12, 28, 53]","[701, 769, 1308, 2812, 3268, 3272, 3278, 3376,...",1995,"[GB, US]",en,130.0,58000000
2,1408,"[28, 12]","[911, 1454, 1969, 3799, 5470, 12988]",1995,"[FR, DE, IT, US]",en,119.0,98000000
3,524,"[18, 80]","[383, 726, 1228, 2635, 33625]",1995,"[FR, US]",en,178.0,52000000
4,4584,"[18, 10749]","[420, 818, 964, 2755, 7564, 10911, 11109, 1506...",1995,"[GB, US]",en,136.0,16500000


In [213]:
# Last rows: some movies have an empty keyword list and/or a budget of 0.
cluster_features.tail()

Unnamed: 0,movieId,genres,keywords,release_date,production_countries,original_language,runtime,budget
2823,2331,"[36, 18]","[186, 2292, 2974, 3034, 3036, 3902, 6506, 1100...",1999,"[CZ, DE, IT, US]",en,240.0,20000000
2825,80831,[18],[],2009,[ID],en,121.0,0
2826,3104,"[27, 878]","[6737, 155730]",1967,[GB],en,92.0,0
2827,64197,"[10749, 18]",[187056],2007,[RU],ru,97.0,0
2828,98604,"[35, 10749]",[],2012,[RU],ru,91.0,0


In [214]:
# Number of movies kept for clustering.
len(cluster_features)

2765

### Définition d'une distance sur les films

In [215]:
# Span, in years, between the oldest and the most recent release year; it is
# the divisor applied to release-year gaps in similarity_vect.
MAX_YEAR = cluster_features.release_date.max() - cluster_features.release_date.min()

In [216]:
# Runtime statistics, used to pick the runtime scale of the distance.
cluster_features.runtime.describe()

count    2765.000000
mean      107.477758
std        24.060719
min        43.000000
25%        93.000000
50%       103.000000
75%       118.000000
max       320.000000
Name: runtime, dtype: float64

In [217]:
# Budget statistics; the median is 0 because unknown budgets are stored as 0.
cluster_features.budget.describe()

count    2.765000e+03
mean     1.427089e+07
std      3.146838e+07
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      1.400000e+07
max      3.800000e+08
Name: budget, dtype: float64

Cette fonction calcule, caractéristique par caractéristique, l'écart entre deux films. Plus une valeur est proche de 0, plus les films ont de points communs ; plus elle est proche de 1, plus ils sont différents. (Peut-être changer le nom ? Il s'agit en réalité d'une dissimilarité : instinctivement, on s'attendrait à ce qu'un score élevé signifie « très similaires ».)

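TODO : discuter de la manière de calculer ces écarts. Pour les attributs de type liste (genres, mots-clés, pays de production), la formule utilisée est $d(A, B) = \dfrac{2\,|A \cup B|}{|A| + |B|} - 1$, qui vaut 0 si les deux listes sont identiques et 1 si elles n'ont aucun élément commun.

TODO : donner la formule expliquant d'où viennent les écarts-types utilisés pour la durée et le budget.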

In [220]:
def _overlap_distance(values1, values2):
    """Distance between two collections of labels.

    Returns 0 when both collections hold exactly the same labels and 1 when
    they share none (assuming neither collection repeats a label). If either
    collection is empty the distance is set to its maximum, 1.
    """
    items1, items2 = list(values1), list(values2)
    if not items1 or not items2:
        # A movie with no value for this feature is treated as maximally distant.
        return 1.0
    merged = items1 + items2
    return 2 * len(np.unique(merged)) / len(merged) - 1


def similarity_vect(mov1, mov2, runtime_scale=55, budget_scale=3 * 1.1e+07):
    """Compute the per-feature distance vector between two movies.

    Despite its name, each component is a dissimilarity: 0 means the two
    movies agree on that feature, 1 means they differ as much as possible.

    Parameters
    ----------
    mov1, mov2 : pandas.DataFrame
        Frames whose first row describes the movie (only ``iloc[0]`` is read),
        with the columns genres, keywords, release_date, production_countries,
        original_language, runtime and budget.
    runtime_scale : float, default 55
        Runtime gap (minutes) from which the runtime distance saturates at 1.
        NOTE(review): the original comment says "2 * std", yet the runtime std
        displayed by describe() is about 24 -- confirm where 55 comes from.
    budget_scale : float, default 3 * 1.1e+07
        Budget gap from which the budget distance saturates at 1.
        NOTE(review): described as "3 * std, arbitrary" in the original, while
        the displayed budget std is about 3.1e+07 -- to be confirmed.

    Returns
    -------
    pandas.Series of float indexed by feature name, in the order genres,
    keywords, release_date, production_countries, original_language, runtime,
    budget.

    Notes
    -----
    Reads the module-level constant ``MAX_YEAR`` to scale release-year gaps.
    """
    m1 = mov1.iloc[0]
    m2 = mov2.iloc[0]

    # A budget of 0 means "unknown": the distance is then set to its maximum.
    if m1.budget == 0 or m2.budget == 0:
        budget_dist = 1.0
    else:
        budget_dist = min(1, abs(m1.budget - m2.budget) / budget_scale)

    # Building the Series in one go with an explicit dtype avoids the
    # dtype-less empty `pd.Series([])` the original started from.
    return pd.Series(
        {
            'genres': _overlap_distance(m1.genres, m2.genres),
            'keywords': _overlap_distance(m1.keywords, m2.keywords),
            'release_date': abs(m1.release_date - m2.release_date) / MAX_YEAR,
            'production_countries': _overlap_distance(m1.production_countries,
                                                      m2.production_countries),
            # 0 when both movies share the same original language, 1 otherwise.
            'original_language': int(m1.original_language != m2.original_language),
            'runtime': min(1, abs(m1.runtime - m2.runtime) / runtime_scale),
            'budget': budget_dist,
        },
        dtype=float,
    )

In [222]:
# Compare two sample movies (ids 2331 and 524) feature by feature.
mov1, mov2 = (
    cluster_features[cluster_features.movieId == movie_id]
    for movie_id in (2331, 524)
)
print(mov1.iloc[0], mov2.iloc[0], sep="\n")

# The runtime component is 1 here because the runtimes are 240 and 178
# minutes: the 62-minute gap exceeds the 55-minute scale, so it is capped at 1.
similarity_vect(mov1, mov2)

movieId                                                              2331
genres                                                           [36, 18]
keywords                [186, 2292, 2974, 3034, 3036, 3902, 6506, 1100...
release_date                                                         1999
production_countries                                     [CZ, DE, IT, US]
original_language                                                      en
runtime                                                               240
budget                                                           20000000
Name: 2823, dtype: object
movieId                                           524
genres                                       [18, 80]
keywords                [383, 726, 1228, 2635, 33625]
release_date                                     1995
production_countries                         [FR, US]
original_language                                  en
runtime                                           178
budg

genres                  0.500000
keywords                1.000000
release_date            0.039604
production_countries    0.666667
original_language       0.000000
runtime                 1.000000
budget                  0.969697
dtype: float64

Cette fonction sert à donner des poids à chaque caractéristique.

In [223]:
def weight_vect(simi_vect, w_gen=3, w_key=3, w_rel=2, w_pro=1, w_ori=1, w_run=1, w_bud=1):
    """Scale each component of a distance vector by its feature weight.

    `simi_vect` is a Series indexed by feature name (genres, keywords,
    release_date, production_countries, original_language, runtime, budget).
    A weighted copy is returned; the input Series is left untouched.
    """
    weights = (
        ('genres', w_gen),
        ('keywords', w_key),
        ('release_date', w_rel),
        ('production_countries', w_pro),
        ('original_language', w_ori),
        ('runtime', w_run),
        ('budget', w_bud),
    )
    weighted_vect = simi_vect.copy()
    for feature, weight in weights:
        weighted_vect[feature] = weighted_vect[feature] * weight
    return weighted_vect

In [224]:
def movie_distance(m1, m2, distance=lambda x: np.linalg.norm(x, ord=1)):
    """Collapse the weighted per-feature distances of two movies into one number.

    The distance vector of (m1, m2) is computed with similarity_vect, weighted
    with the default weights of weight_vect, then reduced by `distance` (by
    default the L1 norm, i.e. the sum of absolute values).
    """
    return distance(weight_vect(similarity_vect(m1, m2)))

In [225]:
# Overall weighted distance between the two sample movies.
movie_distance(mov1,mov2)

7.215571557155716

In [228]:
def compute_dist_matrix(clu_fea):
    """Compute the distance between every unordered pair of movies.

    Parameters
    ----------
    clu_fea : pandas.DataFrame
        Feature table with a ``movieId`` column plus the columns read by
        movie_distance.

    Returns
    -------
    pandas.Series
        Distances indexed by a two-level MultiIndex ('u', 'v') holding the two
        movie ids as strings. Pairs are listed in the order (id_0, id_1),
        (id_0, id_2), ..., (id_1, id_2), ... following the order of first
        appearance of the ids in `clu_fea`. The Series is empty when fewer
        than two distinct movies are given.
    """
    # unique() guards against a movie id appearing on several rows.
    movies_id = clu_fea.movieId.unique()

    # Filter the frame once per movie rather than twice per pair: the lookup
    # does not depend on the other member of the pair.
    rows = {movie_id: clu_fea[clu_fea.movieId == movie_id] for movie_id in movies_id}

    first_ids, second_ids, dist = [], [], []
    for i, u in enumerate(movies_id):
        for v in movies_id[i + 1:]:
            first_ids.append(str(u))
            second_ids.append(str(v))
            dist.append(movie_distance(rows[u], rows[v]))

    # from_arrays (unlike from_tuples) accepts empty input, so 0 or 1 movie
    # yields an empty Series instead of an exception.
    index = pd.MultiIndex.from_arrays([first_ids, second_ids], names=['u', 'v'])
    return pd.Series(dist, index=index, dtype=float)

In [233]:
# Small random subsets used to try out the distance matrix quickly. A fixed
# random_state makes the samples (and the results printed from them)
# identical on every run.
mini_test = cluster_features.sample(10, random_state=42)
petit_test = cluster_features.sample(100, random_state=42)

In [231]:
# Full run on every movie, kept disabled because it takes about one hour:
#start_time = time.time()
#m = compute_dist_matrix(cluster_features)
#print(m)
#print("Temps d execution : %s secondes ---" % (time.time() - start_time))

In [234]:
# Time the distance computation on the 10-movie sample. perf_counter() is the
# clock intended for measuring elapsed time (time.time() follows the wall
# clock, which can be adjusted while the cell runs).
start_time = time.perf_counter()
ma = compute_dist_matrix(mini_test)
print(ma)
print("Temps d execution : %s secondes ---" % (time.perf_counter() - start_time))

u      v    
2055   80219    7.636364
       4932     8.621962
       766      7.551155
       26663    8.890729
       844      8.504650
       2694     9.841224
       2290     7.924662
       1499     7.343474
       3602     7.913951
80219  4932     7.985599
       766      7.636004
       26663    6.254365
       844      8.474347
       2694     8.004860
       2290     6.670117
       1499     7.676808
       3602     7.277588
4932   766      6.440504
       26663    7.523312
       844      9.459946
       2694     8.699550
       2290     7.497300
       1499     7.225458
       3602     7.826823
766    26663    8.163816
       844      8.283511
       2694     9.549955
       2290     7.347705
       1499     5.334690
       3602     5.977228
26663  844      9.528713
       2694     8.668317
       2290     7.466067
       1499     8.244224
       3602     8.241044
844    2694     6.591985
       2290     8.155503
       1499     7.604835
       3602     9.712331
2694   2290 

In [235]:
# Time the distance computation on the 100-movie sample (4950 pairs).
# perf_counter() is the clock intended for measuring elapsed time
# (time.time() follows the wall clock, which can be adjusted meanwhile).
start_time = time.perf_counter()
mat = compute_dist_matrix(petit_test)
print(mat)
print("Temps d execution : %s secondes ---" % (time.perf_counter() - start_time))

u      v    
1665   4645      9.482088
       43177     7.097210
       1694      7.577858
       6547      9.602880
       8392      9.518452
                  ...    
36527  8986      8.236004
       563       8.677228
8420   8986      8.488839
       563      10.430063
8986   563       7.441224
Length: 4950, dtype: float64
Temps d execution : 82.81663656234741 secondes ---


In [39]:
class dendrogram:
    """Node of a binary merge tree.

    A node created with a `leaf` value is a leaf carrying that value; a node
    created without one is an internal node whose `left` / `right` children
    and `father` links are set by the caller.
    """

    def __init__(self, leaf=None):
        self.leaf = leaf      # value carried by a leaf; None for internal nodes
        self.leaf_nb = 1      # number of leaves below this node (see set_leaf_nb)
        self.father = None    # parent node, None for a root
        self.left = None
        self.right = None

    def set_leaf_nb(self):
        """Recompute `leaf_nb` as the sum of the children's counts (at least 1)."""
        children = (self.left, self.right)
        total_leaf_nb = sum(child.leaf_nb for child in children if child is not None)
        self.leaf_nb = max(1, total_leaf_nb)

    def get_id_list(self):
        """Return the leaf values found below this node.

        The tree is walked depth-first, right child before left child. A node
        carrying a leaf value is not explored further. The walk uses an
        explicit stack, so very deep (chain-shaped) trees do not hit Python's
        recursion limit, and missing children are simply skipped.
        """
        id_list = []
        pending = [self]
        while pending:
            node = pending.pop()
            if node is None:
                continue
            if node.leaf is not None:
                id_list.append(node.leaf)
            else:
                # Pushed left then right so that the right subtree is popped
                # (hence visited) first.
                pending.append(node.left)
                pending.append(node.right)
        return id_list

    def get_root(self):
        """Follow the `father` links up to the node that has no father."""
        node = self
        while node.father is not None:
            node = node.father
        return node

In [40]:
# Give every movie its own single-leaf tree, labelled with the movie id.
cluster_features["dendrogram"] = cluster_features.movieId.map(
    lambda movie_id: dendrogram(leaf=movie_id)
)

In [41]:
def weighted_mean(x1, w1, x2, w2):
    """Return the mean of x1 and x2 weighted by w1 and w2 respectively."""
    weighted_sum = w1 * x1 + w2 * x2
    total_weight = w1 + w2
    return weighted_sum / total_weight

In [42]:
def _pair_key(dist_mat, a, b):
    """Return the (u, v) key under which the pair {a, b} is stored in `dist_mat`."""
    return (a, b) if (a, b) in dist_mat.index else (b, a)


def clusterize(dist_mat, clu_fea):
    """Build a merge tree by repeatedly joining the two closest clusters.

    At every step the pair with the smallest distance is merged under a new
    `dendrogram` node. The distances from the merged cluster to every other
    cluster are replaced by the mean of the two former distances, weighted by
    the number of leaves of each merged cluster. The merged cluster stays
    stored under the first id of the pair; the second id is removed.

    Parameters
    ----------
    dist_mat : pandas.Series
        Pairwise distances indexed by a ('u', 'v') MultiIndex of movie ids
        given as strings, in the layout produced by compute_dist_matrix: the
        code relies on the first entries being all the pairs of the first id.
    clu_fea : pandas.DataFrame
        One row per movie, with an integer ``movieId`` column and a
        ``dendrogram`` column holding the leaf node of each movie.

    Returns
    -------
    dendrogram
        Root of the tree containing the first movie of `clu_fea`.

    Notes
    -----
    `dist_mat` is left untouched (a copy is updated). The dendrogram nodes
    stored in `clu_fea` ARE modified (their `father` links are set), so calling
    this function twice on the same nodes continues from the previous tree.
    """
    # The update step writes into the series: work on a copy so the caller's
    # distances are not altered during the first iteration.
    dist_mat = dist_mat.copy()

    size_mat = clu_fea.shape[0]
    for cpt in range(1, size_mat):
        # Ids of the remaining clusters: the first (size_mat - cpt) entries
        # are the pairs (first_id, x), which list every other id once.
        id_list = [tup[1] for tup in dist_mat.index[: size_mat - cpt]]
        id_list.append(dist_mat.index[0][0])

        # Closest pair, and the current roots of the trees they belong to.
        index_str1, index_str2 = dist_mat.idxmin()
        tmp1 = clu_fea[clu_fea.movieId == int(index_str1)].iloc[0].dendrogram.get_root()
        tmp2 = clu_fea[clu_fea.movieId == int(index_str2)].iloc[0].dendrogram.get_root()

        tmp3 = dendrogram()
        tmp3.left = tmp1
        tmp3.right = tmp2
        tmp3.set_leaf_nb()
        tmp1.father = tmp3
        tmp2.father = tmp3

        for x in id_list:
            if x == index_str1 or x == index_str2:
                continue
            # Each pair is stored once, in either order.
            key1 = _pair_key(dist_mat, index_str1, x)
            key2 = _pair_key(dist_mat, index_str2, x)
            dist_mat.loc[key1] = weighted_mean(dist_mat.loc[key1], tmp1.leaf_nb,
                                               dist_mat.loc[key2], tmp2.leaf_nb)

        # Remove every pair involving the second id. A boolean mask is used
        # because drop(..., level=...) can raise KeyError when the id never
        # occurs in that level (e.g. the last movie is never a 'u').
        u_ids = dist_mat.index.get_level_values('u')
        v_ids = dist_mat.index.get_level_values('v')
        dist_mat = dist_mat[(u_ids != index_str2) & (v_ids != index_str2)]

    return clu_fea.iloc[0].dendrogram.get_root()

In [43]:
# Small end-to-end check: first ten movies and their pairwise distances.
clu_test = cluster_features.head(10)
dist_test = compute_dist_matrix(clu_test)
clu_test.head()

Unnamed: 0,movieId,genres,release_date,production_countries,original_language,runtime,budget,dendrogram
0,949,"[28, 80, 18, 53]",1995,[US],en,170.0,60000000,<__main__.dendrogram object at 0x000001B1C9A93...
1,710,"[12, 28, 53]",1995,"[GB, US]",en,130.0,58000000,<__main__.dendrogram object at 0x000001B1C9A93...
2,1408,"[28, 12]",1995,"[FR, DE, IT, US]",en,119.0,98000000,<__main__.dendrogram object at 0x000001B1C9A93...
3,524,"[18, 80]",1995,"[FR, US]",en,178.0,52000000,<__main__.dendrogram object at 0x000001B1C9A93...
4,4584,"[18, 10749]",1995,"[GB, US]",en,136.0,16500000,<__main__.dendrogram object at 0x000001B1C9A93...


In [44]:
# Build the merge tree of the ten test movies and keep its root.
root = clusterize(dist_test, clu_test)