### Import des librairies et lecture des fichiers


Est ce qu'on ajouterais pas les keywords ?

In [334]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import re
import time

In [335]:
movies = pd.read_csv("movies_metadata.csv")
ratings = pd.read_csv("ratings_small.csv")

In [336]:
len(movies)

45466

In [337]:
movies[movies["id"]=="9273"]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
18,False,"{'id': 3167, 'name': 'Ace Ventura Collection',...",30000000,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",,9273,tt0112281,en,Ace Ventura: When Nature Calls,"Summoned from an ashram in Tibet, Ace finds hi...",...,1995-11-10,212385533.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,New animals. New adventures. Same hair.,Ace Ventura: When Nature Calls,False,6.1,1128.0


### Nettoyage de la base de données et réduction de la matrice aux caractéristiques interéssantes

Suppression des id incorrects, des valeurs abérrantes, des lignes avec NaN, et modification des valeurs pour les rendre plus faciles à traiter.

On sélectionne les attributs de films qui semblent pertinents pour différencier les films sur leur contenu.
Ces choix sont arbitraires et on pourra être amenés à réfléchir dessus et à les modifier. Nous aurions voulu compléter notre base de données pour obtenir plus d'attributs (en particulier réalisateur, casting, mots clés), mais nous avons finalement du nous résigner à ne pas garder trop d'attributs pour limiter le temps de calcul.

In [338]:
def filter_correct_id(word): # MEILLEURE VERSION ICI
    if not isinstance(word, str) or re.fullmatch(r'[0-9]+', word):
        return word
    return "wrong_id"

In [339]:
movies[~movies.id.duplicated()]
movies.id = movies.id.apply(filter_correct_id)
movies = movies[movies.id != "wrong_id"] #3 id incorrect
movies.id = movies.id.astype('int64')
movies.budget = movies.budget.astype('int64')

Nous ne voulons garder que les films ayant reçu une note. Cela est une manière de ne garder qu'un nombre limité de films (il est très compliqué pour nous d'effectuer des calculs pour 45000 films) et cela pourrait être utile dans le cadre de la recommendation "user-based".

In [340]:
movies = movies.rename(columns={'id' : 'movieId'})
ratings = ratings.movieId.drop_duplicates()
movies = movies.merge(ratings, how='inner')

In [341]:
len(movies)

2831

In [342]:
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'movieId', 'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [343]:
cluster_features = movies[['movieId', 'genres', 'release_date', 'production_countries', 'original_language', 'runtime', 'budget']]
cluster_features = cluster_features.dropna()

In [344]:
print(len(cluster_features[cluster_features["runtime"]==0]))  #22 films à 0 minutes
#La durée d'un long métrage est d'au moins 40 minutes, ne gardons que les longs (et cela supprime les valeurs abérrantes) 
cluster_features = cluster_features[cluster_features["runtime"]>40]

22


In [345]:
print(len(cluster_features))
print(len(cluster_features[cluster_features["budget"]==0])) #beaucoup de films dont le budget n'est pas renseigné

2766
1507


In [346]:
def vectorize_genres(genres):
    if isinstance(genres, str):
        pattern = re.compile(r"'id': [0-9]*")
        return np.array([int(w[6:]) for w in pattern.findall(genres)])
    return genres

cluster_features.genres = cluster_features.genres.apply(vectorize_genres)

In [347]:
def simplify_date(date):
    if isinstance(date, str):
        return int(date[:4])
    return date

cluster_features.release_date = cluster_features.release_date.apply(simplify_date)

In [348]:
def simplify_countries(countries):
    if isinstance(countries, str):
        pattern = re.compile(r"'iso_3166_1': ...")
        return [w[15:] for w in pattern.findall(countries)]
    return genres

cluster_features.production_countries = cluster_features.production_countries.apply(simplify_countries)

In [349]:
cluster_features.head()

Unnamed: 0,movieId,genres,release_date,production_countries,original_language,runtime,budget
0,949,"[28, 80, 18, 53]",1995,[US],en,170.0,60000000
1,710,"[12, 28, 53]",1995,"[GB, US]",en,130.0,58000000
2,1408,"[28, 12]",1995,"[FR, DE, IT, US]",en,119.0,98000000
3,524,"[18, 80]",1995,"[FR, US]",en,178.0,52000000
4,4584,"[18, 10749]",1995,"[GB, US]",en,136.0,16500000


In [350]:
cluster_features.tail()

Unnamed: 0,movieId,genres,release_date,production_countries,original_language,runtime,budget
2824,2331,"[36, 18]",1999,"[CZ, DE, IT, US]",en,240.0,20000000
2826,80831,[18],2009,[ID],en,121.0,0
2827,3104,"[27, 878]",1967,[GB],en,92.0,0
2828,64197,"[10749, 18]",2007,[RU],ru,97.0,0
2829,98604,"[35, 10749]",2012,[RU],ru,91.0,0


In [351]:
len(cluster_features)

2766

### Définition d'une distance sur les films

In [352]:
MAX_YEAR = max(cluster_features.release_date) - min(cluster_features.release_date)

In [353]:
cluster_features.runtime.describe()

count    2766.000000
mean      107.479754
std        24.056597
min        43.000000
25%        93.000000
50%       103.000000
75%       118.000000
max       320.000000
Name: runtime, dtype: float64

In [354]:
cluster_features.budget.describe()

count    2.766000e+03
mean     1.427657e+07
std      3.146411e+07
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      1.400000e+07
max      3.800000e+08
Name: budget, dtype: float64

Cette fonction calcule les similarités entre 2 Films. Plus les valeurs sont proches de 0, plus les films ont de points communs et plus elles sont proches de 1, plus ils sont différents (peut-être changer le nom ? instinctivement, j'aurais pensé que pour être très similaire il fallait un score élevé)

discuter de la manière de les calculer ?
mettre formule pour dire d'où viennent les std pris

In [355]:
def similarity_vect(mov1, mov2):
    m1 = mov1.iloc[0]
    m2 = mov2.iloc[0]
    simi_vect = pd.Series([])
    
    # SIMILARITIES IN GENRES
    
    g1 = m1.genres
    g2 = m2.genres
    if list(g1) and list(g2): #est ce utile ? y a t'il vraiment des genres vides ? combien ? les supprimer si oui ?
        g3 = np.append(g1, g2)
        simi_vect['genres'] = 2 * len(np.unique(g3)) / len(g3) - 1 # CHANGER PEUT ETRE
    else:
        simi_vect['genres'] = 1
        
    #SIMILARITIES FOR THE RELEASE DATE
    
    simi_vect['release_date'] = abs(m1.release_date - m2.release_date) / MAX_YEAR #
    
    #SIMILARITIES IN PRODUCTION COUNTRIES
    
    p1 = m1.production_countries
    p2 = m2.production_countries
    if list(p1) and list(p2):
        p3 = []
        p3.extend(p1)
        p3.extend(p2)
        simi_vect['production_countries'] = 2 * len(np.unique(p3)) / len(p3) - 1 # Changer aussi peut etre
    else:
        simi_vect['production_countries'] = 1
    
    #SIMILARITIES FOR THE LANGUAGE
    
    simi_vect['original_language'] = int(not m1.original_language == m2.original_language)
    
    #SIMILARITIES FOR THE RUNTIME
    
    r1 = m1.runtime
    r2 = m2.runtime
    #j'ai supprimé les films trop courts comme il y en avait très peu
    #if r1 == 0 or r2 == 0:
    #    simi_vect['runtime'] = 1 # Changer peut etre
    #else:
    simi_vect['runtime'] = min(1, abs(r1 - r2) / 55) # Choix 2 * std
    
    #SIMILARITIES FOR THE BUDGET
    
    b1 = m1.budget
    b2 = m2.budget
    if b1 == 0 or b2 == 0:
        simi_vect['budget'] = 1 # Changer peut etre
    else:
        simi_vect['budget'] = min(1, abs(b1 - b2) / (3 * 1.1e+07)) # La aussi (3 * ecart type ~ arbitraire)
    
    return simi_vect

In [358]:
mov1 = cluster_features[cluster_features.movieId == 1408]
mov2 = cluster_features[cluster_features.movieId == 524]
#print(mov1)
print(mov1.iloc[0])
print(mov2.iloc[0])

similarity_vect(mov1,mov2)

movieId                             1408
genres                          [28, 12]
release_date                        1995
production_countries    [FR, DE, IT, US]
original_language                     en
runtime                              119
budget                          98000000
Name: 2, dtype: object
movieId                      524
genres                  [18, 80]
release_date                1995
production_countries    [FR, US]
original_language             en
runtime                      178
budget                  52000000
Name: 3, dtype: object


genres                  1.000000
release_date            0.000000
production_countries    0.333333
original_language       0.000000
runtime                 1.000000
budget                  1.000000
dtype: float64

Cette fonction sert à donner des poids à chaque caractéristique.

In [361]:
def weight_vect(simi_vect, w_gen=3, w_rel=2, w_pro=1, w_ori=1, w_run=1, w_bud=1):
    weighted_vect = simi_vect.copy()
    weighted_vect.genres *= w_gen
    weighted_vect.release_date *= w_rel
    weighted_vect.production_countries *= w_pro
    weighted_vect.original_language *= w_ori
    weighted_vect.runtime *= w_run
    weighted_vect.budget *= w_bud
    
    return weighted_vect

In [362]:
def movie_distance(m1, m2, distance=lambda x: np.linalg.norm(x, ord=1)):
    simi_vect = similarity_vect(m1, m2)
    weighted_vect = weight_vect(simi_vect)
    return distance(weighted_vect)

In [363]:
movie_distance(mov1,mov2)

5.333333333333333

In [364]:
len(cluster_features["movieId"])

2766

In [365]:
len(cluster_features["movieId"].unique()) #Pourquoi un de moins ? Je croyais qu'on avait déjà éliminé les duplicatas ?

2765

In [177]:
def compute_dist_matrix(clu_fea):
    dist = []
    movies_id = clu_fea.movieId.unique() #unique() à retirer ? je pense qu'il n'est pas utile ?
    couples = []
    for i in range(len(movies_id)):
        u = movies_id[i]
        for j in range(i + 1, len(movies_id)):
            v = movies_id[j]
            couples.append((str(u), str(v)))
            dist.append(movie_distance(clu_fea[clu_fea.movieId == u], clu_fea[clu_fea.movieId == v]))
    
    index = pd.MultiIndex.from_tuples(couples, names=['u', 'v'])
    dist_mat = pd.Series(dist, index=index)
    
    return dist_mat

In [373]:
mini_test = cluster_features.iloc[0:10]
petit_test = cluster_features.iloc[0:100]

In [38]:
#start_time = time.time()
#m = compute_dist_matrix(cluster_features) # environ 1 heure
#print(m)
#print("Temps d execution : %s secondes ---" % (time.time() - start_time))

u      v    
949    710      2.406926
       1408     4.527273
       524      1.721212
       4584     3.951515
       5        4.000000
                  ...   
3104   98604    6.774484
       49280    8.126050
64197  98604    2.693125
       49280    8.798319
98604  49280    8.882353
Length: 3991725, dtype: float64
Temps d execution : 24088.50949215889 secondes ---


In [377]:
start_time = time.time()
ma = compute_dist_matrix(mini_test) # environ 1 heure
print(ma)
print("Temps d execution : %s secondes ---" % (time.time() - start_time))

u     v   
949   710     2.406926
      1408    4.527273
      524     1.721212
      4584    3.951515
      5       4.000000
      8012    3.187229
      451     4.000000
      902     7.000000
      63      3.812554
710   1408    2.466667
      524     4.554545
      4584    4.109091
      5       4.915152
      8012    3.628788
      451     4.660606
      902     5.400000
      63      3.215152
1408  524     5.333333
      4584    4.975758
      5       4.981818
      8012    4.854545
      451     4.727273
      902     4.428571
      63      4.781818
524   4584    3.763636
      5       3.833333
      8012    3.792424
      451     3.833333
      902     6.600000
      63      4.906061
4584  5       4.403030
      8012    4.313636
      451     1.160606
      902     5.554545
      63      3.854545
5     8012    1.522727
      451     3.266667
      902     5.606061
      63      4.336364
8012  451     3.934848
      902     5.425758
      63      2.459091
451   902     5.509091


In [376]:
start_time = time.time()
mat = compute_dist_matrix(petit_test) # environ 1 heure
print(mat)
print("Temps d execution : %s secondes ---" % (time.time() - start_time))

u     v   
949   710     2.406926
      1408    4.527273
      524     1.721212
      4584    3.951515
      5       4.000000
                ...   
1245  2088    2.618182
      424     3.451515
7007  2088    2.216450
      424     4.000000
2088  424     3.794372
Length: 4950, dtype: float64
Temps d execution : 72.04397654533386 secondes ---


In [39]:
class dendrogram:
    def __init__(self, leaf=None):
        self.leaf = leaf
        self.leaf_nb = 1
        self.father = None
        self.left = None
        self.right = None
    
    def set_leaf_nb(self):
        total_leaf_nb = 0
        if self.left is not None:
            total_leaf_nb += self.left.leaf_nb
        if self.right is not None:
            total_leaf_nb += self.right.leaf_nb
        self.leaf_nb = max(1, total_leaf_nb)
    
    def get_id_list(self):
        id_list = []
        def prefix(node):
            if node.leaf is not None:
                id_list.append(node.leaf)
            else:
                prefix(node.right)
                prefix(node.left)
        
        prefix(self)
        return id_list
    
    def get_root(self):
        tmp = self
        while tmp.father is not None: tmp = tmp.father
        return tmp

In [40]:
cluster_features["dendrogram"] = cluster_features.movieId.apply(lambda x: dendrogram(leaf=x))

In [41]:
def weighted_mean(x1, w1, x2, w2):
    return (w1 * x1 + w2 * x2) / (w1 + w2)

In [42]:
def clusterize(dist_mat, clu_fea):
    
    size_mat = clu_fea.shape[0]
    for cpt in range(1, size_mat):
        id_list = [tup[1] for tup in dist_mat.index[: size_mat - cpt]]
        id_list.append(dist_mat.index[0][0])
        index_str1, index_str2 = pd.Series.idxmin(dist_mat)
        index1, index2 = int(index_str1), int(index_str2)
        mov1 = clu_fea[clu_fea.movieId == index1].iloc[0]
        mov2 = clu_fea[clu_fea.movieId == index2].iloc[0]
        tmp1 = mov1.dendrogram
        tmp2 = mov2.dendrogram
        while tmp1.father is not None: tmp1 = tmp1.father
        while tmp2.father is not None: tmp2 = tmp2.father
        tmp3 = dendrogram()
        tmp3.left = tmp1
        tmp3.right = tmp2
        tmp3.set_leaf_nb()
        tmp1.father = tmp3
        tmp2.father = tmp3
        for x in id_list:
            if x == index_str1 or x == index_str2 : continue
            if (index_str1, x) in dist_mat.index:
                if (index_str2, x) in dist_mat:
                    tmp = weighted_mean(dist_mat.loc[index_str1, x], tmp1.leaf_nb,
                                        dist_mat.loc[index_str2, x], tmp2.leaf_nb)
                else:
                    tmp = weighted_mean(dist_mat.loc[index_str1, x], tmp1.leaf_nb,
                                        dist_mat.loc[x, index_str2], tmp2.leaf_nb)
                dist_mat.loc[index_str1, x] = tmp
            else:
                if (index_str2, x) in dist_mat:
                    tmp = weighted_mean(dist_mat.loc[x, index_str1], tmp1.leaf_nb,
                                        dist_mat.loc[index_str2, x], tmp2.leaf_nb)
                else:
                    tmp = weighted_mean(dist_mat.loc[x, index_str1], tmp1.leaf_nb,
                                        dist_mat.loc[x, index_str2], tmp2.leaf_nb)
                dist_mat.loc[x, index_str1] = tmp
        dist_mat = dist_mat.drop(index_str2, level='u')
        dist_mat = dist_mat.drop(index_str2, level='v')
    
    return clu_fea.iloc[0].dendrogram.get_root()

In [43]:
clu_test = cluster_features.iloc[:10]
dist_test = compute_dist_matrix(clu_test)
clu_test.head()

Unnamed: 0,movieId,genres,release_date,production_countries,original_language,runtime,budget,dendrogram
0,949,"[28, 80, 18, 53]",1995,[US],en,170.0,60000000,<__main__.dendrogram object at 0x000001B1C9A93...
1,710,"[12, 28, 53]",1995,"[GB, US]",en,130.0,58000000,<__main__.dendrogram object at 0x000001B1C9A93...
2,1408,"[28, 12]",1995,"[FR, DE, IT, US]",en,119.0,98000000,<__main__.dendrogram object at 0x000001B1C9A93...
3,524,"[18, 80]",1995,"[FR, US]",en,178.0,52000000,<__main__.dendrogram object at 0x000001B1C9A93...
4,4584,"[18, 10749]",1995,"[GB, US]",en,136.0,16500000,<__main__.dendrogram object at 0x000001B1C9A93...


In [44]:
root = clusterize(dist_test, clu_test)