In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import re

In [2]:
def filter_correct_id(word):  # best version so far
    """Flag ids that are strings but not made only of ASCII digits.

    Parameters
    ----------
    word : any
        Raw value taken from the ``id`` column.

    Returns
    -------
    any
        ``word`` unchanged when it is not a string or when it is a non-empty
        string of digits 0-9; the sentinel ``"wrong_id"`` otherwise.
    """
    is_text = isinstance(word, str)
    if is_text and re.fullmatch(r'[0-9]+', word) is None:
        return "wrong_id"
    return word

In [9]:
# Load the raw movie metadata from a CSV file located in the working directory.
movies = pd.read_csv("movies_metadata.csv")

In [116]:
# Keep only the first row for each id. The explicit .copy() makes the filtered
# frame independent, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
movies = movies[~movies.id.duplicated()].copy()
# Mark ids that are strings but not purely numeric, then discard those rows.
movies.id = movies.id.apply(filter_correct_id)
movies = movies[movies.id != "wrong_id"].copy()
# Store both columns as 64-bit integers.
# NOTE(review): assumes every remaining budget value is numeric; confirm on the raw data.
movies.id = movies.id.astype('int64')
movies.budget = movies.budget.astype('int64')

On sélectionne les attributs de films qui semblent pertinents pour différencier les films sur leur contenu.
Ces choix sont arbitraires : on pourra être amenés à y réfléchir de nouveau et à les modifier.

In [117]:
# List the available columns before choosing the features to compare movies on.
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [118]:
# Keep only the columns used to compare movies and drop every row that has a
# missing value in any of them.
cluster_features = movies[
    ['genres', 'release_date', 'production_countries', 'original_language', 'runtime', 'budget']
].dropna()

In [119]:
# Preview the first rows of the selected features (still in their raw form).
cluster_features.head()

Unnamed: 0,genres,release_date,production_countries,original_language,runtime,budget
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,81.0,30000000
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,104.0,65000000
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,101.0,0
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,127.0,16000000
4,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,106.0,0


On modifie les attributs afin de pouvoir facilement comparer les films

In [120]:
def vectorize_genres(genres):
    """Extract the numeric genre ids from a stringified list of genre dicts.

    Parameters
    ----------
    genres : str or any
        Text such as ``"[{'id': 16, 'name': 'Animation'}, ...]"``. Values that
        are not strings are returned unchanged.

    Returns
    -------
    numpy.ndarray or any
        Integer array of the ids, in order of appearance (empty when the text
        contains none), or the input itself when it is not a string.
    """
    if not isinstance(genres, str):
        return genres
    # '+' rather than '*': an 'id' key that is not followed by digits is
    # skipped instead of producing an empty match that int() cannot parse.
    found_ids = re.findall(r"'id': ([0-9]+)", genres)
    # Explicit dtype keeps empty results integer-typed; np.array([]) alone
    # would default to float64 and differ from the non-empty case.
    return np.array([int(genre_id) for genre_id in found_ids], dtype=int)

In [121]:
# Replace each stringified genre list with the array of its genre ids.
cluster_features["genres"] = cluster_features["genres"].apply(vectorize_genres)

In [122]:
# Check the result of the genre conversion on the first rows.
cluster_features.head()

Unnamed: 0,genres,release_date,production_countries,original_language,runtime,budget
0,"[16, 35, 10751]",1995-10-30,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,81.0,30000000
1,"[12, 14, 10751]",1995-12-15,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,104.0,65000000
2,"[10749, 35]",1995-12-22,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,101.0,0
3,"[35, 18, 10749]",1995-12-22,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,127.0,16000000
4,[35],1995-02-10,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,106.0,0


In [123]:
def simplify_date(date):
    """Reduce a date string to the integer formed by its first four characters.

    Parameters
    ----------
    date : str or any
        Date text whose first four characters are parsed as an integer
        (e.g. ``"1995-10-30"``). Values that are not strings are returned unchanged.

    Returns
    -------
    int or any
        ``int(date[:4])`` for a string, otherwise the input itself.
    """
    if not isinstance(date, str):
        return date
    year_text = date[:4]
    return int(year_text)

In [124]:
# Keep only the release year of each movie.
cluster_features["release_date"] = cluster_features["release_date"].apply(simplify_date)

In [125]:
# Check the result of the date simplification on the first rows.
cluster_features.head()

Unnamed: 0,genres,release_date,production_countries,original_language,runtime,budget
0,"[16, 35, 10751]",1995,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,81.0,30000000
1,"[12, 14, 10751]",1995,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,104.0,65000000
2,"[10749, 35]",1995,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,101.0,0
3,"[35, 18, 10749]",1995,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,127.0,16000000
4,[35],1995,"[{'iso_3166_1': 'US', 'name': 'United States o...",en,106.0,0


In [126]:
def simplify_countries(countries):
    """Extract the country codes from a stringified list of country dicts.

    Parameters
    ----------
    countries : str or any
        Text such as ``"[{'iso_3166_1': 'US', 'name': '...'}, ...]"``. Values
        that are not strings are returned unchanged.

    Returns
    -------
    list of str or any
        The quoted values following each ``'iso_3166_1'`` key, in order of
        appearance (empty list when there are none), or the input itself when
        it is not a string.
    """
    if not isinstance(countries, str):
        # Return the argument itself (not an unrelated name) for non-text values.
        return countries
    # The capture group takes the quoted code directly, so no fixed-offset
    # slicing of the match is needed.
    return re.findall(r"'iso_3166_1': '([^']+)'", countries)

In [127]:
# Replace each stringified country list with the list of its country codes.
cluster_features["production_countries"] = cluster_features["production_countries"].apply(simplify_countries)

In [128]:
# Inspect the last rows to check the simplified country codes.
cluster_features.tail()

Unnamed: 0,genres,release_date,production_countries,original_language,runtime,budget
45460,"[18, 28, 10749]",1991,"[CA, DE, GB, US]",en,104.0,0
45462,[18],2011,[PH],tl,360.0,0
45463,"[28, 18, 53]",2003,[US],en,90.0,0
45464,[],1917,[RU],en,87.0,0
45465,[],2017,[GB],en,75.0,0


On va maintenant définir une distance sur les films

In [129]:
# Span, in years, between the earliest and the latest release in the data set
# (a range, despite the name). Series.max()/min() are vectorised, unlike the
# Python builtins, which iterate over the column element by element.
MAX_YEAR = cluster_features.release_date.max() - cluster_features.release_date.min()

In [135]:
# Summary statistics of the runtime column, to help choose a scale for runtime gaps.
cluster_features.runtime.describe()

count    45089.000000
mean        94.187540
std         38.343351
min          0.000000
25%         85.000000
50%         95.000000
75%        107.000000
max       1256.000000
Name: runtime, dtype: float64

In [133]:
# Summary statistics of the budget column, to help choose a scale for budget gaps.
cluster_features.budget.describe()

count    4.508900e+04
mean     4.256248e+06
std      1.749051e+07
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      3.800000e+08
Name: budget, dtype: float64

In [188]:
def _overlap_distance(items1, items2, empty_value=0.5):
    """Return ``2 * n_distinct / n_total - 1`` over the two collections combined.

    This is 0 when both collections hold the same distinct items and 1 when
    they share nothing (assuming no item is repeated within a collection).
    When both collections are empty the ratio is undefined, so ``empty_value``
    is returned instead.
    """
    merged = list(items1) + list(items2)
    if not merged:
        return empty_value
    return 2 * len(np.unique(merged)) / len(merged) - 1


def similarity_vect(m1, m2, max_year=None, runtime_scale=38,
                    budget_scale=3 * 1.75e+07, missing_value=0.5):
    """Build the per-feature dissimilarity vector between two movies.

    Despite the name, larger values mean the movies differ more; 0 means they
    agree on that feature.

    Parameters
    ----------
    m1, m2 : row-like (e.g. rows of ``cluster_features``)
        Objects exposing ``genres``, ``release_date``, ``production_countries``,
        ``original_language``, ``runtime`` and ``budget`` as attributes.
        ``m1`` must also provide ``copy()``.
    max_year : number, optional
        Year span used to scale the release-year gap. Defaults to the
        module-level ``MAX_YEAR``.
    runtime_scale : number, optional
        Runtime gap (same unit as ``runtime``) at which the runtime term
        saturates to 1.
    budget_scale : number, optional
        Budget gap at which the budget term saturates to 1 (default is about
        three standard deviations, an arbitrary choice).
    missing_value : float, optional
        Neutral value used when a comparison is not possible: a runtime or
        budget equal to 0, or both genre/country lists empty.

    Returns
    -------
    Same type as ``m1``
        A copy of ``m1`` whose six feature fields are replaced by the
        corresponding dissimilarity values.
    """
    year_span = MAX_YEAR if max_year is None else max_year

    simi_vect = m1.copy()

    # Both lists empty used to divide by zero; the helper returns the neutral value instead.
    simi_vect.genres = _overlap_distance(m1.genres, m2.genres, missing_value)  # may be revised

    year_gap = abs(m1.release_date - m2.release_date)
    # A zero span means every movie shares the same year, so the gap is 0 as well.
    simi_vect.release_date = year_gap / year_span if year_span else 0.0

    simi_vect.production_countries = _overlap_distance(
        m1.production_countries, m2.production_countries, missing_value
    )  # may be revised as well

    simi_vect.original_language = int(m1.original_language != m2.original_language)

    r1, r2 = m1.runtime, m2.runtime
    if r1 == 0 or r2 == 0:
        # A runtime of 0 is treated as unknown.
        simi_vect.runtime = missing_value  # may be revised
    else:
        simi_vect.runtime = min(1, abs(r1 - r2) / runtime_scale)

    b1, b2 = m1.budget, m2.budget
    if b1 == 0 or b2 == 0:
        # A budget of 0 is treated as unknown.
        simi_vect.budget = missing_value  # may be revised
    else:
        simi_vect.budget = min(1, abs(b1 - b2) / budget_scale)

    return simi_vect

In [189]:
def weight_vect(simi_vect, w_gen=3, w_rel=2, w_pro=1, w_ori=1, w_run=1, w_bud=1):
    """Scale each feature of a dissimilarity vector by its weight.

    Parameters
    ----------
    simi_vect : row-like
        Object exposing ``genres``, ``release_date``, ``production_countries``,
        ``original_language``, ``runtime`` and ``budget`` as attributes, and
        providing ``copy()``. It is not modified.
    w_gen, w_rel, w_pro, w_ori, w_run, w_bud : number, optional
        Multipliers for genres, release date, production countries, original
        language, runtime and budget respectively.

    Returns
    -------
    Same type as ``simi_vect``
        A copy in which every listed feature has been multiplied by its weight.
    """
    weight_by_field = (
        ("genres", w_gen),
        ("release_date", w_rel),
        ("production_countries", w_pro),
        ("original_language", w_ori),
        ("runtime", w_run),
        ("budget", w_bud),
    )
    scaled = simi_vect.copy()
    for field, weight in weight_by_field:
        setattr(scaled, field, getattr(scaled, field) * weight)
    return scaled

In [190]:
# Sanity check on two concrete movies. x1 and x2 were not defined in any cell,
# so the check only ran with leftover kernel state; they are set explicitly to
# the first two rows, which match the recorded output (Name: 0, same year,
# runtime gap 23 / 38).
x1, x2 = cluster_features.iloc[0], cluster_features.iloc[1]
v = similarity_vect(x1, x2)
print(v)
weight_vect(v)

genres                  0.666667
release_date                   0
production_countries           0
original_language              0
runtime                 0.605263
budget                  0.666667
Name: 0, dtype: object


genres                         2
release_date                   0
production_countries           0
original_language              0
runtime                 0.605263
budget                  0.666667
Name: 0, dtype: object

In [193]:
def movie_distance(m1, m2):
    """Return the distance between two movies.

    The per-feature dissimilarities from ``similarity_vect`` are scaled with
    the default weights of ``weight_vect``, and the norm of the resulting
    vector (``np.linalg.norm`` with its default settings) is returned.
    """
    weighted = weight_vect(similarity_vect(m1, m2))
    return np.linalg.norm(weighted)