# Import des modules

In [141]:
import pandas as pd
import numpy as np
import warnings
import pickle

# Désactiver les FutureWarnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [142]:
# Never truncate columns when rendering a DataFrame
pd.options.display.max_columns = None
# Render up to 600 rows before pandas truncates the output
pd.options.display.max_rows = 600

# Chargement du fichier

In [143]:
# Load the cleaned dataset; the file is comma-separated, which is read_csv's default delimiter
CSV_PATH = 'all_in_cleaned.csv'
df = pd.read_csv(CSV_PATH)

# infos du fichier

In [144]:
# Summarise the raw frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1148349 entries, 0 to 1148348
Data columns (total 35 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   tconst                        1148349 non-null  object 
 1   primaryTitle                  1148349 non-null  object 
 2   originalTitle                 1148349 non-null  object 
 3   startYear                     1140299 non-null  object 
 4   runtimeMinutes                1102659 non-null  float64
 5   genre_imdb                    1148349 non-null  object 
 6   frenchTitle                   1148349 non-null  object 
 7   averageRating                 1036441 non-null  float64
 8   numVotes                      1036441 non-null  float64
 9   backdrop_path                 750227 non-null   object 
 10  budget                        927908 non-null   float64
 11  homepage                      927908 non-null   object 
 12  id                          

In [145]:
df.columns
# Keep as numeric features: ['startYear', 'runtimeMinutes', 'averageRating', 'numVotes']
# Candidates for get_dummies: ['genre_imdb', 'production_countries', 'original_language']

Index(['tconst', 'primaryTitle', 'originalTitle', 'startYear',
       'runtimeMinutes', 'genre_imdb', 'frenchTitle', 'averageRating',
       'numVotes', 'backdrop_path', 'budget', 'homepage', 'id',
       'original_language', 'overview', 'popularity', 'poster_path',
       'production_countries', 'revenue', 'spoken_languages', 'status',
       'title', 'video', 'vote_average', 'vote_count',
       'production_companies_name', 'production_companies_country', 'nconst',
       'category', 'characters', 'primaryName', 'birthYear', 'deathYear',
       'primaryProfession', 'knownForTitles'],
      dtype='object')

In [146]:
# Number of missing values in each column
df.isna().sum()

tconst                               0
primaryTitle                         0
originalTitle                        0
startYear                         8050
runtimeMinutes                   45690
genre_imdb                           0
frenchTitle                          0
averageRating                   111908
numVotes                        111908
backdrop_path                   398122
budget                          220441
homepage                        220441
id                              220441
original_language               220441
overview                        247088
popularity                      220441
poster_path                     246988
production_countries            220441
revenue                         220441
spoken_languages                220441
status                          220441
title                           220441
video                           220441
vote_average                    220441
vote_count                      220441
production_companies_name

In [147]:
# Number of distinct values in each column
df.nunique()

tconst                           86503
primaryTitle                     79643
originalTitle                    81746
startYear                          126
runtimeMinutes                     341
genre_imdb                         921
frenchTitle                      80342
averageRating                       90
numVotes                         17502
backdrop_path                    48496
budget                            1784
homepage                          9211
id                               61548
original_language                  113
overview                         59517
popularity                       16587
poster_path                      59466
production_countries              3810
revenue                           9852
spoken_languages                  2950
status                               6
title                            56471
video                                2
vote_average                      4094
vote_count                        3533
production_companies_name

# Retraitements

In [148]:
# Drop, in place, every row missing any of the four numeric columns below
# (row count noted by the author: 1148349 before, 1027341 after)
df.dropna(subset = ['startYear','runtimeMinutes','averageRating','numVotes'],  inplace = True)

In [149]:
# Check the remaining missing-value count in the four numeric columns
df[['startYear','runtimeMinutes','averageRating','numVotes']].isna().sum()

startYear         0
runtimeMinutes    0
averageRating     0
numVotes          0
dtype: int64

In [150]:
# Type conversions
# startYear is parsed as a datetime and reduced to its integer year
df['startYear'] = pd.to_datetime(df['startYear']).dt.year
# Coerce runtimeMinutes to numeric; values that cannot be parsed become NaN
# NOTE(review): df.info() above already reports this column as float64, so this is probably a no-op — confirm
df['runtimeMinutes'] = pd.to_numeric(df['runtimeMinutes'], errors='coerce')

In [151]:
# Working frame: keep only the columns of interest, then remove exact duplicate rows
columns_to_keep = [
    'tconst', 'primaryTitle', 'originalTitle', 'startYear',
    'runtimeMinutes', 'genre_imdb', 'frenchTitle', 'averageRating',
    'numVotes', 'original_language', 'production_countries', 'nconst',
]
df_test = df[columns_to_keep].drop_duplicates()

# Show the result
display(df_test)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genre_imdb,frenchTitle,averageRating,numVotes,original_language,production_countries,nconst
0,tt0000591,The Prodigal Son,L'enfant prodigue,1907,90.0,Drama,L'enfant prodigue,5.6,31.0,fr,['FR'],nm0906197
1,tt0000591,The Prodigal Son,L'enfant prodigue,1907,90.0,Drama,L'enfant prodigue,5.6,31.0,fr,['FR'],nm0332182
2,tt0000591,The Prodigal Son,L'enfant prodigue,1907,90.0,Drama,L'enfant prodigue,5.6,31.0,fr,['FR'],nm1323543
3,tt0000591,The Prodigal Son,L'enfant prodigue,1907,90.0,Drama,L'enfant prodigue,5.6,31.0,fr,['FR'],nm1759558
4,tt0000591,The Prodigal Son,L'enfant prodigue,1907,90.0,Drama,L'enfant prodigue,5.6,31.0,fr,['FR'],nm0141150
...,...,...,...,...,...,...,...,...,...,...,...,...
1148344,tt9916362,Coven,Akelarre,2020,92.0,"Drama,History",Les sorcières d'Akelarre,6.4,5990.0,eu,"['AR', 'ES', 'FR']",nm2970042
1148345,tt9916362,Coven,Akelarre,2020,92.0,"Drama,History",Les sorcières d'Akelarre,6.4,5990.0,eu,"['AR', 'ES', 'FR']",nm4065853
1148346,tt9916362,Coven,Akelarre,2020,92.0,"Drama,History",Les sorcières d'Akelarre,6.4,5990.0,eu,"['AR', 'ES', 'FR']",nm1086949
1148347,tt9916362,Coven,Akelarre,2020,92.0,"Drama,History",Les sorcières d'Akelarre,6.4,5990.0,eu,"['AR', 'ES', 'FR']",nm5813626


In [152]:
# Split the comma-separated IMDb genre string into a list, then emit one row per genre.
# `.str.split` is the vectorised form of calling `x.split(',')` on every value; unlike a
# per-row lambda it leaves a missing value as NaN instead of raising AttributeError.
df_test['genre_imdb'] = df_test['genre_imdb'].str.split(',')
# ignore_index=True renumbers the exploded rows 0..n-1
df_test = df_test.explode('genre_imdb', ignore_index=True)

In [153]:
# Column dtypes and memory usage of the working frame
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1978372 entries, 0 to 1978371
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   tconst                object 
 1   primaryTitle          object 
 2   originalTitle         object 
 3   startYear             int32  
 4   runtimeMinutes        float64
 5   genre_imdb            object 
 6   frenchTitle           object 
 7   averageRating         float64
 8   numVotes              float64
 9   original_language     object 
 10  production_countries  object 
 11  nconst                object 
dtypes: float64(3), int32(1), object(8)
memory usage: 173.6+ MB


# Get Dummies

In [154]:
# Columns available in the working frame
df_test.columns

Index(['tconst', 'primaryTitle', 'originalTitle', 'startYear',
       'runtimeMinutes', 'genre_imdb', 'frenchTitle', 'averageRating',
       'numVotes', 'original_language', 'production_countries', 'nconst'],
      dtype='object')

In [155]:
# Number of distinct values in each column of the working frame
df_test.nunique()

tconst                   69608
primaryTitle             63894
originalTitle            65990
startYear                  120
runtimeMinutes             310
genre_imdb                  24
frenchTitle              65121
averageRating               89
numVotes                 17500
original_language          113
production_countries      3795
nconst                  339963
dtype: int64

In [156]:
# Build a frame with only the rows whose production countries mention FR or US
# (substring test on the string value; non-string values are excluded by the isinstance check)
df_testfrus = df_test[df_test['production_countries'].apply(lambda x: isinstance(x, str) and ('FR' in x or 'US' in x))]


# Restrict to rows with at least 300000 votes
# NOTE(review): the variable name says 400000 but the threshold applied is 300000 — confirm which is intended
df_testfrus400000 = df_testfrus[df_testfrus['numVotes']>= 300000]

# Count how many rows each 'nconst' value appears in
# NOTE(review): df_test was exploded by genre earlier, so this counts rows rather than distinct films — confirm that is intended
counts = df_testfrus400000['nconst'].value_counts()

In [157]:
# Display the per-nconst occurrence counts
counts

nconst
nm0001877    111
nm0270559     91
nm0002354     87
nm0315974     78
nm0000384     74
            ... 
nm0462712      1
nm0035488      1
nm0943391      1
nm0801737      1
nm0035905      1
Name: count, Length: 6000, dtype: int64

In [158]:
# 'nconst' values that occur at least 20 times in `counts`
frequent_nconst = counts[counts.ge(20)].index
# True for the rows whose 'nconst' is one of those frequent values
mask = df_test['nconst'].isin(frequent_nconst)
# Every other row gets an empty string (not NaN) as its 'nconst'
df_test.loc[~mask, 'nconst'] = ""

In [159]:
# One-hot encode each categorical column as 0/1 integer indicators.
# (source column, prefix): 'genre_imdb' -> genre_<name>, 'original_language' -> lang_<code>,
# 'nconst' -> '_<nconst>' because the prefix is empty (an empty-string value yields a column named '_').
encoded_parts = [
    pd.get_dummies(df_test[source_col], prefix=col_prefix, dtype=int)
    for source_col, col_prefix in (
        ('genre_imdb', 'genre'),
        ('original_language', 'lang'),
        ('nconst', ''),
    )
]

# Identifier / numeric columns first, then the indicator blocks side by side
df_dum = pd.concat(
    [
        df_test[['tconst', 'frenchTitle', 'startYear',
                 'runtimeMinutes', 'numVotes', 'averageRating', 'nconst']],
        *encoded_parts,
    ],
    axis=1,
)
# Release the intermediate indicator frames; only df_dum is kept
del encoded_parts

In [160]:
# Display the one-hot encoded frame
df_dum

Unnamed: 0,tconst,frenchTitle,startYear,runtimeMinutes,numVotes,averageRating,nconst,genre_Action,genre_Adventure,genre_Animation,genre_Biography,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Family,genre_Fantasy,genre_Film-Noir,genre_History,genre_Horror,genre_Music,genre_Musical,genre_Mystery,genre_News,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_Unknown,genre_War,genre_Western,lang_af,lang_am,lang_ar,lang_as,lang_ay,lang_az,lang_be,lang_bg,lang_bm,lang_bn,lang_bo,lang_bs,lang_ca,lang_ce,lang_cn,lang_cs,lang_cy,lang_da,lang_de,lang_dz,lang_el,lang_en,lang_eo,lang_es,lang_et,lang_eu,lang_fa,lang_ff,lang_fi,lang_fr,lang_ga,lang_gl,lang_gu,lang_ha,lang_he,lang_hi,lang_hr,lang_ht,lang_hu,lang_hy,lang_id,lang_is,lang_it,lang_iu,lang_ja,lang_jv,lang_ka,lang_kk,lang_kl,lang_km,lang_kn,lang_ko,lang_ks,lang_ku,lang_ky,lang_la,lang_lb,lang_lo,lang_lt,lang_lv,lang_mg,lang_mi,lang_mk,lang_ml,lang_mn,lang_mo,lang_mr,lang_ms,lang_mt,lang_my,lang_nb,lang_ne,lang_nl,lang_no,lang_ny,lang_oc,lang_pa,lang_pl,lang_ps,lang_pt,lang_qu,lang_ro,lang_ru,lang_rw,lang_se,lang_sh,lang_si,lang_sk,lang_sl,lang_so,lang_sq,lang_sr,lang_st,lang_su,lang_sv,lang_sw,lang_ta,lang_te,lang_tg,lang_th,lang_tl,lang_tn,lang_tr,lang_uk,lang_ur,lang_uz,lang_vi,lang_wo,lang_xh,lang_xx,lang_yi,lang_zh,lang_zu,_,_nm0000035,_nm0000093,_nm0000113,_nm0000115,_nm0000116,_nm0000123,_nm0000128,_nm0000129,_nm0000134,_nm0000136,_nm0000138,_nm0000139,_nm0000146,_nm0000148,_nm0000151,_nm0000158,_nm0000168,_nm0000179,_nm0000184,_nm0000191,_nm0000194,_nm0000198,_nm0000199,_nm0000204,_nm0000206,_nm0000217,_nm0000226,_nm0000229,_nm0000233,_nm0000243,_nm0000244,_nm0000246,_nm0000255,_nm0000288,_nm0000307,_nm0000318,_nm0000323,_nm0000332,_nm0000353,_nm0000354,_nm0000355,_nm0000375,_nm0000384,_nm0000399,_nm0000401,_nm0000422,_nm0000437,_nm0000474,_nm0000553,_nm0000631,_nm0000686,_nm0000705,_nm0000881,_nm0000949,_nm0000982,_nm0000988,_nm0001060,_nm0001392,_nm0001401,_nm0001426,_nm0001570,_nm000174
1,_nm0001774,_nm0001804,_nm0001873,_nm0001877,_nm0001937,_nm0001980,_nm0002353,_nm0002354,_nm0003911,_nm0004056,_nm0004170,_nm0004581,_nm0004874,_nm0004937,_nm0004976,_nm0005023,_nm0005086,_nm0005212,_nm0005271,_nm0005351,_nm0005428,_nm0006035,_nm0006133,_nm0006290,_nm0006293,_nm0006904,_nm0009190,_nm0010736,_nm0032696,_nm0061045,_nm0065100,_nm0085312,_nm0089217,_nm0117290,_nm0149556,_nm0177896,_nm0185819,_nm0225146,_nm0230032,_nm0252230,_nm0252961,_nm0254645,_nm0262635,_nm0269463,_nm0270559,_nm0290556,_nm0315974,_nm0326040,_nm0330428,_nm0331516,_nm0342488,_nm0350453,_nm0362766,_nm0382268,_nm0413011,_nm0413168,_nm0424060,_nm0432725,_nm0454752,_nm0456158,_nm0460141,_nm0462895,_nm0476064,_nm0498278,_nm0517589,_nm0550881,_nm0564215,_nm0586969,_nm0605775,_nm0634240,_nm0649460,_nm0650038,_nm0662748,_nm0670408,_nm0694173,_nm0695435,_nm0705356,_nm0719637,_nm0736622,_nm0744429,_nm0746273,_nm0746830,_nm0748784,_nm0749263,_nm0757855,_nm0785227,_nm0795682,_nm0799777,_nm0811583,_nm0835016,_nm0836121,_nm0842770,_nm0858799,_nm0876138,_nm0914612,_nm1046097,_nm1055413,_nm1165110,_nm1176985,_nm1209966,_nm1212722,_nm1297015,_nm1334526,_nm1706767,_nm1727304,_nm2003463,_nm2225369,_nm2273444,_nm2554352,_nm2933757,_nm3234869
0,tt0000591,L'enfant prodigue,1907,90.0,31.0,5.6,,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,tt0000591,L'enfant prodigue,1907,90.0,31.0,5.6,,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,tt0000591,L'enfant prodigue,1907,90.0,31.0,5.6,,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,tt0000591,L'enfant prodigue,1907,90.0,31.0,5.6,,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,tt0000591,L'enfant prodigue,1907,90.0,31.0,5.6,,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1978367,tt9916362,Les sorcières d'Akelarre,2020,92.0,5990.0,6.4,,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1978368,tt9916362,Les sorcières d'Akelarre,2020,92.0,5990.0,6.4,,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1978369,tt9916362,Les sorcières d'Akelarre,2020,92.0,5990.0,6.4,,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1978370,tt9916362,Les sorcières d'Akelarre,2020,92.0,5990.0,6.4,,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [161]:
# Random sample of 10 rows from the working frame
df_test.sample(10)

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genre_imdb,frenchTitle,averageRating,numVotes,original_language,production_countries,nconst
1365701,tt1247662,The Good Guy,The Good Guy,2009,90.0,Comedy,Le Fiancé Idéal,5.7,9882.0,en,['US'],
394334,tt0059491,Cloportes,La métamorphose des cloportes,1965,95.0,Crime,La métamorphose des cloportes,6.6,620.0,fr,"['FR', 'IT']",
1866631,tt6489962,1917 - Der wahre Oktober,1917 - Der wahre Oktober,2017,90.0,Animation,1917 La verité sur Octobre,7.2,45.0,de,"['DE', 'CH']",
1955280,tt9103710,To Kill the Dragon,Matar al dragon,2019,83.0,Horror,Le sang du dragon,4.4,115.0,es,['AR'],
1482421,tt1594562,The Innkeepers,The Innkeepers,2011,101.0,Horror,The Innkeepers,5.5,38166.0,en,['US'],
1375692,tt1277953,Madagascar 3: Europe's Most Wanted,Madagascar 3: Europe's Most Wanted,2012,93.0,Comedy,Madagascar 3 : Bons baisers d'Europe,6.8,204979.0,en,['US'],
123083,tt0032904,The Philadelphia Story,The Philadelphia Story,1940,112.0,Romance,Indiscrétions,7.8,75566.0,en,['US'],
300985,tt0051430,The Bonnie Parker Story,The Bonnie Parker Story,1958,79.0,Biography,The Bonnie Parker Story,5.8,495.0,en,['US'],
1465068,tt15385142,Cadejo Blanco,Cadejo Blanco,2021,125.0,Thriller,Infiltrée,6.6,205.0,es,['GT'],
493483,tt0069198,The Ruling Class,The Ruling Class,1972,154.0,Comedy,Dieu et mon droit,7.2,6985.0,en,['GB'],


# KNN 

In [162]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [163]:
# Print every column name of the encoded frame as a plain Python list
print(df_dum.columns.tolist())

['tconst', 'frenchTitle', 'startYear', 'runtimeMinutes', 'numVotes', 'averageRating', 'nconst', 'genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Biography', 'genre_Comedy', 'genre_Crime', 'genre_Documentary', 'genre_Drama', 'genre_Family', 'genre_Fantasy', 'genre_Film-Noir', 'genre_History', 'genre_Horror', 'genre_Music', 'genre_Musical', 'genre_Mystery', 'genre_News', 'genre_Romance', 'genre_Sci-Fi', 'genre_Sport', 'genre_Thriller', 'genre_Unknown', 'genre_War', 'genre_Western', 'lang_af', 'lang_am', 'lang_ar', 'lang_as', 'lang_ay', 'lang_az', 'lang_be', 'lang_bg', 'lang_bm', 'lang_bn', 'lang_bo', 'lang_bs', 'lang_ca', 'lang_ce', 'lang_cn', 'lang_cs', 'lang_cy', 'lang_da', 'lang_de', 'lang_dz', 'lang_el', 'lang_en', 'lang_eo', 'lang_es', 'lang_et', 'lang_eu', 'lang_fa', 'lang_ff', 'lang_fi', 'lang_fr', 'lang_ga', 'lang_gl', 'lang_gu', 'lang_ha', 'lang_he', 'lang_hi', 'lang_hr', 'lang_ht', 'lang_hu', 'lang_hy', 'lang_id', 'lang_is', 'lang_it', 'lang_iu', 'lang_ja', 'lang_jv

In [164]:
# Genre indicator columns
genre_cols = [name for name in df_dum.columns if name.startswith("genre_")]

# Row-wise total of the genre indicators
genre_sums = df_dum[genre_cols].sum(axis=1)

# Rescale each row's genre indicators so that they add up to 1
# NOTE(review): if every row carries exactly one genre flag (one row per genre after an explode),
# each row sum is 1 and this only changes the dtype to float — confirm it is still needed
df_dum[genre_cols] = df_dum[genre_cols].divide(genre_sums, axis="index")

In [165]:
# Indicator column groups, recognised by their name prefix
genre_columns = [name for name in df_dum.columns if name.startswith('genre_')]
lang_columns = [name for name in df_dum.columns if name.startswith('lang_')]
nconst_columns = [name for name in df_dum.columns if name.startswith('_')]
indicator_columns = genre_columns + lang_columns + nconst_columns

# Cast every indicator column to int (fractional values are truncated)
df_dum[indicator_columns] = df_dum[indicator_columns].astype(int)

# One row per (tconst, frenchTitle): numeric features take the first value of the group,
# indicator columns take the group maximum (1 if any row of the group has the flag)
aggregation_dict = dict.fromkeys(
    ['startYear', 'runtimeMinutes', 'numVotes', 'averageRating'], 'first'
)
aggregation_dict.update(dict.fromkeys(indicator_columns, 'max'))

# Aggregate, then turn the group keys back into ordinary columns
df_aggregated = (
    df_dum
    .groupby(['tconst', 'frenchTitle'])
    .agg(aggregation_dict)
    .reset_index()
)

# Make sure 'tconst' is the first column
other_columns = [name for name in df_aggregated.columns if name != 'tconst']
df_aggregated = df_aggregated[['tconst'] + other_columns]

# Display the result
df_aggregated

Unnamed: 0,tconst,frenchTitle,startYear,runtimeMinutes,numVotes,averageRating,genre_Action,genre_Adventure,genre_Animation,genre_Biography,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Family,genre_Fantasy,genre_Film-Noir,genre_History,genre_Horror,genre_Music,genre_Musical,genre_Mystery,genre_News,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_Unknown,genre_War,genre_Western,lang_af,lang_am,lang_ar,lang_as,lang_ay,lang_az,lang_be,lang_bg,lang_bm,lang_bn,lang_bo,lang_bs,lang_ca,lang_ce,lang_cn,lang_cs,lang_cy,lang_da,lang_de,lang_dz,lang_el,lang_en,lang_eo,lang_es,lang_et,lang_eu,lang_fa,lang_ff,lang_fi,lang_fr,lang_ga,lang_gl,lang_gu,lang_ha,lang_he,lang_hi,lang_hr,lang_ht,lang_hu,lang_hy,lang_id,lang_is,lang_it,lang_iu,lang_ja,lang_jv,lang_ka,lang_kk,lang_kl,lang_km,lang_kn,lang_ko,lang_ks,lang_ku,lang_ky,lang_la,lang_lb,lang_lo,lang_lt,lang_lv,lang_mg,lang_mi,lang_mk,lang_ml,lang_mn,lang_mo,lang_mr,lang_ms,lang_mt,lang_my,lang_nb,lang_ne,lang_nl,lang_no,lang_ny,lang_oc,lang_pa,lang_pl,lang_ps,lang_pt,lang_qu,lang_ro,lang_ru,lang_rw,lang_se,lang_sh,lang_si,lang_sk,lang_sl,lang_so,lang_sq,lang_sr,lang_st,lang_su,lang_sv,lang_sw,lang_ta,lang_te,lang_tg,lang_th,lang_tl,lang_tn,lang_tr,lang_uk,lang_ur,lang_uz,lang_vi,lang_wo,lang_xh,lang_xx,lang_yi,lang_zh,lang_zu,_,_nm0000035,_nm0000093,_nm0000113,_nm0000115,_nm0000116,_nm0000123,_nm0000128,_nm0000129,_nm0000134,_nm0000136,_nm0000138,_nm0000139,_nm0000146,_nm0000148,_nm0000151,_nm0000158,_nm0000168,_nm0000179,_nm0000184,_nm0000191,_nm0000194,_nm0000198,_nm0000199,_nm0000204,_nm0000206,_nm0000217,_nm0000226,_nm0000229,_nm0000233,_nm0000243,_nm0000244,_nm0000246,_nm0000255,_nm0000288,_nm0000307,_nm0000318,_nm0000323,_nm0000332,_nm0000353,_nm0000354,_nm0000355,_nm0000375,_nm0000384,_nm0000399,_nm0000401,_nm0000422,_nm0000437,_nm0000474,_nm0000553,_nm0000631,_nm0000686,_nm0000705,_nm0000881,_nm0000949,_nm0000982,_nm0000988,_nm0001060,_nm0001392,_nm0001401,_nm0001426,_nm0001570,_nm0001741,_nm00
01774,_nm0001804,_nm0001873,_nm0001877,_nm0001937,_nm0001980,_nm0002353,_nm0002354,_nm0003911,_nm0004056,_nm0004170,_nm0004581,_nm0004874,_nm0004937,_nm0004976,_nm0005023,_nm0005086,_nm0005212,_nm0005271,_nm0005351,_nm0005428,_nm0006035,_nm0006133,_nm0006290,_nm0006293,_nm0006904,_nm0009190,_nm0010736,_nm0032696,_nm0061045,_nm0065100,_nm0085312,_nm0089217,_nm0117290,_nm0149556,_nm0177896,_nm0185819,_nm0225146,_nm0230032,_nm0252230,_nm0252961,_nm0254645,_nm0262635,_nm0269463,_nm0270559,_nm0290556,_nm0315974,_nm0326040,_nm0330428,_nm0331516,_nm0342488,_nm0350453,_nm0362766,_nm0382268,_nm0413011,_nm0413168,_nm0424060,_nm0432725,_nm0454752,_nm0456158,_nm0460141,_nm0462895,_nm0476064,_nm0498278,_nm0517589,_nm0550881,_nm0564215,_nm0586969,_nm0605775,_nm0634240,_nm0649460,_nm0650038,_nm0662748,_nm0670408,_nm0694173,_nm0695435,_nm0705356,_nm0719637,_nm0736622,_nm0744429,_nm0746273,_nm0746830,_nm0748784,_nm0749263,_nm0757855,_nm0785227,_nm0795682,_nm0799777,_nm0811583,_nm0835016,_nm0836121,_nm0842770,_nm0858799,_nm0876138,_nm0914612,_nm1046097,_nm1055413,_nm1165110,_nm1176985,_nm1209966,_nm1212722,_nm1297015,_nm1334526,_nm1706767,_nm1727304,_nm2003463,_nm2225369,_nm2273444,_nm2554352,_nm2933757,_nm3234869
0,tt0000591,L'enfant prodigue,1907,90.0,31.0,5.6,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,tt0001285,La Vie de Moïse,1909,50.0,63.0,5.5,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,tt0001614,Les Quatre Diables,1911,60.0,43.0,6.5,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,tt0001790,Les misérables - Époque 1: Jean Valjean,1913,60.0,57.0,6.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,tt0002130,L'Enfer,1911,71.0,3702.0,7.0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69603,tt9911196,De Beentjes van Sint-Hildegard,2020,103.0,3416.0,7.4,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
69604,tt9913084,Diabolik sono io,2019,75.0,56.0,6.7,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
69605,tt9913936,Le paradis de Diego,2019,135.0,61.0,7.4,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
69606,tt9915790,Bobbyr Bondhura,2019,106.0,44.0,7.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [166]:
# Build the feature matrix used to train the model on every film:
# keep only the numeric columns of df_aggregated, then standardise them.
df_numeric = df_aggregated.select_dtypes(include='number')
scaler = StandardScaler()
scaler.fit(df_numeric)
X_scaled = scaler.transform(df_numeric)

In [167]:
# Print every feature name next to its positional index, to help pick column
# positions by hand (the next cell selects columns by such indices).
# NOTE(review): this list is maintained by hand — presumably it mirrors the
# column order of df_numeric; verify, or iterate over df_numeric.columns instead.
liste = [
    # numeric features
    'startYear', 'runtimeMinutes', 'numVotes', 'averageRating',
    # genre indicators
    'genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Biography',
    'genre_Comedy', 'genre_Crime', 'genre_Documentary', 'genre_Drama',
    'genre_Family', 'genre_Fantasy', 'genre_Film-Noir', 'genre_History',
    'genre_Horror', 'genre_Music', 'genre_Musical', 'genre_Mystery',
    'genre_News', 'genre_Romance', 'genre_Sci-Fi', 'genre_Sport',
    'genre_Thriller', 'genre_Unknown', 'genre_War', 'genre_Western',
    # original-language indicators
    'lang_af', 'lang_am', 'lang_ar', 'lang_as', 'lang_ay', 'lang_az', 'lang_be', 'lang_bg', 'lang_bm', 'lang_bn',
    'lang_bo', 'lang_bs', 'lang_ca', 'lang_ce', 'lang_cn', 'lang_cs', 'lang_cy', 'lang_da', 'lang_de', 'lang_dz',
    'lang_el', 'lang_en', 'lang_eo', 'lang_es', 'lang_et', 'lang_eu', 'lang_fa', 'lang_ff', 'lang_fi', 'lang_fr',
    'lang_ga', 'lang_gl', 'lang_gu', 'lang_ha', 'lang_he', 'lang_hi', 'lang_hr', 'lang_ht', 'lang_hu', 'lang_hy',
    'lang_id', 'lang_is', 'lang_it', 'lang_iu', 'lang_ja', 'lang_jv', 'lang_ka', 'lang_kk', 'lang_kl', 'lang_km',
    'lang_kn', 'lang_ko', 'lang_ks', 'lang_ku', 'lang_ky', 'lang_la', 'lang_lb', 'lang_lo', 'lang_lt', 'lang_lv',
    'lang_mg', 'lang_mi', 'lang_mk', 'lang_ml', 'lang_mn', 'lang_mo', 'lang_mr', 'lang_ms', 'lang_mt', 'lang_my',
    'lang_nb', 'lang_ne', 'lang_nl', 'lang_no', 'lang_ny', 'lang_oc', 'lang_pa', 'lang_pl', 'lang_ps', 'lang_pt',
    'lang_qu', 'lang_ro', 'lang_ru', 'lang_rw', 'lang_se', 'lang_sh', 'lang_si', 'lang_sk', 'lang_sl', 'lang_so',
    'lang_sq', 'lang_sr', 'lang_st', 'lang_su', 'lang_sv', 'lang_sw', 'lang_ta', 'lang_te', 'lang_tg', 'lang_th',
    'lang_tl', 'lang_tn', 'lang_tr', 'lang_uk', 'lang_ur', 'lang_uz', 'lang_vi', 'lang_wo', 'lang_xh', 'lang_xx',
    'lang_yi', 'lang_zh', 'lang_zu',
]
# enumerate yields the same "index name" lines as a hand-maintained counter.
for position, element in enumerate(liste):
    print(position, element)

0 startYear
1 runtimeMinutes
2 numVotes
3 averageRating
4 genre_Action
5 genre_Adventure
6 genre_Animation
7 genre_Biography
8 genre_Comedy
9 genre_Crime
10 genre_Documentary
11 genre_Drama
12 genre_Family
13 genre_Fantasy
14 genre_Film-Noir
15 genre_History
16 genre_Horror
17 genre_Music
18 genre_Musical
19 genre_Mystery
20 genre_News
21 genre_Romance
22 genre_Sci-Fi
23 genre_Sport
24 genre_Thriller
25 genre_Unknown
26 genre_War
27 genre_Western
28 lang_af
29 lang_am
30 lang_ar
31 lang_as
32 lang_ay
33 lang_az
34 lang_be
35 lang_bg
36 lang_bm
37 lang_bn
38 lang_bo
39 lang_bs
40 lang_ca
41 lang_ce
42 lang_cn
43 lang_cs
44 lang_cy
45 lang_da
46 lang_de
47 lang_dz
48 lang_el
49 lang_en
50 lang_eo
51 lang_es
52 lang_et
53 lang_eu
54 lang_fa
55 lang_ff
56 lang_fi
57 lang_fr
58 lang_ga
59 lang_gl
60 lang_gu
61 lang_ha
62 lang_he
63 lang_hi
64 lang_hr
65 lang_ht
66 lang_hu
67 lang_hy
68 lang_id
69 lang_is
70 lang_it
71 lang_iu
72 lang_ja
73 lang_jv
74 lang_ka
75 lang_kk
76 lang_kl
77 lang_km

In [168]:
# Experiment: give the genre indicator columns more weight in the distance
# computation by scaling them up after standardisation.
GENRE_WEIGHT = 5

# Positions of the weighted columns in X_scaled.
# NOTE(review): index 22 (genre_Sci-Fi in the index listing printed above) is
# skipped while 23 and 24 are included, and 25-27 are left unweighted —
# confirm this is intended.
cols_indices = [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24]

# Start again from the fitted scaler's output so that re-running this cell
# applies the weight exactly once instead of compounding it (x5, x25, ...).
X_scaled = scaler.transform(df_numeric)
X_scaled[:, cols_indices] *= GENRE_WEIGHT


In [169]:
# Trying out distance metrics and numbers of neighbours.
from sklearn.neighbors import NearestNeighbors

# Settings of the neighbour search: 60 neighbours per query, Euclidean distance.
knn_settings = {'n_neighbors': 60, 'metric': 'euclidean'}

# Create the NearestNeighbors model and fit it on the scaled feature matrix.
nn = NearestNeighbors(**knn_settings)
nn.fit(X_scaled)

In [170]:
# Preview df_test, the frame the French titles are looked up in further down.
# The preview below shows the same tconst on several rows.
df_test

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genre_imdb,frenchTitle,averageRating,numVotes,original_language,production_countries,nconst
0,tt0000591,The Prodigal Son,L'enfant prodigue,1907,90.0,Drama,L'enfant prodigue,5.6,31.0,fr,['FR'],
1,tt0000591,The Prodigal Son,L'enfant prodigue,1907,90.0,Drama,L'enfant prodigue,5.6,31.0,fr,['FR'],
2,tt0000591,The Prodigal Son,L'enfant prodigue,1907,90.0,Drama,L'enfant prodigue,5.6,31.0,fr,['FR'],
3,tt0000591,The Prodigal Son,L'enfant prodigue,1907,90.0,Drama,L'enfant prodigue,5.6,31.0,fr,['FR'],
4,tt0000591,The Prodigal Son,L'enfant prodigue,1907,90.0,Drama,L'enfant prodigue,5.6,31.0,fr,['FR'],
...,...,...,...,...,...,...,...,...,...,...,...,...
1978367,tt9916362,Coven,Akelarre,2020,92.0,History,Les sorcières d'Akelarre,6.4,5990.0,eu,"['AR', 'ES', 'FR']",
1978368,tt9916362,Coven,Akelarre,2020,92.0,Drama,Les sorcières d'Akelarre,6.4,5990.0,eu,"['AR', 'ES', 'FR']",
1978369,tt9916362,Coven,Akelarre,2020,92.0,History,Les sorcières d'Akelarre,6.4,5990.0,eu,"['AR', 'ES', 'FR']",
1978370,tt9916362,Coven,Akelarre,2020,92.0,Drama,Les sorcières d'Akelarre,6.4,5990.0,eu,"['AR', 'ES', 'FR']",


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Film to find recommendations for, and the title-similarity floor: TF-IDF
# scores below it are reset to 0 so they do not influence the ranking.
TCONST_REFERENCE = 'tt1201607'
SIMILARITY_THRESHOLD = 0.25

# Row POSITION of the reference film. X_scaled and the neighbour indices are
# positional, so the position is computed directly instead of taking the index
# label (the two differ whenever df_aggregated has a non-default index).
matching_positions = np.flatnonzero(df_aggregated['tconst'].to_numpy() == TCONST_REFERENCE)
if matching_positions.size == 0:
    raise ValueError(f"tconst {TCONST_REFERENCE!r} not found in df_aggregated")
film_index = int(matching_positions[0])

# Nearest neighbours of the reference film in the scaled feature space.
distances, indices = nn.kneighbors(X_scaled[film_index].reshape(1, -1))

# Drop the reference film itself wherever it appears in the result; it is not
# guaranteed to come first when other films sit at the same distance.
is_other_film = indices[0] != film_index
neighbour_distances = distances[0][is_other_film]
neighbour_tconsts = df_aggregated['tconst'].iloc[indices[0][is_other_film]].to_numpy()

# Resolve all the French titles with a single pass over df_test (first row per
# tconst) instead of filtering the whole frame once per neighbour.
wanted_tconsts = [TCONST_REFERENCE, *neighbour_tconsts]
title_by_tconst = (
    df_test.loc[df_test['tconst'].isin(wanted_tconsts), ['tconst', 'frenchTitle']]
    .drop_duplicates('tconst')
    .set_index('tconst')['frenchTitle']
)

# Title of the reference film.
film_reference = title_by_tconst.loc[TCONST_REFERENCE]

print(f"Films les plus similaires pour {film_reference}:")

# One record per neighbour, in the order returned by kneighbors.
films_similaires = [
    {
        "tconst": tconst,
        "frenchTitle": title_by_tconst.loc[tconst],
        "distance": distance,
    }
    for tconst, distance in zip(neighbour_tconsts, neighbour_distances)
]
df_resultats = pd.DataFrame(films_similaires)

# Cosine similarity between the titles' TF-IDF vectors; row 0 is the reference film.
titles = [film_reference] + df_resultats['frenchTitle'].tolist()
tfidf_matrix = TfidfVectorizer().fit_transform(titles)
similarity_matrix = cosine_similarity(tfidf_matrix)

# Scores of every neighbour against the reference film (entry 0 is the film itself).
similarity_scores = similarity_matrix[0][1:]

# Store the scores, zeroing out those under the threshold.
df_resultats["similarité"] = np.where(
    similarity_scores < SIMILARITY_THRESHOLD, 0, similarity_scores
)

# Rank by title similarity first (highest first), then by feature distance (closest first).
df_resultats = df_resultats.sort_values(by=["similarité", "distance"], ascending=[False, True])

# Show the ten best results.
display(df_resultats.head(10))

Films les plus similaires pour Harry Potter et les Reliques de la Mort : partie 2:


Unnamed: 0,tconst,frenchTitle,distance,similarité
0,tt0926084,Harry Potter et les Reliques de la Mort : part...,68.778691,1.0
4,tt0330373,Harry Potter et la Coupe de feu,104.839978,0.448913
3,tt0295297,Harry Potter et la Chambre des secrets,88.109046,0.35911
1,tt0417741,Harry Potter et le Prince de sang-mêlé,77.639459,0.314419
5,tt0304141,Harry Potter et le Prisonnier d'Azkaban,116.466599,0.288581
6,tt0373889,Harry Potter et l'Ordre du Phénix,127.187933,0.278963
13,tt2771200,La Belle et la Bête,164.505869,0.270555
2,tt0241527,Harry Potter à l'école des sorciers,88.049984,0.0
7,tt3183660,Les Animaux fantastiques,130.539019,0.0
8,tt4123430,Les Animaux fantastiques : Les Crimes de Grind...,135.284295,0.0


In [172]:
# Show the result table (sorted by title similarity, then by distance).
df_resultats

Unnamed: 0,tconst,frenchTitle,distance,similarité
0,tt0926084,Harry Potter et les Reliques de la Mort : part...,68.778691,1.0
4,tt0330373,Harry Potter et la Coupe de feu,104.839978,0.448913
3,tt0295297,Harry Potter et la Chambre des secrets,88.109046,0.35911
1,tt0417741,Harry Potter et le Prince de sang-mêlé,77.639459,0.314419
5,tt0304141,Harry Potter et le Prisonnier d'Azkaban,116.466599,0.288581
6,tt0373889,Harry Potter et l'Ordre du Phénix,127.187933,0.278963
13,tt2771200,La Belle et la Bête,164.505869,0.270555
2,tt0241527,Harry Potter à l'école des sorciers,88.049984,0.0
7,tt3183660,Les Animaux fantastiques,130.539019,0.0
8,tt4123430,Les Animaux fantastiques : Les Crimes de Grind...,135.284295,0.0


In [173]:
# Save the aggregated feature table to disk. to_csv is called with its defaults,
# so the DataFrame index is written as the first (unnamed) column.
df_aggregated.to_csv('aggregated.csv')

In [174]:
#Code pour recherche le titre d'un ou plusieurs films
df_aggregated[(df_aggregated['frenchTitle'].str.contains('harry', case=False, na=False))|(df_aggregated['frenchTitle'].str.contains('ffq<fqzfzq', case=False, na=False))].sort_values('numVotes', ascending=False).head(7)

Unnamed: 0,tconst,frenchTitle,startYear,runtimeMinutes,numVotes,averageRating,genre_Action,genre_Adventure,genre_Animation,genre_Biography,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Family,genre_Fantasy,genre_Film-Noir,genre_History,genre_Horror,genre_Music,genre_Musical,genre_Mystery,genre_News,genre_Romance,genre_Sci-Fi,genre_Sport,genre_Thriller,genre_Unknown,genre_War,genre_Western,lang_af,lang_am,lang_ar,lang_as,lang_ay,lang_az,lang_be,lang_bg,lang_bm,lang_bn,lang_bo,lang_bs,lang_ca,lang_ce,lang_cn,lang_cs,lang_cy,lang_da,lang_de,lang_dz,lang_el,lang_en,lang_eo,lang_es,lang_et,lang_eu,lang_fa,lang_ff,lang_fi,lang_fr,lang_ga,lang_gl,lang_gu,lang_ha,lang_he,lang_hi,lang_hr,lang_ht,lang_hu,lang_hy,lang_id,lang_is,lang_it,lang_iu,lang_ja,lang_jv,lang_ka,lang_kk,lang_kl,lang_km,lang_kn,lang_ko,lang_ks,lang_ku,lang_ky,lang_la,lang_lb,lang_lo,lang_lt,lang_lv,lang_mg,lang_mi,lang_mk,lang_ml,lang_mn,lang_mo,lang_mr,lang_ms,lang_mt,lang_my,lang_nb,lang_ne,lang_nl,lang_no,lang_ny,lang_oc,lang_pa,lang_pl,lang_ps,lang_pt,lang_qu,lang_ro,lang_ru,lang_rw,lang_se,lang_sh,lang_si,lang_sk,lang_sl,lang_so,lang_sq,lang_sr,lang_st,lang_su,lang_sv,lang_sw,lang_ta,lang_te,lang_tg,lang_th,lang_tl,lang_tn,lang_tr,lang_uk,lang_ur,lang_uz,lang_vi,lang_wo,lang_xh,lang_xx,lang_yi,lang_zh,lang_zu,_,_nm0000035,_nm0000093,_nm0000113,_nm0000115,_nm0000116,_nm0000123,_nm0000128,_nm0000129,_nm0000134,_nm0000136,_nm0000138,_nm0000139,_nm0000146,_nm0000148,_nm0000151,_nm0000158,_nm0000168,_nm0000179,_nm0000184,_nm0000191,_nm0000194,_nm0000198,_nm0000199,_nm0000204,_nm0000206,_nm0000217,_nm0000226,_nm0000229,_nm0000233,_nm0000243,_nm0000244,_nm0000246,_nm0000255,_nm0000288,_nm0000307,_nm0000318,_nm0000323,_nm0000332,_nm0000353,_nm0000354,_nm0000355,_nm0000375,_nm0000384,_nm0000399,_nm0000401,_nm0000422,_nm0000437,_nm0000474,_nm0000553,_nm0000631,_nm0000686,_nm0000705,_nm0000881,_nm0000949,_nm0000982,_nm0000988,_nm0001060,_nm0001392,_nm0001401,_nm0001426,_nm0001570,_nm0001741,_nm00
01774,_nm0001804,_nm0001873,_nm0001877,_nm0001937,_nm0001980,_nm0002353,_nm0002354,_nm0003911,_nm0004056,_nm0004170,_nm0004581,_nm0004874,_nm0004937,_nm0004976,_nm0005023,_nm0005086,_nm0005212,_nm0005271,_nm0005351,_nm0005428,_nm0006035,_nm0006133,_nm0006290,_nm0006293,_nm0006904,_nm0009190,_nm0010736,_nm0032696,_nm0061045,_nm0065100,_nm0085312,_nm0089217,_nm0117290,_nm0149556,_nm0177896,_nm0185819,_nm0225146,_nm0230032,_nm0252230,_nm0252961,_nm0254645,_nm0262635,_nm0269463,_nm0270559,_nm0290556,_nm0315974,_nm0326040,_nm0330428,_nm0331516,_nm0342488,_nm0350453,_nm0362766,_nm0382268,_nm0413011,_nm0413168,_nm0424060,_nm0432725,_nm0454752,_nm0456158,_nm0460141,_nm0462895,_nm0476064,_nm0498278,_nm0517589,_nm0550881,_nm0564215,_nm0586969,_nm0605775,_nm0634240,_nm0649460,_nm0650038,_nm0662748,_nm0670408,_nm0694173,_nm0695435,_nm0705356,_nm0719637,_nm0736622,_nm0744429,_nm0746273,_nm0746830,_nm0748784,_nm0749263,_nm0757855,_nm0785227,_nm0795682,_nm0799777,_nm0811583,_nm0835016,_nm0836121,_nm0842770,_nm0858799,_nm0876138,_nm0914612,_nm1046097,_nm1055413,_nm1165110,_nm1176985,_nm1209966,_nm1212722,_nm1297015,_nm1334526,_nm1706767,_nm1727304,_nm2003463,_nm2225369,_nm2273444,_nm2554352,_nm2933757,_nm3234869
46449,tt1201607,Harry Potter et les Reliques de la Mort : part...,2011,130.0,982761.0,8.1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
35512,tt0241527,Harry Potter à l'école des sorciers,2001,152.0,891829.0,7.7,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
37270,tt0295297,Harry Potter et la Chambre des secrets,2002,161.0,719140.0,7.4,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
37545,tt0304141,Harry Potter et le Prisonnier d'Azkaban,2004,142.0,718799.0,7.9,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
38241,tt0330373,Harry Potter et la Coupe de feu,2005,157.0,707460.0,7.7,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
39139,tt0373889,Harry Potter et l'Ordre du Phénix,2007,138.0,657212.0,7.5,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
42897,tt0926084,Harry Potter et les Reliques de la Mort : part...,2010,146.0,622182.0,7.7,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Pickle

In [175]:
# Serialise the fitted NearestNeighbors model to disk.
# NOTE(review): only `nn` is saved here — not `scaler` nor the column weighting
# applied to X_scaled; confirm that whatever loads this file does not need them.
with open('model.pickle', mode='wb') as model_file:
    pickle.dump(nn, model_file)

In [176]:
# Called without query points, kneighbors returns, for every sample the model
# was fitted on, the positions of its n_neighbors nearest samples (the sample
# itself is not counted as its own neighbour). return_distance=False drops the
# distances and keeps only the index array.
nn.kneighbors(return_distance=False)

array([[  215, 34304,   157, ...,  1167, 30584, 33236],
       [ 3169,  6208,  4529, ..., 34812, 31137, 22818],
       [   49,    76,    60, ...,   973,   403,   968],
       ...,
       [55885, 58206, 60144, ..., 39419, 56967, 10729],
       [19954, 63325, 11759, ..., 26010, 38484, 30928],
       [64229, 68719, 26081, ..., 26393,  7596, 10200]], shape=(69608, 60))