In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from surprise import SVD, Reader, Dataset, accuracy
from surprise.model_selection import train_test_split
import mlflow
import mlflow.sklearn



# Directory prefixes of the raw datasets (each one ends with a slash so that
# file names can be appended directly).
data_path = "data/"
data_path_tsv = data_path + "IMDB/"
data_path_csv = data_path + "Movie_Lens/ml-20m/"

# Dataset loaders
def open_IMDB_data(tsv_file, data_path_tsv = "data/IMDB/"):
    r"""Read one gzip-compressed, tab-separated IMDB table into a DataFrame.

    Parameters
    ----------
    tsv_file : str
        Table name without the ``.tsv.gz`` suffix (e.g. ``'title.ratings'``).
    data_path_tsv : str
        Prefix placed directly in front of the file name; it is not joined
        with a separator, so it has to end with one.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with the literal ``\N`` read as a missing value.
    """
    archive = f'{data_path_tsv}{tsv_file}.tsv.gz'
    return pd.read_csv(
        archive,
        sep='\t',
        compression='gzip',
        na_values='\\N',
        # Read the file in one pass instead of in internally typed chunks.
        low_memory=False,
    )

def open_Movie_lens_data(csv_file, data_path_csv = "data/Movie_Lens/ml-20m/"):
    """Read one MovieLens CSV table into a DataFrame.

    Parameters
    ----------
    csv_file : str
        Table name without the ``.csv`` suffix (e.g. ``'ratings'``).
    data_path_csv : str
        Prefix placed directly in front of the file name; it is not joined
        with a separator, so it has to end with one.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using the default ``pandas.read_csv`` settings.
    """
    csv_path = f'{data_path_csv}{csv_file}.csv'
    return pd.read_csv(csv_path)

# Load the four IMDB tables, in this order, from the default IMDB directory.
title_ratings, title_crew, title_basics, title_principals = (
    open_IMDB_data(tsv_file=table)
    for table in ('title.ratings', 'title.crew', 'title.basics', 'title.principals')
)

# Load the three MovieLens tables from the default MovieLens directory.
ratings, links, movies = (
    open_Movie_lens_data(table) for table in ('ratings', 'links', 'movies')
)

# Remove the person categories that have many NaNs.
def drop_category():
    """Drop the unwanted credit categories from ``title_principals``.

    The module-level ``title_principals`` DataFrame is modified in place:
    every row whose ``category`` is one of the values listed below is
    removed (rows are dropped by index label). Rows with a missing category
    are kept.

    Returns
    -------
    pandas.DataFrame
        The same ``title_principals`` object, after the rows were removed.
    """
    unwanted_categories = (
        'self',
        'cinematographer',
        'producer',
        'composer',
        'editor',
        'production_designer',
        'archive_footage',
        'archive_sound',
    )
    # Build the mask once instead of scanning the column once per category.
    unwanted_rows = title_principals['category'].isin(unwanted_categories)
    title_principals.drop(index=title_principals.index[unwanted_rows], inplace=True)
    return title_principals

# Apply the category filter. drop_category() edits the frame in place and
# returns that same frame, so this rebinding keeps the name on the same object.
title_principals = drop_category()



    

# Reshape title_principals so that every credit category gets its own column.
def merge_category():
    """Pivot ``title_principals`` to one row per title, one column per category.

    Steps:

    1. Group the module-level ``title_principals`` by ``tconst`` and
       ``category`` and join the ``nconst`` values of each group with a
       single space.
    2. Split the aggregated frame into one frame per category, renaming
       ``nconst`` to the category name.
    3. Outer-merge those frames on ``tconst``, in order of first appearance
       of each category in the aggregated frame.

    Side effects
    ------------
    The module-level ``title_principals`` is rebound to the aggregated
    (step 1) frame, so the module no longer references the raw frame while
    the merges run. The per-category frames are kept local; no extra
    module-level names are created.

    Returns
    -------
    pandas.DataFrame
        A frame with a ``tconst`` column followed by one column per
        category; a title without credits in a category holds NaN there.
        With a single category the result is just that category's frame,
        and with no category at all it only has the ``tconst`` column.
    """
    global title_principals
    # One row per (title, category); person ids are space-separated.
    title_principals = (
        title_principals
        .groupby(['tconst', 'category'])
        .agg({'nconst': lambda person_ids: ' '.join(person_ids)})
        .reset_index()
    )

    # One frame per category. sort=False keeps the order of first appearance,
    # which determines the column order of the result.
    per_category = [
        rows.rename(columns={'nconst': category}).drop('category', axis=1)
        for category, rows in title_principals.groupby('category', sort=False)
    ]

    if not per_category:
        return title_principals[['tconst']].reset_index(drop=True)

    # Fold the per-category frames together so each category becomes a column.
    title_principals_new = per_category[0]
    for category_frame in per_category[1:]:
        title_principals_new = title_principals_new.merge(category_frame, how='outer', on='tconst')
    return title_principals_new



    

# Replace the long-format credits table with the one-column-per-category
# frame returned by merge_category().
title_principals = merge_category()

# Build the combined IMDB frame and the combined MovieLens frame.
def merge_data():
    """Join the IMDB tables together and the MovieLens tables together.

    Reads the module-level frames ``title_ratings``, ``title_crew``,
    ``title_basics``, ``title_principals``, ``ratings``, ``movies`` and
    ``links``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_imdb, df_movie_lens)`` where ``df_imdb`` is the inner join on
        ``tconst`` of the four IMDB frames and ``df_movie_lens`` is the
        inner join on ``movieId`` of the three MovieLens frames.
    """
    # IMDB side: start from the ratings and add the other tables one by one.
    imdb_frame = title_ratings
    for imdb_table in (title_crew, title_basics, title_principals):
        imdb_frame = imdb_frame.merge(right=imdb_table, how='inner', on='tconst')

    # MovieLens side: user ratings enriched with movie metadata and id links.
    lens_frame = ratings
    for lens_table in (movies, links):
        lens_frame = lens_frame.merge(right=lens_table, how='inner', on='movieId')

    return imdb_frame, lens_frame

# Build the two combined frames: all IMDB tables joined on tconst and all
# MovieLens tables joined on movieId.
df_imdb,df_movie_lens = merge_data()


def preprocessing_data(title_type = 'movie', min_start_year = 2000, votes_above = 1000000):
    """Join the MovieLens and IMDB frames, tidy the columns and filter titles.

    Reads the module-level ``df_movie_lens`` and ``df_imdb`` frames.

    Parameters
    ----------
    title_type : str
        Value of ``titleType`` a row must have to be kept.
    min_start_year : int
        Smallest ``startYear`` (inclusive) a row may have to be kept.
    votes_above : int
        Rows are kept only when ``numVotes`` is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        The rows of the merged frame that satisfy all three conditions.

    Side effects
    ------------
    Both module-level frames are modified in place: ``tmdbId`` and
    ``timestamp`` are dropped from ``df_movie_lens``; in ``df_imdb`` the
    ``tconst`` column has its ``tt`` text removed, is cast to int and is
    renamed to ``imdbId``. These steps are skipped when they have already
    been applied, so the function can be called more than once.
    """
    # Drop the tmdbId and timestamp columns (no-op if they are already gone).
    df_movie_lens.drop(columns=['tmdbId', 'timestamp'], errors='ignore', inplace=True)

    if 'tconst' in df_imdb.columns:
        # Turn 'tt0111161'-style ids into integers so they can be matched
        # against the MovieLens imdbId column. The result is assigned back
        # rather than using inplace=True on the column selection, which does
        # not update the parent frame under pandas Copy-on-Write.
        df_imdb['tconst'] = df_imdb['tconst'].replace({'tt': ''}, regex=True).astype('int')
        # Rename tconst to imdbId for the merge below.
        df_imdb.rename(columns={'tconst': 'imdbId'}, inplace=True)

    # Right join: every IMDB row is kept, with or without MovieLens ratings.
    df_merged = df_movie_lens.merge(right=df_imdb, how='right', on='imdbId')

    # Combine actor and actress into one space-separated column. When both
    # are missing the result is a lone ' ', which the replace turns into NaN
    # (the replace applies to every cell of the frame equal to ' ').
    df_merged['actors'] = df_merged['actor'].str.cat(df_merged['actress'], na_rep='', sep=' ')
    df_merged.replace({' ': np.nan}, inplace=True)

    # Drop the columns that are not part of the returned frame.
    df_merged.drop(
        columns=['endYear', 'title', 'originalTitle', 'genres_x', 'isAdult',
                 'actor', 'actress', 'directors', 'writers'],
        inplace=True,
    )

    # Replace ',' with spaces in the IMDB genres (the original author's note
    # says this prepares the column for a TF-IDF vectorizer).
    df_merged['genres_y'] = df_merged['genres_y'].replace({',': ' '}, regex=True)

    # Keep only the requested title type, release years and vote counts.
    selected = (
        (df_merged['titleType'] == title_type)
        & (df_merged['startYear'] >= min_start_year)
        & (df_merged['numVotes'] > votes_above)
    )
    return df_merged[selected]

# Final working frame: the merged MovieLens/IMDB rows that pass the filters
# applied inside preprocessing_data().
df_merged = preprocessing_data()

In [11]:
# Shape of the rows with more than one million votes. preprocessing_data()
# already applies this threshold, so presumably this equals df_merged.shape
# (verify that no intermediate cell modified df_merged).
df_merged[df_merged['numVotes']>1000000].shape

(1469445, 14)