# Préparation pour le Machine Learning

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

df_movies = pd.read_csv("../BD_A_IGNORE/movies.csv", sep=',', index_col=0)
df_movies = df_movies.drop(columns=["level_0", "index"])
df_movies = df_movies[df_movies['titleType'] == "movie"]
df_movies.head(3)

Unnamed: 0,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,writers,averageRating,...,spoken_languages,vote_average,vote_count,actors_name,actors_rank,directors_name,writers_name,producers_name,cinematographers_name,editors_name
91,tt0000574,movie,The Story of the Kelly Gang,False,1906,70.0,"Action,Adventure,Biography",nm0846879,nm0846879,6.0,...,['xx'],5.375,28,"['Elizabeth Tait', 'Bella Cola', 'Charles Tait']","[1, 6, 11]",['Charles Tait'],['Charles Tait'],"['W.A. Gibson', 'Millard Johnson', 'John Tait'...","['Millard Johnson', 'Orrie Perry', 'Reg Perry']",Unknown
97,tt0000591,movie,L'enfant prodigue,False,1907,90.0,Drama,nm0141150,nm0141150,5.7,...,['xx'],0.0,0,"['Christiane Mandelys', 'Gilberte Sergy', 'Mic...","[3, 4, 5]",['Michel Carré'],['Michel Carré'],Unknown,Unknown,Unknown
241,tt0001184,movie,Don Juan de Serrallonga,False,1910,58.0,"Adventure,Drama","nm0063413,nm0550220",nm0049370,3.8,...,['xx'],1.0,1,"['Dolores Puchol', 'Ricardo de Baños', 'Albert...","[1, 3, 4]","['Ricardo de Baños', 'Alberto Marro']",['Víctor Balaguer'],Unknown,['Ramón de Baños'],Unknown


In [2]:
import joblib
joblib.dump(df_movies, "df_movies.pkl")

['df_movies.pkl']

In [12]:
df_movies['titleType'].value_counts()

titleType
movie    191270
Name: count, dtype: int64

In [13]:
# informations générales de la base
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 191270 entries, 91 to 235135
Data columns (total 30 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   tconst                 191270 non-null  object 
 1   titleType              191270 non-null  object 
 2   originalTitle          191270 non-null  object 
 3   isAdult                191270 non-null  bool   
 4   startYear              191270 non-null  int64  
 5   runtimeMinutes         191270 non-null  float64
 6   genres                 191270 non-null  object 
 7   directors              191270 non-null  object 
 8   writers                191270 non-null  object 
 9   averageRating          191270 non-null  float64
 10  numVotes               191270 non-null  float64
 11  periode                191270 non-null  int64  
 12  budget                 191270 non-null  int64  
 13  id                     191270 non-null  int64  
 14  original_language      191270 non-null  

In [14]:
# check valeurs manquantes
df_movies.isna().sum()

tconst                   0
titleType                0
originalTitle            0
isAdult                  0
startYear                0
runtimeMinutes           0
genres                   0
directors                0
writers                  0
averageRating            0
numVotes                 0
periode                  0
budget                   0
id                       0
original_language        0
overview                 0
popularity               0
poster_path              0
production_countries     0
revenue                  0
spoken_languages         0
vote_average             0
vote_count               0
actors_name              0
actors_rank              0
directors_name           0
writers_name             0
producers_name           0
cinematographers_name    0
editors_name             0
dtype: int64

In [15]:
# descriptif statistiques
df_movies.describe()

Unnamed: 0,startYear,runtimeMinutes,averageRating,numVotes,periode,budget,id,popularity,revenue,vote_average,vote_count
count,191270.0,191270.0,191270.0,191270.0,191270.0,191270.0,191270.0,191270.0,191270.0,191270.0,191270.0
mean,1994.195399,94.9724,6.022176,5985.674,1989.746798,1379140.0,354951.1,3.511111,3615819.0,4.501696,100.6378
std,26.603984,21.916737,1.218017,46927.97,26.836747,10340810.0,297889.2,26.745629,38323560.0,2.743128,744.444926
min,1903.0,1.0,1.0,5.0,1910.0,0.0,2.0,0.6,0.0,0.0,0.0
25%,1977.0,83.0,5.3,46.0,1970.0,0.0,98328.75,0.731,0.0,2.4,1.0
50%,2005.0,92.0,6.2,177.0,2000.0,0.0,280516.5,1.365,0.0,5.4,4.0
75%,2016.0,104.0,6.8,793.0,2010.0,0.0,536473.8,2.655,0.0,6.5,14.0
max,2025.0,240.0,10.0,2959184.0,2020.0,579330400.0,1122413.0,5089.969,2923706000.0,10.0,33630.0


## Transformation de la colonne "genres" : extraire les genres uniques et les transformer en colonnes booléenes

#### Fonction transformant la chaine de films en liste et création de la colonne genres_liste

In [33]:
def split_chaine_en_liste(x):
    if isinstance(x, str): # si x est une chaine de caractère, 
        return x.split(',') # applique le split pour transformer la chaine en liste de plusieurs chaines de caractères
    else:
        return x # sinon retourne x tel quel


In [34]:
df_movies['genres_liste'] = df_movies['genres'].apply(split_chaine_en_liste)
df_movies['genres_liste']

91        [Action, Adventure, Biography]
97                               [Drama]
241                   [Adventure, Drama]
323                            [Unknown]
346                              [Drama]
                       ...              
235131                          [Family]
235132                     [Documentary]
235133     [Action, Adventure, Thriller]
235134                        [Thriller]
235135                  [Drama, History]
Name: genres_liste, Length: 191270, dtype: object

#### Extraction des genres uniques dans toutes les listes

In [35]:
# extraire les genres uniques 
tous_les_genres = set()
for genres in df_movies['genres_liste']:
  tous_les_genres.update(genres)

tous_les_genres

{'Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Talk-Show',
 'Thriller',
 'Unknown',
 'War',
 'Western'}

#### Création d'autant de colonnes binaires que de genres uniques

In [36]:
import warnings
warnings.filterwarnings("ignore") 

# créer des colonnes binaires pour chaque genre unique
for genre in tous_les_genres:
    def genre_present(x):
        return int(genre in x)
    
    df_movies[f'genre_{genre}'] = df_movies['genres_liste'].apply(genre_present)

df_movies.columns

Index(['tconst', 'titleType', 'originalTitle', 'isAdult', 'startYear',
       'runtimeMinutes', 'genres', 'directors', 'writers', 'averageRating',
       'numVotes', 'periode', 'budget', 'id', 'original_language', 'overview',
       'popularity', 'poster_path', 'production_countries', 'revenue',
       'spoken_languages', 'vote_average', 'vote_count', 'actors_name',
       'actors_rank', 'directors_name', 'writers_name', 'producers_name',
       'cinematographers_name', 'editors_name', 'genres_liste', 'genre_Crime',
       'genre_Talk-Show', 'genre_Romance', 'genre_History', 'genre_Animation',
       'genre_Adult', 'genre_Family', 'genre_Musical', 'genre_Western',
       'genre_Mystery', 'genre_Biography', 'genre_Adventure', 'genre_Comedy',
       'genre_Sci-Fi', 'genre_War', 'genre_Music', 'genre_Drama',
       'genre_Reality-TV', 'genre_Sport', 'genre_Fantasy', 'genre_Game-Show',
       'genre_Horror', 'genre_Documentary', 'genre_Film-Noir', 'genre_Unknown',
       'genre_Thriller', 

In [37]:
df_movies.sample(5)

Unnamed: 0,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,writers,averageRating,...,genre_Sport,genre_Fantasy,genre_Game-Show,genre_Horror,genre_Documentary,genre_Film-Noir,genre_Unknown,genre_Thriller,genre_Action,genre_News
154138,tt1677595,movie,Ringu ringu ringu: Namida no chanpion beruto,False,1993,94.0,"Drama,Sport",nm0473791,nm0875337,6.5,...,1,0,0,0,0,0,0,0,0,0
152580,tt1637709,movie,O thanatos pou onireftika,False,2010,90.0,"Drama,Horror,Romance",nm0470384,"nm0434511,nm0470384",3.7,...,0,0,0,1,0,0,0,0,0,0
41263,tt0080834,movie,Hamesh Hamesh,False,1980,89.0,"Comedy,Musical",nm0408114,"nm0408114,nm3256772",6.2,...,0,0,0,0,0,0,0,0,0,0
112098,tt0867464,movie,Clean Break,False,2008,93.0,"Drama,Thriller",nm0539354,"nm0354413,nm2393651,nm0539354",4.2,...,0,0,0,0,0,0,0,1,0,0
108629,tt0493076,movie,Nina,False,2016,90.0,"Biography,Drama,Music",nm0607725,nm0607725,5.4,...,0,0,0,0,0,0,0,0,0,0


# Base données pour ML

In [38]:
df_movies_new = df_movies[['isAdult', 'runtimeMinutes', 'averageRating', 'numVotes', 'periode', 'budget', 'overview', 'popularity', 
                           'genre_Film-Noir', 'genre_Sci-Fi', 'genre_History', 'genre_Reality-TV', 'genre_News', 'genre_Comedy', 
                           'genre_Mystery', 'genre_Fantasy', 'genre_Family', 'genre_Crime', 'genre_Talk-Show',
                            'genre_Sport', 'genre_Animation', 'genre_Biography', 'genre_Adventure', 'genre_Unknown', 'genre_Game-Show', 
                            'genre_Adult', 'genre_Music', 'genre_War', 'genre_Romance', 'genre_Action', 'genre_Western', 'genre_Horror', 
                            'genre_Documentary', 'genre_Musical', 'genre_Drama', 'genre_Thriller']]

df_movies_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 191270 entries, 91 to 235135
Data columns (total 36 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   isAdult            191270 non-null  bool   
 1   runtimeMinutes     191270 non-null  float64
 2   averageRating      191270 non-null  float64
 3   numVotes           191270 non-null  float64
 4   periode            191270 non-null  int64  
 5   budget             191270 non-null  int64  
 6   overview           191270 non-null  object 
 7   popularity         191270 non-null  float64
 8   genre_Film-Noir    191270 non-null  int64  
 9   genre_Sci-Fi       191270 non-null  int64  
 10  genre_History      191270 non-null  int64  
 11  genre_Reality-TV   191270 non-null  int64  
 12  genre_News         191270 non-null  int64  
 13  genre_Comedy       191270 non-null  int64  
 14  genre_Mystery      191270 non-null  int64  
 15  genre_Fantasy      191270 non-null  int64  
 16  genre_

#### Encodage des colonnes pour entrainement

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # limite à 5000 mots pour éviter trop de dimensions
tfidf_matrix = tfidf_vectorizer.fit_transform(df_movies_new['overview'])
tfidf_matrix.shape


(191270, 5000)

In [40]:
from scipy.sparse import hstack

features = ['runtimeMinutes', 'averageRating', 'numVotes', 'popularity', 'budget'] + \
           [col for col in df_movies_new.columns if col.startswith('genre_')]

# encodage des valeurs numériques
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_movies_new[features])

X_final = hstack([X_scaled, tfidf_matrix]) # fusionner les caractéristiques numériques et la matrice TF-IDF
X_final.shape

(191270, 5033)

# Créer et entraîner le modèle KNN

In [41]:
knn = NearestNeighbors(n_neighbors=10, metric='cosine')  # On utilise la similarité cosinus pour le texte
knn.fit(X_final)

# Fonction de recommandation des films

In [42]:
def recommander_films(film_titre, df_movies, df_movies_new, features, scaler, tfidf_vectorizer, knn):
    """
    Trouve des films similaires à celui donné en entrée en utilisant KNN.
    
    - film_titre : titre du film de référence
    - df_movies : DataFrame contenant les informations des films
    - df_movies_new : DataFrame utilisé pour entraîner le modèle KNN
    - features : colonnes utilisées pour KNN
    - scaler : StandardScaler entraîné
    - tfidf_vectorizer : modèle TF-IDF entraîné
    - knn : modèle KNN entraîné
    """
    
    # 📌 Vérifier si le film est bien dans df_movies
    if film_titre not in df_movies['originalTitle'].values:
        return "Film non trouvé dans la base."

    # 📌 Trouver l’index du film dans df_movies
    film_index_movies = df_movies[df_movies['originalTitle'] == film_titre].index[0]

    # 📌 Vérifier que cet index existe dans df_movies_new
    if film_index_movies >= len(df_movies_new):
        return "Correspondance introuvable dans les données d'entraînement."

    # 📌 Extraire ses caractéristiques et standardiser
    film_features = df_movies_new.loc[film_index_movies, features].values.reshape(1, -1)
    film_features_scaled = scaler.transform(film_features)

    # 📌 Transformer `overview` en vecteur TF-IDF
    film_overview = df_movies.loc[film_index_movies, 'overview']
    film_overview_tfidf = tfidf_vectorizer.transform([film_overview])

    # 📌 Fusionner les caractéristiques numériques et le TF-IDF
    film_vector = hstack([film_features_scaled, film_overview_tfidf])

    # 📌 Trouver les films les plus proches
    distances, indices = knn.kneighbors(film_vector)

    # 📌 Récupérer les films recommandés
    recommandations = df_movies.iloc[indices[0][1:]]  # Exclure le film lui-même

    # 📌 Retourner les films avec plus d'infos (titre, période, genres, rating, popularité)
    return recommandations[['originalTitle', 'periode', 'averageRating', 'popularity'] + [col for col in df_movies.columns if col.startswith('genre_')]]


In [43]:
verif = df_movies[df_movies['originalTitle'] == 'The Last Tycoon']
verif

Unnamed: 0,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,writers,averageRating,...,genre_Sport,genre_Fantasy,genre_Game-Show,genre_Horror,genre_Documentary,genre_Film-Noir,genre_Unknown,genre_Thriller,genre_Action,genre_News
38220,tt0074777,movie,The Last Tycoon,False,1976,123.0,"Drama,Romance",nm0001415,"nm0280234,nm0056217",6.2,...,0,0,0,0,0,0,0,0,0,0


In [45]:
recommander_films("The Last Tycoon", df_movies, df_movies_new, features, scaler, tfidf_vectorizer, knn)

Unnamed: 0,originalTitle,periode,averageRating,popularity,genre_Crime,genre_Talk-Show,genre_Romance,genre_History,genre_Animation,genre_Adult,...,genre_Sport,genre_Fantasy,genre_Game-Show,genre_Horror,genre_Documentary,genre_Film-Noir,genre_Unknown,genre_Thriller,genre_Action,genre_News
42904,Identificazione di una donna,1980,6.7,3.769,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
176832,The Mistress,2010,6.4,2.126,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57629,Moll Flanders,1990,6.4,4.832,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
68851,L'ennui,1990,6.0,6.937,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24259,Marjorie Morningstar,1950,6.2,1.954,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
91869,AKA,2000,6.3,8.218,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
217696,Sultana Bibiana,2010,6.1,0.6,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
124925,Your Love Song,2020,6.2,2.828,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59323,A Price Above Rubies,1990,6.5,4.437,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
