# Préparation pour le Machine Learning

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import joblib
import warnings
warnings.filterwarnings("ignore") 
import pickle

# Load the prepared film catalogue. Forward slashes keep the relative path
# portable and free of the invalid "\B" / "\d" escape sequences; they also
# match the "../BD_A_IGNORE/" artefact paths used further down in this notebook.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
df_movies = joblib.load("../BD_A_IGNORE/df_movies.pkl")
df_movies.head(3)

  df_movies = joblib.load("..\BD_A_IGNORE\df_movies.pkl")


Unnamed: 0,level_0,index,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,...,genre_Fantasy,genre_Game-Show,genre_Unknown,genre_Drama,genre_Horror,genre_Adventure,genre_Animation,genre_Talk-Show,genre_Mystery,genre_Music
459,459,2415,tt0002423,movie,Madame DuBarry,False,1919,113.0,"Biography,Drama,Romance",nm0523932,...,0,0,0,1,0,0,0,0,0,0
505,505,2638,tt0002646,movie,Atlantis,False,1913,121.0,Drama,nm0088881,...,0,0,0,1,0,0,0,0,0,0
802,802,4962,tt0004972,movie,The Birth of a Nation,False,1915,195.0,"Drama,War",nm0000428,...,0,0,0,1,0,0,0,0,0,0


In [2]:
# Select the row(s) whose original title is exactly "Inception"
inception = df_movies.loc[df_movies['originalTitle'].eq("Inception")]
inception

Unnamed: 0,level_0,index,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,...,genre_Fantasy,genre_Game-Show,genre_Unknown,genre_Drama,genre_Horror,genre_Adventure,genre_Animation,genre_Talk-Show,genre_Mystery,genre_Music
138644,138644,3066705,tt1375666,movie,Inception,False,2010,148.0,"Action,Adventure,Sci-Fi",nm0634240,...,0,0,0,0,0,1,0,0,0,0


In [3]:
# Show the actor ranking stored for the selected film
inception.loc[:, 'actors_rank']

138644    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Name: actors_rank, dtype: object

# Base de données pour le ML

#### Encodage des colonnes pour l'entraînement

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the vocabulary at 5000 terms to keep the number of dimensions manageable
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# The vectorizer raises on NaN documents, so a film without an overview is
# treated as an empty text instead of aborting the whole fit.
tfidf_matrix = tfidf_vectorizer.fit_transform(df_movies['overview'].fillna(''))
tfidf_matrix.shape


(104263, 5000)

#### Choix des variables et initialisation 

In [None]:
from scipy.sparse import hstack

# Five numeric descriptors followed by every "genre_*" indicator column
features = [
    'runtimeMinutes', 'averageRating', 'numVotes', 'popularity', 'budget',
    *(name for name in df_movies.columns if name.startswith('genre_')),
]

# Persist the ordered feature list next to the notebook
with open("features_list.pkl", "wb") as f:
    pickle.dump(features, f)

# Standardise the selected columns
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_movies[features])

# Place the scaled columns and the TF-IDF matrix side by side
X_final = hstack((X_scaled, tfidf_matrix))
X_final.shape

(104263, 5033)

# Créer et entraîner le modèle KNN

In [6]:
# Cosine metric is chosen for the text (TF-IDF) part of the feature matrix;
# fit() returns the estimator itself, so it can be bound in one statement.
knn = NearestNeighbors(metric='cosine', n_neighbors=10).fit(X_final)
knn

# Fonction de recommandation des films

In [7]:
def recommander_films(film_titre):
    """
    Recommend films similar to ``film_titre`` using the fitted KNN model.

    The function takes a single argument; everything else is read from
    notebook-level names: ``df_movies`` (film catalogue), ``features``
    (columns fed to the scaler), ``scaler``, ``tfidf_vectorizer``, ``knn``
    and ``hstack``.

    Parameters
    ----------
    film_titre : str
        Exact ``originalTitle`` of the reference film. When several rows
        share that title, the first one in ``df_movies`` is used.

    Returns
    -------
    pandas.DataFrame or str
        The neighbouring films (title, period, rating, popularity and all
        ``genre_*`` columns) with the reference film removed, or a message
        string when the title is unknown or an error occurs.
    """
    # Row positions (not index labels) of the films carrying this title;
    # positions are what knn.kneighbors returns, so both can be compared.
    positions = np.flatnonzero((df_movies['originalTitle'] == film_titre).to_numpy())
    if positions.size == 0:
        return "Film non trouvé dans la base."

    try:
        film_pos = int(positions[0])
        film_row = df_movies.iloc[[film_pos]]

        # Pass a one-row DataFrame so the scaler receives the same column
        # names it was fitted with (a bare .values array loses them).
        film_features_scaled = scaler.transform(film_row[features])

        # TF-IDF vector of the overview; a missing overview becomes an empty text
        film_overview = film_row['overview'].iloc[0]
        if pd.isna(film_overview):
            film_overview = ""
        film_overview_tfidf = tfidf_vectorizer.transform([film_overview])

        # Same layout as the training matrix: scaled columns, then TF-IDF
        film_vector = hstack([film_features_scaled, film_overview_tfidf])

        # Only the neighbour positions are needed, not the distances
        voisins = knn.kneighbors(film_vector, return_distance=False)[0]

        # Remove the reference film by position instead of assuming it is
        # ranked first (ties / duplicate vectors can put another film first).
        if film_pos in voisins:
            voisins = voisins[voisins != film_pos]
        else:
            # Reference film absent from its own neighbours: drop the farthest
            # one so the number of recommendations stays the same.
            voisins = voisins[:-1]

        recommandations = df_movies.iloc[voisins]

        colonnes_genre = [col for col in df_movies.columns if col.startswith('genre_')]
        return recommandations[['originalTitle', 'periode', 'averageRating', 'popularity'] + colonnes_genre]

    except Exception as e:
        # Best-effort contract kept from the original: report the error as text
        return f"Erreur lors de la recommandation : {e}"


In [8]:
# Persist the fitted nearest-neighbours model
joblib.dump(value=knn, filename="../BD_A_IGNORE/modele_knn.pkl")

['../BD_A_IGNORE/modele_knn.pkl']

In [9]:
# Persist the fitted scaler
joblib.dump(value=scaler, filename="../BD_A_IGNORE/scaler.pkl")

['../BD_A_IGNORE/scaler.pkl']

In [10]:
# Persist the fitted TF-IDF vectorizer
joblib.dump(value=tfidf_vectorizer, filename="../BD_A_IGNORE/tfidf_vectorizer.pkl")

['../BD_A_IGNORE/tfidf_vectorizer.pkl']

In [11]:
# Spot-check: row(s) whose original title is exactly 'The Last Tycoon'
verif = df_movies.loc[lambda frame: frame['originalTitle'] == 'The Last Tycoon']
verif

Unnamed: 0,level_0,index,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,...,genre_Fantasy,genre_Game-Show,genre_Unknown,genre_Drama,genre_Horror,genre_Adventure,genre_Animation,genre_Talk-Show,genre_Mystery,genre_Music
38220,38220,74605,tt0074777,movie,The Last Tycoon,False,1976,123.0,"Drama,Romance",nm0001415,...,0,0,0,1,0,0,0,0,0,0


In [12]:
# Example query: recommendations for the title "Barbie"
recommander_films(film_titre="Barbie")

Unnamed: 0,originalTitle,periode,averageRating,popularity,genre_Romance,genre_Crime,genre_Adult,genre_Musical,genre_Sport,genre_War,...,genre_Fantasy,genre_Game-Show,genre_Unknown,genre_Drama,genre_Horror,genre_Adventure,genre_Animation,genre_Talk-Show,genre_Mystery,genre_Music
105221,Life of Pi,2010,7.9,30.944,0,0,0,0,0,0,...,1,0,0,1,0,1,0,0,0,0
104918,Wonder Woman,2010,7.3,54.481,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
59594,Star Wars: Episode II - Attack of the Clones,2000,6.6,36.75,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
128731,Doctor Strange,2010,7.5,81.514,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
163326,Thor: The Dark World,2010,6.7,64.634,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
59394,Star Wars: Episode I - The Phantom Menace,1990,6.5,41.022,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
59595,Star Wars: Episode III - Revenge of the Sith,2000,7.6,39.515,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
139288,Suicide Squad,2010,5.9,36.86,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
59202,The Mummy,1990,7.1,60.589,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
