Projet DATA Sciences Sorbonne Université

1. On importe les bilbiothèques nécessaires 

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import ast
import utils

import time

2. Opérations de base : chargement des données et affichage de la structure et des premières valeurs

In [12]:
data = pd.read_csv('tmdb_5000_movies.csv')

# On affiche la première lignz du fichier
print(data.head(1))

# On affiche la structure du fichier 
print(data.info())

      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                      homepage     id  \
0  http://www.avatarmovie.com/  19995   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   

  original_title                                           overview  \
0         Avatar  In the 22nd century, a paraplegic Marine is di...   

   popularity                               production_companies  \
0  150.437577  [{"name": "Ingenious Film Partners", "id": 289...   

                                production_countries release_date     revenue  \
0  [{"iso_3166_1": "US", "name": "United States o...   2009-12-10  2787965087   

   runtime                                   spoken_languages    status  \
0    162.0  [{"iso_639_1": "en", "name": "English"}, {"iso...  Released   

                       tagline   title 

3. On nettoie les données que nous avons : colonnes inutiles

In [13]:
data = data.drop(columns=['homepage', 'id', 'overview', 'status', 'tagline', 'title'])

# Appliquer la fonction aux colonnes cibles
columns_to_clean = ['genres', 'keywords', 'production_companies', 'production_countries', 'spoken_languages']
for column in columns_to_clean:
    data[column] = data[column].apply(utils.extract_names)

#On affiche de nouveau le fichier update
print(data.head(1))

      budget                                         genres  \
0  237000000  [Action, Adventure, Fantasy, Science Fiction]   

                                            keywords original_language  \
0  [culture clash, future, space war, space colon...                en   

  original_title  popularity  \
0         Avatar  150.437577   

                                production_companies  \
0  [Ingenious Film Partners, Twentieth Century Fo...   

                         production_countries release_date     revenue  \
0  [United States of America, United Kingdom]   2009-12-10  2787965087   

   runtime    spoken_languages  vote_average  vote_count  
0    162.0  [English, Español]           7.2       11800  


4. On prépare les données pour l'analyse et la modélisation en utilisant TfidVectorizer

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combiner les colonnes textuelles en une seule colonne
data['text'] = data['original_title'] + ' ' + \
               data['genres'].apply(' '.join) + ' ' + \
               data['keywords'].apply(' '.join) + ' ' + \
               data['production_companies'].apply(' '.join) + ' ' + \
               data['production_countries'].apply(' '.join) + ' ' + \
               data['spoken_languages'].apply(' '.join)

data['text'] = data['text'].fillna('')

# init de TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1000, max_df=0.5, stop_words='english')

start_time = time.time()
X_tfidf = vectorizer.fit_transform(data['text'])
end_time = time.time()

print(f"Temps de vectorisation: {end_time - start_time} secondes")

# on transforme le tout en Dataframe pour rendre les opérations dessus et l'affichage plus simple
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
print(tfidf_df.head())

Temps de vectorisation: 0.0841379165649414 secondes
   1492  1960s  1970s      19th  2000   21        3d   40     abuse  accident  \
0   0.0    0.0    0.0  0.000000   0.0  0.0  0.137738  0.0  0.000000       0.0   
1   0.0    0.0    0.0  0.000000   0.0  0.0  0.000000  0.0  0.210632       0.0   
2   0.0    0.0    0.0  0.000000   0.0  0.0  0.000000  0.0  0.000000       0.0   
3   0.0    0.0    0.0  0.000000   0.0  0.0  0.000000  0.0  0.000000       0.0   
4   0.0    0.0    0.0  0.241814   0.0  0.0  0.184104  0.0  0.000000       0.0   

   ...  ελληνικά   ית  العربية  ภาษาไทย  广州话  廣州話  日本語  普通话  조선말  한국어  
0  ...       0.0  0.0      0.0      0.0  0.0  0.0  0.0  0.0  0.0  0.0  
1  ...       0.0  0.0      0.0      0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2  ...       0.0  0.0      0.0      0.0  0.0  0.0  0.0  0.0  0.0  0.0  
3  ...       0.0  0.0      0.0      0.0  0.0  0.0  0.0  0.0  0.0  0.0  
4  ...       0.0  0.0      0.0      0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 1000 columns]


5. On entraine le modèle et on récupère son taux d'erreur

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# on ajoute des features
features = pd.concat([data[['budget', 'popularity', 'runtime']], tfidf_df], axis=1)

# ce qu'on cherche à obetnir
target = data['vote_average']

# on divise les données en test et train
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# on entraine le modèle
model = RandomForestRegressor()
model.fit(X_train, y_train)

# prédictions
y_pred = model.predict(X_test)

# évalutation du modèle
mse = mean_squared_error(y_test, y_pred)
print(f"Taux d'erreur : {mse}")

Taux d'erreur : 0.7524818595213318


6. On effectue le clustering avec MiniBatchKMeans

In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans

# Standardiser les features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Paramètres pour MiniBatchKMeans
n_clusters = 10  # Par exemple, 10 clusters
init_size = 1000  # Par exemple, 1000 observations pour initialiser les centres

# Initialiser MiniBatchKMeans
mini_batch_kmeans = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++', n_init=1, init_size=init_size, batch_size=500, random_state=42)

# Mesurer le temps de clustering
start_time = time.time()

# Ajuster le modèle sur les données
mini_batch_kmeans.fit(features_scaled)

end_time = time.time()
print(f"Temps de clustering: {end_time - start_time} secondes")

# Ajouter les labels de clusters aux données d'origine
data['cluster'] = mini_batch_kmeans.labels_

ValueError: Input X contains NaN.
MiniBatchKMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values