In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import accuracy

In [2]:
# Paths to the files
movies_path = 'ml-1m/movies.dat'
ratings_path = 'ml-1m/ratings.dat'
users_path = 'ml-1m/users.dat'

# Reading each file correctly
movies = pd.read_csv(movies_path, sep='::', header=None, names=['MovieID', 'Title', 'Genres'], engine='python', encoding='latin1')
ratings = pd.read_csv(ratings_path, sep='::', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python', encoding='latin1')
users = pd.read_csv(users_path, sep='::', header=None, names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], engine='python', encoding='latin1')

# Displaying the first few rows of each dataframe
print("Movies Data:")
print(movies.head())

print("\nRatings Data:")
print(ratings.head())

print("\nUsers Data:")
print(users.head())


Movies Data:
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy

Ratings Data:
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291

Users Data:
   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455


In [3]:
users.drop(columns=['Zip-code'], inplace=True)
ratings.drop(columns=['Timestamp'], inplace=True)

In [4]:
movies_ratings = pd.merge(movies, ratings, on='MovieID')
movielens_data = pd.merge(movies_ratings, users, on='UserID')

print(movielens_data.head())
print("taille : ", len(movielens_data))

   MovieID             Title                       Genres  UserID  Rating  \
0        1  Toy Story (1995)  Animation|Children's|Comedy       1       5   
1        1  Toy Story (1995)  Animation|Children's|Comedy       6       4   
2        1  Toy Story (1995)  Animation|Children's|Comedy       8       4   
3        1  Toy Story (1995)  Animation|Children's|Comedy       9       5   
4        1  Toy Story (1995)  Animation|Children's|Comedy      10       5   

  Gender  Age  Occupation  
0      F    1          10  
1      F   50           9  
2      M   25          12  
3      M   25          17  
4      F   35           1  
taille :  1000209


In [5]:
imdb_basics =  pd.read_csv('title.basics.tsv', delimiter='\t', low_memory=False)
imdb_ratings = pd.read_csv('title.ratings.tsv', delimiter='\t', low_memory=False)
imdb_crew = pd.read_csv('title.crew.tsv', delimiter='\t', low_memory=False)

print("basics :")
print(imdb_basics.head())

print("\nratings :")
print(imdb_ratings.head())

print("\ncrew :")
print(imdb_crew.head())

basics :
      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short          Pauvre Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

  isAdult startYear endYear runtimeMinutes                    genres  
0       0      1894      \N              1         Documentary,Short  
1       0      1892      \N              5           Animation,Short  
2       0      1892      \N              5  Animation,Comedy,Romance  
3       0      1892      \N             12           Animation,Short  
4       0      1893      \N              1              Comedy,Short  

ratings :
      tconst  averageRating  numVotes
0  tt0000001            5.7      2059
1  tt0000002            5.6      

In [6]:
imdb_basics.drop(columns=['endYear'], inplace=True)
imdb_basics.drop(columns=['runtimeMinutes'], inplace=True)
imdb_basics.drop(columns=['isAdult'], inplace=True)
imdb_ratings.drop(columns=['numVotes'], inplace=True)
imdb_crew.drop(columns=['writers'], inplace=True)

In [7]:
# Merge basics with ratings and crew data
imdb_data = imdb_basics.merge(imdb_ratings, on='tconst').merge(imdb_crew, on='tconst')

print(imdb_data.head())
print("taille : ", len(imdb_data))

      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short          Pauvre Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

  startYear                    genres  averageRating  directors  
0      1894         Documentary,Short            5.7  nm0005690  
1      1892           Animation,Short            5.6  nm0721526  
2      1892  Animation,Comedy,Romance            6.5  nm0721526  
3      1892           Animation,Short            5.4  nm0721526  
4      1893              Comedy,Short            6.2  nm0005690  
taille :  1438597


In [8]:
# Remplacer les valeurs '\\N' par NaN pour les colonnes numériques
imdb_data['startYear'] = imdb_data['startYear'].replace('\\N', np.nan).astype(float)
imdb_data['averageRating'] = imdb_data['averageRating'].replace('\\N', np.nan).astype(float)

# Remplacer les valeurs NaN par la moyenne de la colonne
imdb_data['averageRating'].fillna(imdb_data['averageRating'].mean(), inplace=True)

# Remplacer les valeurs '\\N' par 'Unknown' pour les colonnes catégorielles
imdb_data['genres'] = imdb_data['genres'].replace('\\N', 'Unknown')
imdb_data['titleType'] = imdb_data['titleType'].replace('\\N', 'Unknown')
imdb_data['primaryTitle'] = imdb_data['primaryTitle'].replace('\\N', 'Unknown')
imdb_data['originalTitle'] = imdb_data['originalTitle'].replace('\\N', 'Unknown')
imdb_data['directors'] = imdb_data['directors'].replace('\\N', 'Unknown')

# Remplir les NaN dans startYear avec une valeur par défaut (par exemple, 0)
imdb_data['startYear'].fillna(0, inplace=True)

# Convertir la colonne startYear en type int
imdb_data['startYear'] = imdb_data['startYear'].astype(int)

# Vérifier que les valeurs '\\N' ont été remplacées
print(imdb_data[['startYear', 'averageRating', 'genres', 'titleType', 'primaryTitle', 'originalTitle', 'directors']].head())

# Vérifier les types de données
print(imdb_data.dtypes)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  imdb_data['averageRating'].fillna(imdb_data['averageRating'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  imdb_data['startYear'].fillna(0, inplace=True)


   startYear  averageRating                    genres titleType  \
0       1894            5.7         Documentary,Short     short   
1       1892            5.6           Animation,Short     short   
2       1892            6.5  Animation,Comedy,Romance     short   
3       1892            5.4           Animation,Short     short   
4       1893            6.2              Comedy,Short     short   

             primaryTitle           originalTitle  directors  
0              Carmencita              Carmencita  nm0005690  
1  Le clown et ses chiens  Le clown et ses chiens  nm0721526  
2          Pauvre Pierrot          Pauvre Pierrot  nm0721526  
3             Un bon bock             Un bon bock  nm0721526  
4        Blacksmith Scene        Blacksmith Scene  nm0005690  
tconst            object
titleType         object
primaryTitle      object
originalTitle     object
startYear          int64
genres            object
averageRating    float64
directors         object
dtype: object


In [9]:
# Strip and lowercase titles for better matching
movielens_data['CleanTitle'] = movielens_data['Title'].str.extract(r'^(.*?) \(\d{4}\)$')[0].str.lower().str.strip()
imdb_data['CleanTitle'] = imdb_data['primaryTitle'].str.lower().str.strip()

# Convert years to integers for exact matching
movielens_data['Year'] = movielens_data['Title'].str.extract(r'\((\d{4})\)')[0].astype(int)
imdb_data['Year'] = imdb_data['startYear'].astype(int)

In [10]:
merged_data = pd.merge(movielens_data, imdb_data, left_on=['CleanTitle', 'Year'], right_on=['CleanTitle', 'Year'])

In [11]:
merged_data['ScaledRating'] = merged_data['Rating'] * 2  # Scaling MovieLens ratings from 1-5 to 1-10
merged_data['CompositeRating'] = (merged_data['ScaledRating'] + merged_data['averageRating']) / 2

merged_data.drop(columns=['tconst', 'primaryTitle', 'originalTitle', 'startYear'], inplace=True)
merged_data.drop(columns=['averageRating', 'ScaledRating'], inplace=True)
print(merged_data.head())
print("taille : ", len(merged_data))

   MovieID             Title                       Genres  UserID  Rating  \
0        1  Toy Story (1995)  Animation|Children's|Comedy       1       5   
1        1  Toy Story (1995)  Animation|Children's|Comedy       6       4   
2        1  Toy Story (1995)  Animation|Children's|Comedy       8       4   
3        1  Toy Story (1995)  Animation|Children's|Comedy       9       5   
4        1  Toy Story (1995)  Animation|Children's|Comedy      10       5   

  Gender  Age  Occupation CleanTitle  Year titleType  \
0      F    1          10  toy story  1995     movie   
1      F   50           9  toy story  1995     movie   
2      M   25          12  toy story  1995     movie   
3      M   25          17  toy story  1995     movie   
4      F   35           1  toy story  1995     movie   

                       genres  directors  CompositeRating  
0  Adventure,Animation,Comedy  nm0005124             9.15  
1  Adventure,Animation,Comedy  nm0005124             8.15  
2  Adventure,Animati

In [12]:
merged_data.drop(['Title', 'Genres'], axis=1, inplace=True)

In [13]:
print(merged_data.head())
print("taille : ", len(merged_data))

   MovieID  UserID  Rating Gender  Age  Occupation CleanTitle  Year titleType  \
0        1       1       5      F    1          10  toy story  1995     movie   
1        1       6       4      F   50           9  toy story  1995     movie   
2        1       8       4      M   25          12  toy story  1995     movie   
3        1       9       5      M   25          17  toy story  1995     movie   
4        1      10       5      F   35           1  toy story  1995     movie   

                       genres  directors  CompositeRating  
0  Adventure,Animation,Comedy  nm0005124             9.15  
1  Adventure,Animation,Comedy  nm0005124             8.15  
2  Adventure,Animation,Comedy  nm0005124             8.15  
3  Adventure,Animation,Comedy  nm0005124             9.15  
4  Adventure,Animation,Comedy  nm0005124             9.15  
taille :  805610


In [14]:
# Encodage des genres
mlb_genres = MultiLabelBinarizer()
genres_encoded = mlb_genres.fit_transform(merged_data['genres'].str.split(','))
genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_, index=merged_data.index)

# Encodage de l'âge
age_bins = [0, 18, 25, 35, 45, 50, 56, np.inf]
age_labels = ['0-18', '18-25', '25-35', '35-45', '45-50', '50-56', '56+']
merged_data['age_group'] = pd.cut(merged_data['Age'], bins=age_bins, labels=age_labels)
age_dummies = pd.get_dummies(merged_data['age_group'], prefix='age')

# Encodage du genre
gender_dummies = pd.get_dummies(merged_data['Gender'], prefix='gender')

# Concaténation des nouvelles colonnes encodées avec les données d'origine
merged_data = pd.concat([merged_data, genres_df, age_dummies, gender_dummies], axis=1)

# Supprimer les colonnes originales non numériques
merged_data.drop(columns=['genres', 'age_group', 'Gender'], inplace=True)

In [15]:
# Compter le nombre de films par réalisateur
director_counts = merged_data['directors'].str.split(',').explode().value_counts()

print(director_counts)

directors
Unknown      49477
nm0000229    19388
nm0000709    10612
nm0000318     9358
nm0000631     8454
             ...  
nm0401680        1
nm0136592        1
nm0237876        1
nm0362734        1
nm0748218        1
Name: count, Length: 1637, dtype: int64


In [16]:
# Sélectionner les réalisateurs les plus fréquents
top_directors = director_counts[(director_counts.index != 'Unknown')].nlargest(100).index

# Filtrer les données pour ne conserver que les réalisateurs fréquents
filtered_directors = merged_data['directors'].apply(lambda x: ','.join([d for d in x.split(',') if d in top_directors]))

# Encoder les réalisateurs filtrés
mlb_directors = MultiLabelBinarizer()
directors_encoded = mlb_directors.fit_transform(filtered_directors.str.split(','))
directors_df = pd.DataFrame(directors_encoded, columns=mlb_directors.classes_, index=merged_data.index)

# Concaténer les nouvelles colonnes encodées avec les données d'origine
merged_data = pd.concat([merged_data, directors_df], axis=1)

# Supprimer les colonnes originales non numériques
merged_data.drop(columns=['directors'], inplace=True)


print(merged_data.head())

   MovieID  UserID  Rating  Age  Occupation CleanTitle  Year titleType  \
0        1       1       5    1          10  toy story  1995     movie   
1        1       6       4   50           9  toy story  1995     movie   
2        1       8       4   25          12  toy story  1995     movie   
3        1       9       5   25          17  toy story  1995     movie   
4        1      10       5   35           1  toy story  1995     movie   

   CompositeRating  Action  ...  nm0718645  nm0730860  nm0769874  nm0789313  \
0             9.15       0  ...          0          0          0          0   
1             8.15       0  ...          0          0          0          0   
2             8.15       0  ...          0          0          0          0   
3             9.15       0  ...          0          0          0          0   
4             9.15       0  ...          0          0          0          0   

   nm0829038  nm0958387  nm1140114  nm1370868  nm2091571  nm2091671  
0         

In [17]:
user_item = merged_data.pivot_table(index='UserID', columns='CleanTitle', values='CompositeRating')

In [18]:
print("Size of user_item matrix: ", user_item.shape)

Size of user_item matrix:  (6040, 2341)


In [19]:
user_item_filled = user_item.apply(lambda x: x.fillna(x.mean()), axis=0)

global_mean = user_item_filled.stack().mean() 
user_item_filled = user_item_filled.fillna(global_mean)

print("Size of user_item matrix: \n", user_item_filled.head())

Size of user_item matrix: 
 CleanTitle  'til there was you  1-900  10 things i hate about you  \
UserID                                                              
1                     5.092308    5.6                    7.072857   
2                     5.092308    5.6                    7.072857   
3                     5.092308    5.6                    7.072857   
4                     5.092308    5.6                    7.072857   
5                     5.092308    5.6                    7.072857   

CleanTitle  101 dalmatians  12 angry men  2 days in the valley  20 dates  \
UserID                                                                     
1                 5.896703      8.795455              6.533217  5.656115   
2                 5.896703      8.795455              6.533217  5.656115   
3                 5.896703      8.795455              6.533217  5.656115   
4                 5.896703      8.795455              6.533217  5.656115   
5                 5.896703      

## Préparer les données

In [20]:
train_data, test_data = train_test_split(merged_data, test_size=0.2, random_state=42)

## Implémenter la factorisation matricielle

In [21]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(train_data[['UserID', 'MovieID', 'CompositeRating']], reader)
algo = SVD()

# Entraînement et évaluation par validation croisée
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8743  0.8748  0.8744  0.8777  0.8758  0.8754  0.0013  
MAE (testset)     0.6880  0.6885  0.6881  0.6908  0.6886  0.6888  0.0010  
Fit time          13.59   13.62   13.72   13.58   13.31   13.56   0.14    
Test time         2.19    1.90    2.36    2.01    2.09    2.11    0.16    


{'test_rmse': array([0.87434882, 0.87477695, 0.87442952, 0.87773674, 0.8758044 ]),
 'test_mae': array([0.68796793, 0.68853385, 0.68814072, 0.69081066, 0.68863106]),
 'fit_time': (13.589710712432861,
  13.619638919830322,
  13.715280055999756,
  13.583953857421875,
  13.305901288986206),
 'test_time': (2.190181016921997,
  1.8971152305603027,
  2.356628656387329,
  2.006834030151367,
  2.0850071907043457)}

## Entraîner et évaluer le modèle

In [22]:
from surprise.model_selection import train_test_split

# Utilise la fonction train_test_split de surprise
trainset, testset = train_test_split(data, test_size=0.25)
algo.fit(trainset)
predictions = algo.test(testset)

# Calculer et afficher le RMSE
rmse = accuracy.rmse(predictions)
print(f'Le RMSE sur le jeu de test est de: {rmse}')

RMSE: 0.8835
Le RMSE sur le jeu de test est de: 0.8835177360317061


## Définir la logique de score de couple

In [23]:
def couple_score(user1, user2, movie_id):
    # Prédire la note pour chaque utilisateur
    score1 = algo.predict(user1, movie_id).est
    score2 = algo.predict(user2, movie_id).est
    # Retourner la moyenne des scores
    return (score1 + score2) / 2

## Recommander des films pour le couple

In [25]:
def recommend_for_couple(user1, user2, merged_data):
    # Extraire les ID de tous les films uniques dans les données
    movies = set(merged_data['MovieID'])
    scores = []
    for movie in movies:
        # Calculer le score de couple pour chaque film
        score = couple_score(user1, user2, movie)
        # Obtenir le titre du film à partir du DataFrame des titres
        title = merged_data[merged_data['MovieID'] == movie]['CleanTitle'].values[0]
        scores.append((title, score))
    # Trier les films par score décroissant
    scores.sort(key=lambda x: x[1], reverse=True)
    # Retourner les 10 meilleurs films recommandés
    return scores[:10]

recommended_movies = recommend_for_couple(1, 2, merged_data)

for title, score in recommended_movies:
    print(f"('{title}', {score})")


('schindler's list', 9.067167848333423)
('saving private ryan', 8.823267187243253)
('forrest gump', 8.810521403984247)
('pulp fiction', 8.776276909429079)
('12 angry men', 8.773918577659817)
('star wars: episode iv - a new hope', 8.732260468432248)
('alien', 8.69332407958769)
('one flew over the cuckoo's nest', 8.691336743716338)
('it's a wonderful life', 8.68878567835656)
('jean de florette', 8.659040671652905)
