In [71]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import accuracy

In [2]:
# Paths to the files
movies_path = 'ml-1m/movies.dat'
ratings_path = 'ml-1m/ratings.dat'
users_path = 'ml-1m/users.dat'

# Reading each file correctly
movies = pd.read_csv(movies_path, sep='::', header=None, names=['MovieID', 'Title', 'Genres'], engine='python', encoding='latin1')
ratings = pd.read_csv(ratings_path, sep='::', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python', encoding='latin1')
users = pd.read_csv(users_path, sep='::', header=None, names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], engine='python', encoding='latin1')

# Displaying the first few rows of each dataframe
print("Movies Data:")
print(movies.head())

print("\nRatings Data:")
print(ratings.head())

print("\nUsers Data:")
print(users.head())


Movies Data:
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy

Ratings Data:
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291

Users Data:
   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455


In [3]:
users.drop(columns=['Zip-code'], inplace=True)
ratings.drop(columns=['Timestamp'], inplace=True)

In [4]:
movies_ratings = pd.merge(movies, ratings, on='MovieID')
movielens_data = pd.merge(movies_ratings, users, on='UserID')

print(movielens_data.head())
print("taille : ", len(movielens_data))

   MovieID             Title                       Genres  UserID  Rating  \
0        1  Toy Story (1995)  Animation|Children's|Comedy       1       5   
1        1  Toy Story (1995)  Animation|Children's|Comedy       6       4   
2        1  Toy Story (1995)  Animation|Children's|Comedy       8       4   
3        1  Toy Story (1995)  Animation|Children's|Comedy       9       5   
4        1  Toy Story (1995)  Animation|Children's|Comedy      10       5   

  Gender  Age  Occupation  
0      F    1          10  
1      F   50           9  
2      M   25          12  
3      M   25          17  
4      F   35           1  
taille :  1000209


In [5]:
imdb_basics =  pd.read_csv('title.basics.tsv', delimiter='\t', low_memory=False)
imdb_ratings = pd.read_csv('title.ratings.tsv', delimiter='\t', low_memory=False)
imdb_crew = pd.read_csv('title.crew.tsv', delimiter='\t', low_memory=False)

print("basics :")
print(imdb_basics.head())

print("\nratings :")
print(imdb_ratings.head())

print("\ncrew :")
print(imdb_crew.head())

basics :
      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short          Pauvre Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

  isAdult startYear endYear runtimeMinutes                    genres  
0       0      1894      \N              1         Documentary,Short  
1       0      1892      \N              5           Animation,Short  
2       0      1892      \N              5  Animation,Comedy,Romance  
3       0      1892      \N             12           Animation,Short  
4       0      1893      \N              1              Comedy,Short  

ratings :
      tconst  averageRating  numVotes
0  tt0000001            5.7      2059
1  tt0000002            5.6      

In [6]:
imdb_basics.drop(columns=['endYear'], inplace=True)
imdb_basics.drop(columns=['runtimeMinutes'], inplace=True)
imdb_basics.drop(columns=['isAdult'], inplace=True)

imdb_crew.drop(columns=['writers'], inplace=True) 

In [7]:
# Merge basics with ratings and crew data
imdb_data = imdb_basics.merge(imdb_ratings, on='tconst').merge(imdb_crew, on='tconst')

print(imdb_data.head())
print("taille : ", len(imdb_data))

      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short          Pauvre Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

  startYear                    genres  averageRating  numVotes  directors  
0      1894         Documentary,Short            5.7      2059  nm0005690  
1      1892           Animation,Short            5.6       278  nm0721526  
2      1892  Animation,Comedy,Romance            6.5      2028  nm0721526  
3      1892           Animation,Short            5.4       179  nm0721526  
4      1893              Comedy,Short            6.2      2795  nm0005690  
taille :  1438597


In [10]:
imdb_data['startYear'] = imdb_data['startYear'].replace('\\N', np.nan)
imdb_data['startYear'] = imdb_data['startYear'].astype(float)
imdb_data.dropna(subset=['startYear'], inplace=True)

In [11]:
# Strip and lowercase titles for better matching
movielens_data['CleanTitle'] = movielens_data['Title'].str.extract(r'^(.*?) \(\d{4}\)$')[0].str.lower().str.strip()
imdb_data['CleanTitle'] = imdb_data['primaryTitle'].str.lower().str.strip()

# Convert years to integers for exact matching
movielens_data['Year'] = movielens_data['Title'].str.extract(r'\((\d{4})\)')[0].astype(int)
imdb_data['Year'] = imdb_data['startYear'].astype(int)

In [12]:
merged_data = pd.merge(movielens_data, imdb_data, left_on=['CleanTitle', 'Year'], right_on=['CleanTitle', 'Year'])

In [20]:
merged_data['ScaledRating'] = merged_data['Rating'] * 2  # Scaling MovieLens ratings from 1-5 to 1-10
merged_data['CompositeRating'] = (merged_data['ScaledRating'] + merged_data['averageRating']) / 2

merged_data.drop(columns=['tconst', 'primaryTitle', 'originalTitle', 'startYear'], inplace=True)

print(merged_data.head())
print("taille : ", len(merged_data))

   MovieID             Title                       Genres  UserID Gender  Age  \
0        1  Toy Story (1995)  Animation|Children's|Comedy       1      F    1   
1        1  Toy Story (1995)  Animation|Children's|Comedy       6      F   50   
2        1  Toy Story (1995)  Animation|Children's|Comedy       8      M   25   
3        1  Toy Story (1995)  Animation|Children's|Comedy       9      M   25   
4        1  Toy Story (1995)  Animation|Children's|Comedy      10      F   35   

   Occupation CleanTitle  Year titleType                      genres  \
0          10  toy story  1995     movie  Adventure,Animation,Comedy   
1           9  toy story  1995     movie  Adventure,Animation,Comedy   
2          12  toy story  1995     movie  Adventure,Animation,Comedy   
3          17  toy story  1995     movie  Adventure,Animation,Comedy   
4           1  toy story  1995     movie  Adventure,Animation,Comedy   

   numVotes  directors  CompositeRating  
0   1075722  nm0005124             9.1

In [21]:
merged_data.drop(['Title', 'Genres'], axis=1, inplace=True)

In [22]:
print(merged_data.head())
print("taille : ", len(merged_data))

   MovieID  UserID Gender  Age  Occupation CleanTitle  Year titleType  \
0        1       1      F    1          10  toy story  1995     movie   
1        1       6      F   50           9  toy story  1995     movie   
2        1       8      M   25          12  toy story  1995     movie   
3        1       9      M   25          17  toy story  1995     movie   
4        1      10      F   35           1  toy story  1995     movie   

                       genres  numVotes  directors  CompositeRating  
0  Adventure,Animation,Comedy   1075722  nm0005124             9.15  
1  Adventure,Animation,Comedy   1075722  nm0005124             8.15  
2  Adventure,Animation,Comedy   1075722  nm0005124             8.15  
3  Adventure,Animation,Comedy   1075722  nm0005124             9.15  
4  Adventure,Animation,Comedy   1075722  nm0005124             9.15  
taille :  805610


In [24]:
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(merged_data['genres'].str.split(','))
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_, index=merged_data.index)
merged_data = pd.concat([merged_data, genres_df], axis=1)

In [38]:
user_item = merged_data.pivot_table(index='UserID', columns='CleanTitle', values='CompositeRating')

In [39]:
print("Size of user_item matrix: ", user_item.shape)

Size of user_item matrix:  (6040, 2341)


In [53]:
user_item_filled = user_item.apply(lambda x: x.fillna(x.mean()), axis=0)

global_mean = user_item_filled.stack().mean() 
user_item_filled = user_item_filled.fillna(global_mean)

print("Size of user_item matrix: \n", user_item_filled.head())

Size of user_item matrix: 
 CleanTitle  'til there was you  1-900  10 things i hate about you  \
UserID                                                              
1                     5.092308    5.6                    7.072857   
2                     5.092308    5.6                    7.072857   
3                     5.092308    5.6                    7.072857   
4                     5.092308    5.6                    7.072857   
5                     5.092308    5.6                    7.072857   

CleanTitle  101 dalmatians  12 angry men  2 days in the valley  20 dates  \
UserID                                                                     
1                 5.896703      8.795455              6.533217  5.656115   
2                 5.896703      8.795455              6.533217  5.656115   
3                 5.896703      8.795455              6.533217  5.656115   
4                 5.896703      8.795455              6.533217  5.656115   
5                 5.896703      

## Préparer les données

In [60]:
train_data, test_data = train_test_split(merged_data, test_size=0.2, random_state=42)

## Implémenter la factorisation matricielle

In [63]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(train_data[['UserID', 'MovieID', 'CompositeRating']], reader)
algo = SVD()

# Entraînement et évaluation par validation croisée
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8739  0.8778  0.8759  0.8780  0.8759  0.8763  0.0015  
MAE (testset)     0.6872  0.6906  0.6887  0.6899  0.6897  0.6892  0.0012  
Fit time          13.97   11.36   11.45   11.13   9.66    11.51   1.39    
Test time         1.54    1.75    1.88    1.33    1.46    1.59    0.20    


{'test_rmse': array([0.87389516, 0.87783942, 0.87587608, 0.87796862, 0.87585307]),
 'test_mae': array([0.68722103, 0.69059548, 0.68870506, 0.68987422, 0.6897319 ]),
 'fit_time': (13.968403577804565,
  11.363643884658813,
  11.446570873260498,
  11.134852647781372,
  9.655152320861816),
 'test_time': (1.544600486755371,
  1.7465672492980957,
  1.8793866634368896,
  1.3272809982299805,
  1.4647176265716553)}

## Entraîner et évaluer le modèle

In [73]:
from surprise.model_selection import train_test_split

# Utilise la fonction train_test_split de surprise
trainset, testset = train_test_split(data, test_size=0.25)
algo.fit(trainset)
predictions = algo.test(testset)

# Calculer et afficher le RMSE
rmse = accuracy.rmse(predictions)
print(f'Le RMSE sur le jeu de test est de: {rmse}')

RMSE: 0.8806
Le RMSE sur le jeu de test est de: 0.8806135412498529


## Définir la logique de score de couple

In [74]:
def couple_score(user1, user2, movie_id):
    # Prédire la note pour chaque utilisateur
    score1 = algo.predict(user1, movie_id).est
    score2 = algo.predict(user2, movie_id).est
    # Retourner la moyenne des scores
    return (score1 + score2) / 2

## Recommander des films pour le couple

In [89]:
def recommend_for_couple(user1, user2, merged_data):
    # Extraire les ID de tous les films uniques dans les données
    movies = set(merged_data['MovieID'])
    scores = []
    for movie in movies:
        # Calculer le score de couple pour chaque film
        score = couple_score(user1, user2, movie)
        # Obtenir le titre du film à partir du DataFrame des titres
        title = merged_data[merged_data['MovieID'] == movie]['CleanTitle'].values[0]
        scores.append((title, score))
    # Trier les films par score décroissant
    scores.sort(key=lambda x: x[1], reverse=True)
    # Retourner les 10 meilleurs films recommandés
    return scores[:10]

recommended_movies = recommend_for_couple(50, 13, merged_data)

for title, score in recommended_movies:
    print(f"('{title}', {score})")


('pulp fiction', 8.734018472939749)
('casablanca', 8.597106801264777)
('schindler's list', 8.56550001097175)
('it's a wonderful life', 8.509324475830777)
('goodfellas', 8.481306880487494)
('one flew over the cuckoo's nest', 8.444750905430734)
('to kill a mockingbird', 8.43857912389823)
('star wars: episode iv - a new hope', 8.427903120839328)
('lawrence of arabia', 8.369085075035528)
('rear window', 8.368393759960274)
