Authors :
- Baptiste Bellamy 

# Load Data

In [None]:
%pip install -r requirements.txt

In [11]:
import pandas as pd
import numpy as np
import csv

### MovieLens

In [12]:
# Load Ml-1m data
def load_Ml_1M_dataset():
    # Ratings data
    ratings_columns = ["UserId", "MovieId", "Rating", "Timestamp"]
    df_rating = pd.read_csv("data/ml-1m/ratings.dat", sep="::", engine="python", encoding="ISO-8859-1", header=None)
    df_rating.columns = ratings_columns

    # users data
    users_columns =  ["UserId", "Gender", "Age", "Occupation", "ZipCode"]
    df_users = pd.read_csv("data/ml-1m/users.dat", sep="::", engine="python", encoding="ISO-8859-1", header=None)
    df_users.columns = users_columns
    
    # movies data
    movies_columns = ["MovieId", "Title", "Genres"]
    df_movies = pd.read_csv("data/ml-1m/movies.dat", sep="::", engine="python", encoding="ISO-8859-1", header=None)
    df_movies.columns = movies_columns

    df_movies['Title'] = df_movies['Title'].astype(str)

    return df_rating, df_users, df_movies

# Process Ml-1m data
def Create_Ratings_table(df_rating, df_users, df_movies):    
    # ratings data
    df_rating = df_rating.drop("Timestamp", axis=1)

    # users data
    df_users = df_users.drop("ZipCode", axis=1)
    df_users = df_users.drop("Occupation", axis=1)

    # movies data
    df_movies["Year"] = df_movies["Title"].str.extract("\((\d{4})\)", expand=True)
    df_movies["Title"] = df_movies["Title"].str.replace(" \(\d{4}\)", "", regex=True)
    df_movies['Genres'] = df_movies['Genres'].str.split('|')

    df_movies['Year'] = df_movies['Year'].astype('Int64')

    return df_rating, df_users, df_movies


In [13]:
df_rating, df_users, df_movies = load_Ml_1M_dataset()
df_rating, df_users, df_movies = Create_Ratings_table(df_rating, df_users, df_movies)
print(f' --- Rating dataset (len: {len(df_rating)})--- ')
display(df_rating.head())
print(f' --- Users dataset (len: {len(df_users)})--- ')
display(df_users.head())
print(f' --- Movies dataset (len: {len(df_movies)})--- ')
display(df_movies.head())

 --- Rating dataset (len: 1000209)--- 


Unnamed: 0,UserId,MovieId,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


 --- Users dataset (len: 6040)--- 


Unnamed: 0,UserId,Gender,Age
0,1,F,1
1,2,M,56
2,3,M,25
3,4,M,45
4,5,M,25


 --- Movies dataset (len: 3883)--- 


Unnamed: 0,MovieId,Title,Genres,Year
0,1,Toy Story,"[Animation, Children's, Comedy]",1995
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama]",1995
4,5,Father of the Bride Part II,[Comedy],1995


### IMDb

In [16]:
# load IMDB data
def load_IMDB_database():
    
    #title.ratings
    df_title_ratings = pd.read_csv("data/IMDb/title.ratings.tsv", sep="\t", quoting=csv.QUOTE_NONE)
    df_title_ratings['tconst'] = df_title_ratings['tconst'].str.replace('tt', '').astype(int)  # remove 'tt' from tconst
    df_title_ratings = df_title_ratings.rename(columns={'tconst': 'MovieID'}) # rename tconst to MovieID

    #title.basics
    df_title_basics = pd.read_csv("data/IMDb/title.basics.tsv", sep="\t", quoting=csv.QUOTE_NONE)
    df_title_basics['tconst'] = df_title_basics['tconst'].str.replace('tt', '').astype(int)
    df_title_basics = df_title_basics.rename(columns={'tconst': 'MovieID', 'primaryTitle' : 'Title', 'startYear' : 'Year'})

    df_title_basics = df_title_basics[df_title_basics['titleType'] == 'movie'].copy()
    df_title_basics.drop(['titleType', 'endYear'], axis=1, inplace=True)  # drop useless columns

    # set correct types
    df_title_basics['Title'] = df_title_basics['Title'].astype(str)
    df_title_basics['originalTitle'] = df_title_basics['originalTitle'].astype(str)
    df_title_basics['Year'] = df_title_basics['Year'].replace('\\N', np.nan)
    df_title_basics['Year'] = df_title_basics['Year'].astype('Int64')
    df_title_basics['isAdult'] = df_title_basics['isAdult'].astype(int)

    return df_title_ratings, df_title_basics

In [17]:
df_title_ratings, df_title_basics = load_IMDB_database()
print(f' --- Title Ratings dataset (len: {len(df_title_ratings)})--- ')
display(df_title_ratings.head())
print(f' --- Title Basics dataset (len: {len(df_title_basics)})--- ')
display(df_title_basics.head())

 --- Title Ratings dataset (len: 1453791)--- 


Unnamed: 0,MovieID,averageRating,numVotes
0,1,5.7,2063
1,2,5.6,279
2,3,6.5,2030
3,4,5.4,180
4,5,6.2,2796


 --- Title Basics dataset (len: 685593)--- 


Unnamed: 0,MovieID,Title,originalTitle,isAdult,Year,runtimeMinutes,genres
8,9,Miss Jerry,Miss Jerry,0,1894,45,Romance
144,147,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,100,"Documentary,News,Sport"
498,502,Bohemios,Bohemios,0,1905,100,\N
570,574,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,70,"Action,Adventure,Biography"
587,591,The Prodigal Son,L'enfant prodigue,0,1907,90,Drama


# Merge IMdB and MovieLens

### Transformation / Preprocessing

In [18]:
# Set the same title format for both datasets (The)
def transform_title(title):  
    if title.endswith(", The"):
        return "The " + title.replace(", The", "")
    return title

# Delete '(...)' from title -> ex : 'MovieName (yoyoyo)' -> 'MovieName'
def extract_main_title(title):
    return title.split(' (')[0]

# Delete 'The' from title -> ex : 'The MovieName' -> 'MovieName'
def delete_The(title):
    if title.startswith("The"):
        return title.replace("The ", "", 1)
    return title

# Delete '!' from title -> ex : 'MovieName!' -> 'MovieName'
def delete_exclamation(title):
    return title.replace("!", "")

# Lowercase title
def to_lowercase(title):
    return title.lower()

# Replace 'and' by '&' and '+' by '&'
def replace_symbole(title):
    tmp = title.replace(" and ", "&")
    return tmp.replace("+", "&")

# Apply all the transformations on both datasets (IMDB and Ml-1m for movies titles)
df_movies['Title'] = df_movies['Title'].apply(transform_title)
df_movies['Title'] = df_movies['Title'].apply(extract_main_title)

df_movies['Title'] = df_movies['Title'].apply(delete_The)
df_title_basics['Title'] = df_title_basics['Title'].apply(delete_The)

df_title_basics['Title'] = df_title_basics['Title'].apply(delete_exclamation)
df_movies['Title'] = df_movies['Title'].apply(delete_exclamation)

df_movies['Title'] = df_movies['Title'].apply(to_lowercase)
df_title_basics['Title'] = df_title_basics['Title'].apply(to_lowercase)

df_movies['Title'] = df_movies['Title'].apply(replace_symbole)
df_title_basics['Title'] = df_title_basics['Title'].apply(replace_symbole)

### Merge

In [19]:
#Merge on Title and Year (because some movies have the same title but on different years)
merged_df = pd.merge(df_movies, df_title_basics, how='inner', on=['Title', 'Year'])
print(f' --- Merged dataset (len: {len(merged_df)})--- ')
merged_df.head()

 --- Merged dataset (len: 3192)--- 


Unnamed: 0,MovieId,Title,Genres,Year,MovieID,originalTitle,isAdult,runtimeMinutes,genres
0,1,toy story,"[Animation, Children's, Comedy]",1995,114709,Toy Story,0,81,"Adventure,Animation,Comedy"
1,2,jumanji,"[Adventure, Children's, Fantasy]",1995,113497,Jumanji,0,104,"Adventure,Comedy,Family"
2,3,grumpier old men,"[Comedy, Romance]",1995,113228,Grumpier Old Men,0,101,"Comedy,Romance"
3,4,waiting to exhale,"[Comedy, Drama]",1995,114885,Waiting to Exhale,0,124,"Comedy,Drama,Romance"
4,5,father of the bride part ii,[Comedy],1995,113041,Father of the Bride Part II,0,106,"Comedy,Family,Romance"


In [20]:
#post-process the merged dataset
# Merge Genres
merged_df['genres'] = merged_df['genres'].str.split(',')
merged_df['Genres'] = merged_df.apply(lambda row: list(set(row['Genres'] + row['genres'])), axis=1)

#Delete \\N from Genres
merged_df['Genres'] = merged_df['Genres'].apply(lambda genres_list: [genre for genre in genres_list if genre != '\\N'])

#Drop useless columns
merged_df.drop(columns=['genres'], inplace=True)
merged_df.drop(columns=['originalTitle'], inplace=True)

display(merged_df)


Unnamed: 0,MovieId,Title,Genres,Year,MovieID,isAdult,runtimeMinutes
0,1,toy story,"[Animation, Children's, Adventure, Comedy]",1995,114709,0,81
1,2,jumanji,"[Family, Fantasy, Adventure, Comedy, Children's]",1995,113497,0,104
2,3,grumpier old men,"[Romance, Comedy]",1995,113228,0,101
3,4,waiting to exhale,"[Romance, Drama, Comedy]",1995,114885,0,124
4,5,father of the bride part ii,"[Romance, Family, Comedy]",1995,113041,0,106
...,...,...,...,...,...,...,...
3187,3948,meet the parents,"[Romance, Comedy]",2000,212338,0,108
3188,3949,requiem for a dream,[Drama],2000,180093,0,102
3189,3950,tigerland,"[War, Drama]",2000,170691,0,101
3190,3951,two family house,"[Comedy, Romance, Drama]",2000,202641,0,108


In [21]:
# Merge with IMdB ratings (more movie's informations)
movies = pd.merge(merged_df, df_title_ratings, how='left', on=['MovieID'])
movies['numVotes'].fillna(0, inplace=True)
movies['averageRating'].fillna(0, inplace=True)
movies['averageRating'] = movies['averageRating'].astype(float)
movies['averageRating'] = movies['averageRating'] / 2
movies['numVotes'] = movies['numVotes'].astype(int)

# Create a column for each genre
all_genres = sorted(set(genre for sublist in movies['Genres'] for genre in sublist))
print(all_genres)
for genre in all_genres:
    movies[genre] = 0
for index, row in movies.iterrows():
    for genre in row['Genres']:
        movies.at[index, genre] = 1

# Drop useless columns
movies.drop(columns=['Genres'], inplace=True)
movies.drop(columns=['MovieID'], inplace=True)
movies

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies['numVotes'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies['averageRating'].fillna(0, inplace=True)


['Action', 'Adult', 'Adventure', 'Animation', 'Biography', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']


Unnamed: 0,MovieId,Title,Year,isAdult,runtimeMinutes,averageRating,numVotes,Action,Adult,Adventure,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,1,toy story,1995,0,81,4.15,1076431,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2,jumanji,1995,0,104,3.55,380163,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3,grumpier old men,1995,0,101,3.30,29874,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,waiting to exhale,1995,0,124,3.00,12314,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,5,father of the bride part ii,1995,0,106,3.05,41946,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3187,3948,meet the parents,2000,0,108,3.50,357427,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3188,3949,requiem for a dream,2000,0,102,4.15,904474,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3189,3950,tigerland,2000,0,101,3.45,43504,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3190,3951,two family house,2000,0,108,3.60,1734,0,0,0,...,0,0,0,0,1,0,0,0,0,0


# Feature Engineering

## Extract Features

In [22]:
# Create correcte dataset for the recommendation system
def extract_features(df_rating, df_users, df_movies):
    # set the types of the columns
    df_rating['Rating'] = df_rating['Rating'].astype(float)
    df_rating['UserId'] = df_rating['UserId'].astype(int)
    df_rating['MovieId'] = df_rating['MovieId'].astype(int)

    df_users['UserId'] = df_users['UserId'].astype(int)
    df_users['Age'] = df_users['Age'].astype(int)
    df_users['Gender'] = df_users['Gender'].map({'M': 0, 'F': 1})

    df_movies.replace('\\N', np.nan, inplace=True)
    df_movies.fillna(0, inplace=True)

    return df_rating, df_users, df_movies
ratings, users, movies = extract_features(df_rating, df_users, movies)

## User-Item matrix

In [23]:
#User-Item matrix
user_item_matrix = df_rating.pivot(index='UserId', columns='MovieId', values='Rating')
user_item_matrix.fillna(0, inplace=True)
display(user_item_matrix.head())

MovieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
#Save the datasets, we will use them in the next notebook (info/Experiment.ipynb)
def save_datasets():
    ratings.to_csv('data/save/ratings.csv', index=False)
    users.to_csv('data/save/users.csv', index=False)
    movies.to_csv('data/save/movies.csv', index=False)

#save_datasets()

# Model Development

## Spark ALS

In [25]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
import matplotlib.pyplot as plt
import logging

logging.getLogger().setLevel(logging.INFO)

# Set plot parameters
plt.rcParams["figure.figsize"] = (20, 13)
%matplotlib inline
%config InlineBackend.figure_format = "retina"

In [26]:
# Create a Spark session
spark = SparkSession.builder.appName("ALSMatrixFactorisation").getOrCreate()

In [27]:
# Create a Spark DataFrame from a pandas DataFrame
ratings_spark = spark.createDataFrame(ratings)
users_spark = spark.createDataFrame(users)
movies_spark = spark.createDataFrame(movies)
df_pred = ratings_spark.join(users_spark, on='UserId').join(movies_spark, on='MovieId')

# indexer = [
#     StringIndexer(inputCol=column, outputCol=column + "_index")
#     for column in list(set(df_pred.columns) - set(["rating"]))
# ]
indexer = [
    StringIndexer(inputCol=column, outputCol=column + "_index") 
    for column in ['UserId', 'MovieId']
]

# Create a pipeline
pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(df_pred).transform(df_pred)
transformed.show()

+-------+------+------+------+---+--------------------+----+-------+--------------+-------------+--------+------+-----+---------+---------+---------+----------+------+-----+-----------+-----+------+-------+---------+-------+------+-----+-------+-------+-------+------+-----+--------+---+-------+------------+-------------+
|MovieId|UserId|Rating|Gender|Age|               Title|Year|isAdult|runtimeMinutes|averageRating|numVotes|Action|Adult|Adventure|Animation|Biography|Children's|Comedy|Crime|Documentary|Drama|Family|Fantasy|Film-Noir|History|Horror|Music|Musical|Mystery|Romance|Sci-Fi|Sport|Thriller|War|Western|UserId_index|MovieId_index|
+-------+------+------+------+---+--------------------+----+-------+--------------+-------------+--------+------+-----+---------+---------+---------+----------+------+-----+-----------+-----+------+-------+---------+-------+------+-----+-------+-------+-------+------+-----+--------+---+-------+------------+-------------+
|   3506|    26|   4.0|     0| 

In [28]:
# Split the data into training and test sets
(training, test) = transformed.randomSplit([0.8, 0.2], seed=42)

In [29]:
# Build the recommendation model using ALS on the training data
als = ALS(
    maxIter=5,
    regParam=0.09,
    rank=25,
    userCol="UserId_index",
    itemCol="MovieId_index",
    ratingCol="Rating",
    coldStartStrategy="drop",
    nonnegative=True,
)

# Fit the ALS model to the training set
model = als.fit(training)

In [30]:
training_predictions = model.transform(training)
print("Training dataset predictions")
training_predictions.show()

Training dataset predictions
+-------+------+------+------+---+---------+----+-------+--------------+-------------+--------+------+-----+---------+---------+---------+----------+------+-----+-----------+-----+------+-------+---------+-------+------+-----+-------+-------+-------+------+-----+--------+---+-------+------------+-------------+----------+
|MovieId|UserId|Rating|Gender|Age|    Title|Year|isAdult|runtimeMinutes|averageRating|numVotes|Action|Adult|Adventure|Animation|Biography|Children's|Comedy|Crime|Documentary|Drama|Family|Fantasy|Film-Noir|History|Horror|Music|Musical|Mystery|Romance|Sci-Fi|Sport|Thriller|War|Western|UserId_index|MovieId_index|prediction|
+-------+------+------+------+---+---------+----+-------+--------------+-------------+--------+------+-----+---------+---------+---------+----------+------+-----+-----------+-----+------+-------+---------+-------+------+-----+-------+-------+-------+------+-----+--------+---+-------+------------+-------------+----------+
| 

In [31]:
#Evaluate the model
evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="Rating", predictionCol="prediction"
)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)

print("RMSE=" + str(rmse))
predictions.show()

RMSE=0.8796186555993644
+-------+------+------+------+---+----------------+----+-------+--------------+-------------+--------+------+-----+---------+---------+---------+----------+------+-----+-----------+-----+------+-------+---------+-------+------+-----+-------+-------+-------+------+-----+--------+---+-------+------------+-------------+----------+
|MovieId|UserId|Rating|Gender|Age|           Title|Year|isAdult|runtimeMinutes|averageRating|numVotes|Action|Adult|Adventure|Animation|Biography|Children's|Comedy|Crime|Documentary|Drama|Family|Fantasy|Film-Noir|History|Horror|Music|Musical|Mystery|Romance|Sci-Fi|Sport|Thriller|War|Western|UserId_index|MovieId_index|prediction|
+-------+------+------+------+---+----------------+----+-------+--------------+-------------+--------+------+-----+---------+---------+---------+----------+------+-----+-----------+-----+------+-------+---------+-------+------+-----+-------+-------+-------+------+-----+--------+---+-------+------------+------------

The model is efficient (**RMSE** is quite low).

# Recommendation Algorithm

In [93]:
user_recs = model.recommendForAllUsers(10)
user_recs.show()

+------------+--------------------+
|UserId_index|     recommendations|
+------------+--------------------+
|           1|[{2984, 5.6956687...|
|          12|[{2984, 5.4385176...|
|          22|[{2985, 5.0158916...|
|          26|[{2984, 5.584358}...|
|          27|[{2984, 5.455725}...|
|          28|[{2985, 5.677609}...|
|          31|[{2985, 4.6694717...|
|          34|[{2984, 4.383502}...|
|          44|[{2953, 4.2398953...|
|          47|[{2985, 4.849185}...|
|          52|[{2985, 4.5756693...|
|          53|[{2984, 5.7903295...|
|          65|[{2984, 4.429078}...|
|          76|[{2892, 5.1394987...|
|          78|[{2985, 5.2433043...|
|          81|[{2985, 5.2183666...|
|          85|[{2985, 4.954113}...|
|          91|[{2984, 4.829773}...|
|          93|[{2984, 4.7932863...|
|         101|[{2984, 5.826962}...|
+------------+--------------------+
only showing top 20 rows



In [97]:
#transformed user index to user id and movie index to movie id in the recommendations
# return a pandas dataframe (instead of spark dataframe) with the user id and the list of recommended movies
#code is adapted from ALS recommender system course
def users_recommandation(num_recommendations=10):
    recs = model.recommendForAllUsers(num_recommendations).toPandas()
    df_recs = (
        recs.recommendations.apply(pd.Series)
        .merge(recs, right_index=True, left_index=True)
        .drop(["recommendations"], axis=1)
        .melt(id_vars=["UserId_index"], value_name="recommendation")
        .drop("variable", axis=1)
        .dropna()
    )

    df_recs = df_recs.sort_values("UserId_index")
    df_recs = pd.concat(
        [df_recs["recommendation"].apply(pd.Series), df_recs["UserId_index"]], axis=1
    )

    df_recs.columns = ["MovieId_index", "Rating", "UserId_index"]
    tmp = transformed.select(
        transformed["UserId"],
        transformed["UserId_index"],
        transformed["MovieId"],
        transformed["MovieId_index"],
    )
    tmp = tmp.toPandas()

    dict1 = dict(zip(tmp["UserId_index"], tmp["UserId"]))
    dict2 = dict(zip(tmp["MovieId_index"], tmp["MovieId"]))

    df_recs_copy = df_recs.copy()
    df_recs_copy.loc[:, "UserId"] = df_recs["UserId_index"].map(dict1)
    df_recs_copy.loc[:, "MovieId"] = df_recs["MovieId_index"].map(dict2)
    df_recs_copy = df_recs_copy.sort_values("UserId")
    df_recs_copy.reset_index(drop=True, inplace=True)

    new = df_recs_copy[["UserId", "MovieId", "Rating"]]
    new["recommendations"] = list(zip(new.MovieId, new.Rating))

    res = new[["UserId", "recommendations"]]
    res_new = res["recommendations"].groupby([res.UserId]).apply(list).reset_index()

    #print(res_new)
    return res_new

In [98]:
# Get the name of the movies from their ids
def get_Movies_Name(movies, recommended_movie_ids):
    names = []
    for movie_id in recommended_movie_ids:
        movie_name = df_movies.loc[df_movies['MovieId'] == movie_id, 'Title'].values[0]
        names.append(movie_name)
    return names

# Get the movie Id from the dataframe
def get_movie_ids(user_id, dataframe):
    # Trouver les recommandations pour l'utilisateur donné
    user_recommendations = dataframe[dataframe['UserId'] == user_id]['recommendations'].values
    if len(user_recommendations) == 0:
        return []
    # Extraire les MovieId des recommandations
    movie_ids = [movie_id for movie_id, rating in user_recommendations[0]]
    return movie_ids

In [100]:
res_new = users_recommandation(15)
res_new.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new["recommendations"] = list(zip(new.MovieId, new.Rating))


Unnamed: 0,UserId,recommendations
0,1,"[(1207, 4.477933406829834), (1851, 5.046184539..."
1,2,"[(953, 4.253499507904053), (3233, 4.5409717559..."
2,3,"[(356, 4.479243755340576), (527, 4.31512165069..."
3,4,"[(2930, 4.5216898918151855), (2494, 4.57206249..."
4,5,"[(2132, 3.8848812580108643), (3949, 3.82977628..."


In [101]:
#Test the recommendation system for the one user
user_id = 1
movie_ids = get_movie_ids(user_id, res_new)
print(f"Les MovieIds recommandés pour l'utilisateur {user_id} sont : {movie_ids}")

Les MovieIds recommandés pour l'utilisateur 1 sont : [1207, 1851, 318, 1132, 3233, 572, 953, 919, 2197, 1035, 904, 557, 527, 858, 2562]


### Recommandation for a couple

In [127]:
def get_Recommended_Movies(user1_id, user2_id, dataframe):
    user1_movie_ids = get_movie_ids(user1_id, dataframe)
    user2_movie_ids = get_movie_ids(user2_id, dataframe)

    #print(f"The recommended MovieIds for user {user1_id} are: {user1_movie_ids}")
    #print(f"The recommended MovieIds for user {user2_id} are: {user2_movie_ids}")

    # find the common recommended movies
    recommended_movie_ids = set(user1_movie_ids).intersection(user2_movie_ids)

    # if there are less than 2 common movies, we add additional movies (from the user's recommendations)
    if len(recommended_movie_ids) <= 2:
        additional_movies = list(recommended_movie_ids)
        user1_additional = [movie_id for movie_id in user1_movie_ids if movie_id not in recommended_movie_ids]
        user2_additional = [movie_id for movie_id in user2_movie_ids if movie_id not in recommended_movie_ids]
        
        additional_needed = 5 - len(additional_movies)
        additional_needed_user1 = min(additional_needed // 2 + additional_needed % 2, len(user1_additional))
        additional_needed_user2 = min(additional_needed // 2, len(user2_additional))
        
        additional_movies.extend(user1_additional[:additional_needed_user1])
        additional_movies.extend(user2_additional[:additional_needed_user2])

        recommended_movie_ids = additional_movies

    return get_Movies_Name(movies, recommended_movie_ids)

In [128]:
# Test the recommendation system for two users
user1_id = 1  #132
user2_id = 2  #98
recommendations = get_Recommended_Movies(user1_id, user2_id, res_new)
print(f"Recommended movies for users {user1_id} and {user2_id} are:")
for movie in recommendations:
    print(" -", movie)

Recommended movies for users 1 and 2 are:
 - smashing time
 - bandits
 - schindler's list
 - it's a wonderful life
 - leather jacket love story
 - foreign student
 - shawshank redemption


### Conclusion

This model is not based solely on predicting users' ratings based on the ratings they have already given. The model is not precise but will predict movie recommendations similar to those already liked by both users.