# Recommender System

# Data Collection and Preprocessing

#### Path to modify for your situation :

In [2]:
DATA_PATH = "/kaggle/input/recommender-datasets/data"

#### If you haven't already collected the movielens data, You can use the following cell to download it

In [None]:
%%bash
if [ ! -d data/movielens_complete ]; 
then    
    wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
    mkdir -p data/movielens_complete
    unzip -o ml-1m.zip -d data/movielens_complete;
else
    echo "Data already downloaded";
fi

In [None]:
%%bash
if [ ! -d data/imdb ]; 
then    
    mkdir -p data/imdb
    cd data/imdb
    wget https://datasets.imdbws.com/title.basics.tsv.gz
    wget https://datasets.imdbws.com/title.crew.tsv.gz
    gzip -d title.basics.tsv.gz
    gzip -d title.crew.tsv.gz
else
    echo "Data already downloaded";
fi

## Import data

### a) movielens

#### imports

In [3]:
import numpy as np
import pandas as pd
from typing import List
from scipy.sparse import csr_matrix

import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams["figure.figsize"] = (20, 13)
sns.set()
%matplotlib inline
%config InlineBackend.figure_format = "retina"

#### I will start by storing the data into dataframes

#### Ratings

In [4]:
df_rating = pd.read_csv(
    f"{DATA_PATH}/movielens_complete/ml-1m/ratings.dat",
    sep="::",
    encoding="ISO-8859-1",
    engine="python",
    header=None
)
df_rating.columns = ["UserId", "MovieId", "Rating", "Timestamp"]

In [5]:
display(df_rating.head(5))

Unnamed: 0,UserId,MovieId,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


#### User data

In [6]:
df_users = pd.read_csv(
    f"{DATA_PATH}/movielens_complete/ml-1m/users.dat",
    sep="::",
    encoding="ISO-8859-1",
    engine="python",
    header=None,
)
df_users.columns = ["UserId", "Gender", "Age", "Occupation", "ZipCode"]

In [7]:
display(df_users.head(5))

Unnamed: 0,UserId,Gender,Age,Occupation,ZipCode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


#### movies data

In [8]:
movielens_movies = pd.read_csv(
    f"{DATA_PATH}/movielens_complete/ml-1m/movies.dat",
    sep="::",
    encoding="ISO-8859-1",
    engine="python",
    header=None,
)
movielens_movies.columns = ["MovieId", "Title", "Genre"]

In [9]:
display(movielens_movies.head(5))

Unnamed: 0,MovieId,Title,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


### b) Imdb

#### movies data

In [10]:
title_basics = pd.read_csv(f"{DATA_PATH}/imdb/title.basics.tsv", sep = '\t', low_memory=False)

In [11]:
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10900922,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family"
10900923,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
10900924,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
10900925,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


#### This dataset does not only contain movie data. I then need to get rid of the data that I won't need to use.

In [12]:
imdb_movies = title_basics[(title_basics['titleType'] == 'movie') | (title_basics['titleType'] == 'tvMovie')]
imdb_movies

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
...,...,...,...,...,...,...,...,...,...
10900845,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,\N,100,Documentary
10900850,tt9916692,tvMovie,Teatroteka: Czlowiek bez twarzy,Teatroteka: Czlowiek bez twarzy,0,2015,\N,66,Drama
10900857,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,\N,\N,Comedy
10900867,tt9916730,movie,6 Gunn,6 Gunn,0,2017,\N,116,Drama


#### crew data

In [13]:
crew = pd.read_csv(f"{DATA_PATH}/imdb/title.crew.tsv", sep = '\t', low_memory=False)

In [14]:
crew

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N
...,...,...,...
10206151,tt9916848,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
10206152,tt9916850,nm1485677,"nm9187127,nm1485677,nm9826385,nm1628284"
10206153,tt9916852,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
10206154,tt9916856,nm10538645,nm6951431


### I) 2 - Merging datasets

#### I will merge these two datasets using the title column. The column in imdb corresponding to the title in movielens seems to be primaryTitle. So this is what I will be using. Also, the format of the two datasets are not the same : the year of production is present in the title of the movielens movies. To fix that, I will merge the primaryTitle and the startYear columns of the imdb dataset, which will enable me to obtain a compatible format.

#### First, let's get rid of all the unecessary columns of the imdb title.basics data :

#### - "titleType will" not be necessary as I will be treating movies and TVmovies the same way, and I already got rid of the unwanted types.
#### - "originalTitle" will not be used, primaryTitle will.
#### - "isAdult" is not a relevant parameter for the current recommendation system.
#### - "endYear" is not important here.
#### - "runtimeMinutes" is not a priority, even though it could be used to eliminate possible duplicates.

In [None]:
imdb_movies.drop(['titleType', 'originalTitle', 
                'isAdult', 'endYear', 'runtimeMinutes'], axis=1, inplace=True)

#### Then, let's change the format of the imdb title to fit the movielens one

In [None]:
imdb_movies['Title'] = imdb_movies['primaryTitle'] + " (" + imdb_movies['startYear'].astype(str) + ")"
imdb_movies.drop(['primaryTitle', 'startYear'], axis=1, inplace=True)

#### Also, the Genre columns format are different for these two datasets. I will choose to keep the movielens format, that seems clearer to me.

In [None]:
imdb_movies["genres"]=imdb_movies["genres"].str.replace(',','|')

#### The two datasets can now be merged based on their title column. Given that I will be using informations regarding the users that rated the movies, AND informations about the movies, I need to do an inner merge, to keep only the movies present in both these datasets : the movies that I have enough information on.

In [18]:
imdb_movies.head()

Unnamed: 0,tconst,genres,Title
8,tt0000009,Romance,Miss Jerry (1894)
144,tt0000147,Documentary|News|Sport,The Corbett-Fitzsimmons Fight (1897)
498,tt0000502,\N,Bohemios (1905)
570,tt0000574,Action|Adventure|Biography,The Story of the Kelly Gang (1906)
587,tt0000591,Drama,The Prodigal Son (1907)


In [19]:
movielens_movies.head()

Unnamed: 0,MovieId,Title,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [20]:
merged_movies = movielens_movies.merge(imdb_movies, how='inner', on='Title')

In [21]:
merged_movies

Unnamed: 0,MovieId,Title,Genre,tconst,genres
0,1,Toy Story (1995),Animation|Children's|Comedy,tt0114709,Adventure|Animation|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy,tt0113497,Adventure|Comedy|Family
2,3,Grumpier Old Men (1995),Comedy|Romance,tt0113228,Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama,tt0114885,Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy,tt0113041,Comedy|Family|Romance
...,...,...,...,...,...
2409,3947,Get Carter (1971),Thriller,tt0067128,Action|Crime|Thriller
2410,3948,Meet the Parents (2000),Comedy,tt0212338,Comedy|Romance
2411,3949,Requiem for a Dream (2000),Drama,tt0180093,Drama
2412,3950,Tigerland (2000),Drama,tt0170691,Drama|War


#### Although the genres have the same names, it's not always the same exact ones. I will choose to keep the one with the most information each time.

In [22]:
def longer_str(s1, s2):
    return s1 if len(s1) >= len(s2) else s2
merged_movies['genre'] = merged_movies.apply(lambda row: longer_str(row['Genre'], row['genres']), axis=1)
merged_movies.drop(['Genre', 'genres'], axis=1, inplace=True)

In [23]:
merged_movies

Unnamed: 0,MovieId,Title,tconst,genre
0,1,Toy Story (1995),tt0114709,Animation|Children's|Comedy
1,2,Jumanji (1995),tt0113497,Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),tt0113228,Comedy|Romance
3,4,Waiting to Exhale (1995),tt0114885,Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),tt0113041,Comedy|Family|Romance
...,...,...,...,...
2409,3947,Get Carter (1971),tt0067128,Action|Crime|Thriller
2410,3948,Meet the Parents (2000),tt0212338,Comedy|Romance
2411,3949,Requiem for a Dream (2000),tt0180093,Drama
2412,3950,Tigerland (2000),tt0170691,Drama|War


#### We now have access to the movie ids for both datasets

#### I can now add informations coming from the imdb dataset

In [24]:
movies = merged_movies.merge(crew, how='left', on='tconst')

In [25]:
movies

Unnamed: 0,MovieId,Title,tconst,genre,directors,writers
0,1,Toy Story (1995),tt0114709,Animation|Children's|Comedy,nm0005124,"nm0005124,nm0230032,nm0004056,nm0710020,nm0923..."
1,2,Jumanji (1995),tt0113497,Adventure|Children's|Fantasy,nm0002653,"nm0378144,nm0852430,nm0833164,nm0885575"
2,3,Grumpier Old Men (1995),tt0113228,Comedy|Romance,nm0222043,nm0425756
3,4,Waiting to Exhale (1995),tt0114885,Comedy|Drama|Romance,nm0001845,"nm0573334,nm0060103"
4,5,Father of the Bride Part II (1995),tt0113041,Comedy|Family|Romance,nm0796124,"nm0352443,nm0329304,nm0583600,nm0796124"
...,...,...,...,...,...,...
2409,3947,Get Carter (1971),tt0067128,Action|Crime|Thriller,nm0388198,"nm0388198,nm0507794"
2410,3948,Meet the Parents (2000),tt0212338,Comedy|Romance,nm0005366,"nm0322839,nm0164898,nm0381272,nm0357453"
2411,3949,Requiem for a Dream (2000),tt0180093,Drama,nm0004716,"nm0782968,nm0004716"
2412,3950,Tigerland (2000),tt0170691,Drama|War,nm0001708,"nm0458462,nm0570077"


#### I will also get rid of the "ZipCode" column of the user informations, that seems to be a less important factor for the choice of a movie

In [26]:
df_users.drop(['ZipCode'], axis=1, inplace=True)

## Extraction of relevant features

#### in the following steps, I decided to use the surprise library, because it was the tool that enabled me to obtain the most accurate and efficient predictions for a custom situation like this one.

In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split

#### Here is an interaction matrix between the users and the movies they rated

In [28]:
interaction_matrix = df_rating.pivot(index='UserId', columns='MovieId', values='Rating')

In [29]:
interaction_matrix

MovieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,2.0,,3.0,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


#### Since I didn't use it for the best predictions I made, I didn't leave the code that was using it.

#### Let's preprocess user features

In [30]:
df_users['Gender'] = LabelEncoder().fit_transform(df_users['Gender'])
df_users['Occupation'] = LabelEncoder().fit_transform(df_users['Occupation'])
scaler = MinMaxScaler()
df_users['Age'] = scaler.fit_transform(df_users[['Age']])

#### and movie features

In [31]:
movies['genre'] = movies['genre'].apply(lambda x: x.split('|'))
movies['directors'] = movies['directors'].apply(lambda x: str(x).split(','))
movies['writers'] = movies['writers'].apply(lambda x: str(x).split(','))

#### We can now create a "surprise" dataset. Surprise requires a specific data format to work, so I created it from the dataframes I have.
#### "Reader" defines the rating scale and helps load the data correctly
#### "Dataset" loads tje data info a format that the library algorithms can process.

In [32]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_rating[['UserId', 'MovieId', 'Rating']], reader)

#### train/test split, with a size of 0.25 for the testset, to have as much data as possible in the trainset, but still have a significant amount in the testset.

In [33]:
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

#### Initialisation and train of the model. I used a SVD model (Singular Value Decomposition), which is a matrix factorization technique for collaborative filtering. It's suitable for my case because it's a robust and reliable technique to handle user preferences.

In [34]:
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7bcb5d32afe0>

# Evaluation

#### I chose the RMSE to evaluate this recommender system. It is a metric that heavily penalize large errors and is interpretable very easily.

In [35]:
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)
print(f'RMSE: {rmse}')

RMSE: 0.8765
RMSE: 0.8765129356324018


#### Here, we can see that the score is 0.8793, taht represents the error we can expect from our recommender system. For this situation, it seems to be an acceptable error.

#### Now, we can predict movies for our couple of users :

#### To do that, we start with their id, getting their own preferences, before combining them to suggest movies that they will both like. To do that, we create one list for each user, with the rating that they would individually give each movie, according to our predictions. After having sorted these two lists, we can combine the scores and store them in a dictionary. After sorting the elements in a descending order in terms of scores, we can select the number of entries we would like, starting from the top. By default, I set this variable to 1, as the subject only ask for one movie.

In [36]:
def suggest_movies_for_couple(user1_id, user2_id, model, movies, top_n=10):
    user1_predictions = []
    user2_predictions = []
    
    for movie_id in movies['MovieId']:
        user1_predictions.append((movie_id, model.predict(user1_id, movie_id).est))
        user2_predictions.append((movie_id, model.predict(user2_id, movie_id).est))
    
    user1_predictions.sort(key=lambda x: x[1], reverse=True)
    user2_predictions.sort(key=lambda x: x[1], reverse=True)
    
    combined_scores = {}
    for (movie_id, score) in user1_predictions:
        combined_scores[movie_id] = combined_scores.get(movie_id, 0) + score
    for (movie_id, score) in user2_predictions:
        combined_scores[movie_id] = combined_scores.get(movie_id, 0) + score
    
    sorted_combined_scores = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    top_movie_ids = [movie_id for movie_id, score in sorted_combined_scores[:top_n]]
    
    return movies[movies['MovieId'].isin(top_movie_ids)].drop_duplicates(subset=['Title'])

#### In this state, only one movie will we displayed. If more suggestions are needed, the variable "top_n" can be adjusted.
#### In that case, the movies will be organized by id.

In [37]:
user1_id = 1
user2_id = 2
top_n = 1
suggested_movies = suggest_movies_for_couple(user1_id, user2_id, model, movies, top_n)
suggested_movies

Unnamed: 0,MovieId,Title,tconst,genre,directors,writers
1733,2820,Hamlet (1964),tt0058126,[Drama],[nm0468882],"[nm0468882, nm0664985, nm0000636]"
