# Movie Recommendation System

## Environment settings

In [1]:
import os
import sys

import pandas as pd

# Root directory of the project. Set the MOVIE_RECSYS_ROOT environment variable
# to run the notebook on a machine where the project is not under the default
# location; without it the original path is used.
ROOT_PATH = os.environ.get("MOVIE_RECSYS_ROOT", "D:\\Internship")

# Make the project's `src` directory importable. The path components are passed
# separately so the separator is chosen by the host OS, and the membership check
# keeps a re-run of this cell from appending a duplicate entry to sys.path.
SRC_PATH = os.path.join(ROOT_PATH, "recsys", "movie_recommendation_system", "src")
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

# My scripts
from movie_recommender.data.tabular_dataset_handler import TabularDatasetHandler
from movie_recommender.models.svd_train_eval_pred import SVDModelHandler
from movie_recommender.recommenders.popularity_ranking import PopularityRanking
from movie_recommender.recommenders.content_based_filtering import ContentBasedFiltering
from movie_recommender.recommenders.collaborative_filtering import CollaborativeFiltering
from movie_recommender.recommenders.hybrid_filtering import HybridFiltering

# Remove warnings
import warnings; warnings.simplefilter('ignore')

# Data and trained models paths. Components are passed to os.path.join one by
# one so the separator matches the host OS instead of a hard-coded backslash.
data_path = os.path.join(ROOT_PATH, "resources", "movielens")
trained_models_path = os.path.join(ROOT_PATH, "trained_models")

# Data preparation

## Data loading

In [2]:
# Create the handler for the tabular datasets located under data_path
tdh = TabularDatasetHandler(data_path)

# Load the datasets into the handler, then display the shape of the frame
# returned by get_movies_df_deepcopy() (recorded output: 45466 rows x 24 columns)
tdh.load_datasets()
tdh.get_movies_df_deepcopy().shape

(45466, 24)

In [3]:
# Preview the first rows of the movies table as loaded, before preprocess_datasets() is called
tdh.get_movies_df_deepcopy().head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
# Column dtypes before preprocessing; in the recorded output 'id', 'budget' and
# 'popularity' are still generic object columns
tdh.get_movies_df_deepcopy().dtypes

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                        object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object

## Data preprocessing

In [5]:
# Run the handler's preprocessing and display the resulting movies shape.
# Recorded output: (45433, 22), i.e. 33 fewer rows than after loading, with the
# columns 'imdb_id', 'poster_path' and 'video' gone and a 'year' column added.
tdh.preprocess_datasets()
tdh.get_movies_df_deepcopy().shape

(45433, 22)

In [6]:
# Preview the first rows after preprocessing (in the recorded output 'genres' and
# 'belongs_to_collection' now hold plain names instead of raw dict strings)
tdh.get_movies_df_deepcopy().head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,popularity,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,year
0,False,Toy Story Collection,30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,7.7,5415.0,1995
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,1995
2,False,Grumpy Old Men Collection,0,"[Romance, Comedy]",,15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,1995
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0,1995
4,False,Father of the Bride Collection,0,[Comedy],,11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0,1995


In [7]:
# Column dtypes after preprocessing; the recorded output shows 'id' as int32,
# while 'budget', 'popularity' and 'year' remain object columns
tdh.get_movies_df_deepcopy().dtypes

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                         int32
original_language         object
original_title            object
overview                  object
popularity                object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
vote_average             float64
vote_count               float64
year                      object
dtype: object

# Models testing

## Popularity Ranking

In [8]:
# Rank movies with PopularityRanking.top_movies_IMDB_wr_formula and show the
# first 10 rows; the recorded output is ordered by the 'wr' column.
# NOTE(review): the argument 50 is presumably the number of movies to return — confirm.
top_movies = PopularityRanking.top_movies_IMDB_wr_formula(tdh, 50)
top_movies.head(10)

Unnamed: 0,id,title,year,vote_count,vote_average,popularity,genres,wr
10309,19404,Dilwale Dulhania Le Jayenge,1995,661,9,34.457024,"[Comedy, Drama, Romance]",8.585589
15480,27205,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.984043
12481,155,The Dark Knight,2008,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.981709
22879,157336,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.979953
2843,550,Fight Club,1999,9678,8,63.869599,[Drama],7.976854
4863,120,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.974826
292,680,Pulp Fiction,1994,8670,8,140.950236,"[Thriller, Crime]",7.974188
314,278,The Shawshank Redemption,1994,8358,8,51.645403,"[Drama, Crime]",7.973234
7000,122,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,"[Adventure, Fantasy, Action]",7.972808
351,13,Forrest Gump,1994,8147,8,48.307194,"[Comedy, Drama, Romance]",7.972547


In [9]:
# Same ranking restricted to a single genre ('Action'); show the first 10 rows.
# NOTE(review): the argument 50 is presumably the number of movies to return — confirm.
top_movies_for_genre = PopularityRanking.top_movies_for_genre(tdh, 'Action', 50)
top_movies_for_genre.head(10)

Unnamed: 0,id,title,year,vote_count,vote_average,popularity,genre,wr
15480,27205,Inception,2010,14075,8,29.108149,Action,7.954991
12481,155,The Dark Knight,2008,12269,8,123.167259,Action,7.948487
4863,120,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,Action,7.929412
7000,122,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,Action,7.92385
5814,121,The Lord of the Rings: The Two Towers,2002,7641,8,29.423537,Action,7.918189
256,11,Star Wars,1977,6778,8,42.149697,Action,7.908111
1154,1891,The Empire Strikes Back,1980,5998,8,19.470959,Action,7.896598
4135,111,Scarface,1983,3017,8,11.299673,Action,7.801599
9430,670,Oldboy,2003,2000,8,10.616859,Action,7.711022
1910,346,Seven Samurai,1954,892,8,15.01777,Action,7.425051


## Content-based filtering

In [10]:
# Shape check before content-based filtering; the recorded output is still
# (45433, 22), the same as right after preprocessing
tdh.get_movies_df_deepcopy().shape

(45433, 22)

In [11]:
# Description-based recommendations for 'The Dark Knight'; show the first 10 rows.
# NOTE(review): the recorded results (e.g. 'Megafault', several 'Jamie ...' titles)
# show no obvious relation to the target movie, unlike the metadata-based results
# below — check description_based_recommender for a possible row/index misalignment.
# NOTE(review): the argument 25 is presumably the number of candidates — confirm.
top_similar_movies = ContentBasedFiltering.description_based_recommender(tdh, 'The Dark Knight', 25)
top_similar_movies.head(10)

Unnamed: 0,id,title,year,description
43461,30289,Megafault,2009,A crack in the world has started... we have 24...
21412,212927,Jamie and Jessie Are Not Together,2011,Jamie is moving in two weeks from Chicago to N...
27441,22901,I Know You Know,2009,"Jamie, an 11-year-old boy, is fascinated by hi..."
11689,14001,Dead Silence,2007,Jamie returns to his hometown in search of ans...
3977,10110,Empire of the Sun,1987,"Jamie Graham, a privileged English boy, is liv..."
28231,244562,Jamie Marks Is Dead,2014,No one seemed to care about Jamie Marks until ...
21676,47038,The Pit,1981,Twelve year-old Jamie Benjamin is a solitary m...
864,10987,Halloween: The Curse of Michael Myers,1995,"Six years ago, Michael Myers terrorized the to..."
27204,19525,It Happened in Brooklyn,1947,"Danny has been in the army for 4 years, yet al..."
4904,18317,Truly Madly Deeply,1990,Nina is totally heartbroken at the death of he...


In [12]:
# Same call with improved=True; the recorded output replaces 'description' with
# 'vote_count', 'vote_average', 'popularity' and 'wr', and is ordered by 'wr'.
# This rebinds top_similar_movies from the previous cell.
top_similar_movies = ContentBasedFiltering.description_based_recommender(tdh, 'The Dark Knight', 25, improved=True)
top_similar_movies.head(10)

Unnamed: 0,id,title,year,vote_count,vote_average,popularity,wr
12,10229,A Walk to Remember,2002,1057,7,8.054499,6.945427
4,10110,Empire of the Sun,1987,491,7,10.220906,6.886322
14,21734,Shadow of a Doubt,1943,219,7,8.007141,6.762937
19,43347,Love & Other Drugs,2010,1268,6,7.575696,5.978603
3,14001,Dead Silence,2007,533,6,10.763622,5.950747
22,11096,Hide and Seek,2005,455,6,6.102059,5.942852
5,244562,Jamie Marks Is Dead,2014,34,5,8.442355,5.057805
16,157409,Crystal Fairy & the Magical Cactus,2013,65,5,10.82388,5.039255
13,10017,The Wraith,1986,109,5,7.657918,5.02697
7,10987,Halloween: The Curse of Michael Myers,1995,173,5,8.768288,5.018534


In [13]:
# Metadata-based recommendations for 'The Dark Knight' (improved=False); the
# recorded output exposes the 'soup' string used for each movie.
# NOTE(review): 'Doodlebug' (id 43629) is listed twice in the recorded output, so
# the movies table appears to contain duplicate rows — consider de-duplicating upstream.
top_similar_movies = ContentBasedFiltering.metadata_based_recommender(tdh, 'The Dark Knight', 25, improved=False)
top_similar_movies.head(10)

Unnamed: 0,id,title,year,soup
18326,49026,The Dark Knight Rises,2012,christianbale michaelcaine garyoldman christop...
10158,272,Batman Begins,2005,christianbale michaelcaine liamneeson christop...
11399,1124,The Prestige,2006,hughjackman christianbale michaelcaine christo...
25983,43629,Doodlebug,1997,jeremytheobald christophernolan christophernol...
25982,43629,Doodlebug,1997,jeremytheobald christophernolan christophernol...
5274,320,Insomnia,2002,alpacino robinwilliams hilaryswank christopher...
22952,157336,Interstellar,2014,matthewmcconaughey jessicachastain annehathawa...
4110,77,Memento,2000,guypearce carrie-annemoss joepantoliano christ...
2474,11660,Following,1998,jeremytheobald alexhaw lucyrussell christopher...
15547,27205,Inception,2010,leonardodicaprio josephgordon-levitt ellenpage...


In [14]:
# Same call with improved=True; the recorded output carries vote statistics and
# is ordered by the 'wr' column instead of showing the 'soup' string.
top_similar_movies = ContentBasedFiltering.metadata_based_recommender(tdh, 'The Dark Knight', 25, improved=True)
top_similar_movies.head(10)

Unnamed: 0,id,title,year,vote_count,vote_average,popularity,wr
11,27205,Inception,2010,14075,8,29.108149,7.881503
8,157336,Interstellar,2014,11187,8,32.213481,7.853787
2,1124,The Prestige,2006,4510,8,16.94556,7.681625
9,77,Memento,2000,4168,8,15.450789,7.661191
0,49026,The Dark Knight Rises,2012,9263,7,20.58258,6.938143
1,272,Batman Begins,2005,7511,7,28.505341,6.925645
12,374720,Dunkirk,2017,2712,7,30.938854,6.833503
23,1359,American Psycho,2000,2128,7,13.686715,6.803935
26,142,Brokeback Mountain,2005,1531,7,11.787044,6.760448
7,320,Insomnia,2002,1181,6,11.424974,6.220286


## Collaborative filtering

In [15]:
# SVD model with cross-validation (the recorded run used 5 splits).
# The last call displays a pair that matches the mean RMSE and mean MAE of the
# printed table.
# NOTE(review): no random seed is set in this notebook, so the figures may differ
# between runs — confirm whether SVDModelHandler seeds internally.
# NOTE(review): trained_model_path is presumably where the fitted model is saved — confirm.
SVD_model = SVDModelHandler(tabular_dataset_handler=tdh)
SVD_model.train_with_cross_validation(trained_model_path=trained_models_path)
SVD_model.evaluate_performance_with_cross_validation()

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9014  0.8911  0.9029  0.8935  0.8977  0.8973  0.0045  
MAE (testset)     0.6943  0.6870  0.6927  0.6895  0.6895  0.6906  0.0026  
Fit time          0.81    0.83    0.78    0.84    0.79    0.81    0.03    
Test time         0.08    0.08    0.08    0.08    0.08    0.08    0.00    


(0.8973220247952547, 0.6906033634797076)

In [16]:
# SVD model without cross-validation.
# This rebinds SVD_model, replacing the cross-validated handler created in the
# previous cell; the following cells use this instance.
# NOTE(review): the same trained_models_path is passed as in the previous cell —
# check whether this overwrites the previously saved model.
SVD_model = SVDModelHandler(tabular_dataset_handler=tdh)
SVD_model.train(trained_model_path=trained_models_path)
SVD_model.evaluate_performance()

RMSE: 0.8990
MAE:  0.6906


(0.8989575747218261, 0.6905553223271008)

In [17]:
# Define a collaborative filtering recommender based on the SVD model
SVD_recommender = CollaborativeFiltering(model_handler=SVD_model)

# Predict the rating that a sample user would give to a sample movie.
# 'rating' is passed as the third argument and is echoed back as r_ui in the
# recorded Prediction, next to the estimate 'est'.
user_id = 1
movie_id = 302
rating = 3
SVD_model.predict(user_id, movie_id, rating)

Prediction(uid=1, iid=302, r_ui=3, est=2.9173434147380335, details={'was_impossible': False})

In [18]:
# Compare the ratings a user actually gave (gt_rating) with the ratings the
# SVD-based recommender estimates for the same movies (est_rating).
users_ratings_df = tdh.get_users_ratings_df_deepcopy()
movies_df = tdh.get_movies_df_deepcopy()

# User whose rated movies are inspected
user_id = 100

# Keep at most the first 10 ratings recorded for this user
seen_movies_ratings = users_ratings_df.loc[
    users_ratings_df['userId'] == user_id, ['userId', 'movieId', 'rating']
].head(10)

# Ids of the movies covered by those ratings
seen_movies_ids = seen_movies_ratings['movieId']

# Look the rated movies up in the movies table. The table can contain several
# rows with the same id, so keep a single row per id; otherwise the merge below
# would list the same movie more than once.
seen_movies = movies_df.loc[
    movies_df['id'].isin(seen_movies_ids), ['id', 'title']
].drop_duplicates(subset='id')

# Attach the user's ratings to the movie titles, drop the redundant 'movieId'
# key column and expose the true rating as 'gt_rating'.
# NOTE(review): this joins the ratings' 'movieId' directly on the movies' 'id';
# it assumes both columns share one id space — confirm in TabularDatasetHandler.
merged_df = (
    seen_movies
    .merge(seen_movies_ratings, left_on='id', right_on='movieId', how='left')
    .drop(columns=['movieId'])
    .rename(columns={'rating': 'gt_rating'})
)

# Estimated rating for every listed movie. Mapping over the 'id' column (rather
# than a row-wise DataFrame.apply) also works when no movie matched: apply on an
# empty frame returns a DataFrame, which cannot be assigned to a single column.
merged_df['est_rating'] = merged_df['id'].map(
    lambda seen_id: SVD_recommender.predict(user_id, seen_id).est
)

merged_df

Unnamed: 0,id,title,userId,gt_rating,est_rating
0,6,Judgment Night,100,3.0,3.811368
1,62,2001: A Space Odyssey,100,3.0,3.841477
2,88,Dirty Dancing,100,2.0,2.917387
3,25,Jarhead,100,4.0,3.668344
4,86,The Elementary Particles,100,3.0,3.303812
5,3,Shadows in Paradise,100,4.0,3.083656


## Hybrid recommender

In [19]:
# Build the hybrid recommender on top of the SVD-based collaborative recommender
hybrid_SVD_recommender = HybridFiltering(collaborative_filtering=SVD_recommender)

In [20]:
# Suggest to the user movies (both new and already rated) similar to a target movie.
# NOTE(review): in the recorded output 'gt_rating' is empty for all 10 rows and
# several rows share the same 'est_rating' (2.833211) — check whether these movie
# ids are known to the trained model.
# NOTE(review): the argument 25 is presumably the number of candidates — confirm.
user_id = 1
movie_title = 'The Dark Knight'
hybrid_SVD_recommender.suggest_similar_movies(user_id, movie_title, 25).head(10)

Unnamed: 0,id,title,year,soup,est_rating,gt_rating
10158,272,Batman Begins,2005,christianbale michaelcaine liamneeson christop...,3.134781,
4110,77,Memento,2000,guypearce carrie-annemoss joepantoliano christ...,3.042788,
11399,1124,The Prestige,2006,hughjackman christianbale michaelcaine christo...,3.042672,
40201,112767,Shadow Run,1998,michaelcaine jamesfox matthewpochin nan nan na...,2.957501,
18326,49026,The Dark Knight Rises,2012,christianbale michaelcaine garyoldman christop...,2.833211,
7248,10544,Ned Kelly,2003,heathledger orlandobloom geoffreyrush gregorjo...,2.833211,
24970,147441,Exodus: Gods and Kings,2014,christianbale joeledgerton johnturturro ridley...,2.833211,
4185,9476,A Knight's Tale,2001,heathledger rufussewell shannynsossamon brianh...,2.833211,
4870,35201,The Holcroft Covenant,1985,michaelcaine anthonyandrews victoriatennant jo...,2.833211,
19247,32050,The Black Windmill,1974,michaelcaine donaldpleasence delphineseyrig do...,2.833211,


In [21]:
# Suggest to the user movies (both new and already rated) similar to a target
# movie; same query as the previous cell, for a different user.
# NOTE(review): in the recorded output 'gt_rating' is empty for all 10 rows and
# the last rows share the same 'est_rating' (3.250035) — check whether these
# movie ids are known to the trained model.
user_id = 200
movie_title = 'The Dark Knight'
hybrid_SVD_recommender.suggest_similar_movies(user_id, movie_title, 25).head(10)

Unnamed: 0,id,title,year,soup,est_rating,gt_rating
11399,1124,The Prestige,2006,hughjackman christianbale michaelcaine christo...,3.581634,
10158,272,Batman Begins,2005,christianbale michaelcaine liamneeson christop...,3.55586,
10358,4442,The Brothers Grimm,2005,heathledger mattdamon mackenziecrook terrygill...,3.522216,
4110,77,Memento,2000,guypearce carrie-annemoss joepantoliano christ...,3.467415,
40201,112767,Shadow Run,1998,michaelcaine jamesfox matthewpochin nan nan na...,3.458498,
3633,479,Shaft,2000,samuell.jackson jeffreywright christianbale jo...,3.321119,
5274,320,Insomnia,2002,alpacino robinwilliams hilaryswank christopher...,3.283241,
9177,13428,Two Hands,1999,heathledger bryanbrown rosebyrne gregorjordan ...,3.250035,
7248,10544,Ned Kelly,2003,heathledger orlandobloom geoffreyrush gregorjo...,3.250035,
24970,147441,Exodus: Gods and Kings,2014,christianbale joeledgerton johnturturro ridley...,3.250035,


In [22]:
# MY COMMENT: It does not make sense to compute RMSE and MAE to evaluate the performance of the first classical techniques (popularity ranking and content-based filtering), because they are not user-based.