# Data preprocessing

In [1]:
def is_colab():
    try:
        import google.colab
        return True
    except ImportError:
        return False

if is_colab():
    from google.colab import drive
    drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import subprocess
import sys

try:
    __import__("surprise")
    print('surprise is already installed.')
except ImportError:
    print(f'surprise is not installed. Installing...')
    subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-surprise"])

surprise is not installed. Installing...


In [3]:
import os
import random
import time

import numpy as np
import pandas as pd
import seaborn as sns
import requests
import surprise
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import accuracy, Dataset, Reader
from surprise.model_selection import train_test_split

# fix randomness
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [4]:
# Load and process movies data
#work_directory = './'
work_directory = '/content/drive/MyDrive/Colab Notebooks/NAML_Labs/NAML_Project/'
movies_path = work_directory + 'Data/movies.dat'
ratings_path = work_directory + 'Data/ratings.dat'
movies_columns = ["MovieID", 'MovieTitle(Year)', 'Genre']
ratings_columns = ["UserID", "MovieID", "Ratings", "RatingTimestamp"]

# Read and process the movies data
data_movies = pd.read_csv(movies_path, delimiter='::', names=movies_columns, engine='python')
data_movies[['Title', 'Release_year']] = data_movies['MovieTitle(Year)'].str.extract(r'(.*)\s+\((\d+)\)')
data_movies.drop(columns=['MovieTitle(Year)'], inplace=True)
data_movies['Release_year'] = pd.to_numeric(data_movies['Release_year'])
data_movies = data_movies.query('2014 <= Release_year <= 2017').drop_duplicates().reset_index(drop=True)
data_movies['Genre'] = data_movies['Genre'].str.split('|')
data_movies.dropna(subset=['Genre'], inplace=True)

# Read and filter the ratings data
data_ratings = pd.read_csv(ratings_path, delimiter='::', names=ratings_columns, engine='python')
data_ratings = data_ratings[data_ratings['MovieID'].isin(data_movies['MovieID'])].groupby('UserID').filter(
    lambda x: len(x) >= 20).reset_index(drop=True)

# Update data_movies to keep only movies present in filtered data_ratings
data_movies = data_movies[data_movies['MovieID'].isin(data_ratings['MovieID'])].reset_index(drop=True)

In [5]:
# Load movie attributes and association data
movie_attributes = pd.read_json(os.path.join(work_directory, 'Data/movie_tmdb_attributes.json'))
tmdb_imdb_association = pd.read_json(os.path.join(work_directory, 'Data/tmdb_imdb_association.json'))

# Filter movies present in the association table
movie_attributes = movie_attributes[movie_attributes['id'].isin(tmdb_imdb_association['tmdb_id'])].reset_index(
    drop=True)

# Select and transform the necessary columns
movie_attributes_selected = movie_attributes[
    ['id', 'title', 'runtime', 'original_language', 'popularity', 'budget']].copy()
movie_attributes_selected['production_companies'] = movie_attributes['production_companies'].apply(
    lambda x: [d['name'] for d in x])
movie_attributes_selected['production_countries'] = movie_attributes['production_countries'].apply(
    lambda x: [d['name'] for d in x])
# drop duplicates by id
movie_attributes_selected.drop_duplicates(subset=['id'], inplace=True)

# Load movie credits data
movie_credits = pd.read_json(os.path.join(work_directory, 'Data/movie_credits.json'))

# Filter and remove duplicates by 'id'
movie_credits = movie_credits[movie_credits['id'].isin(tmdb_imdb_association['tmdb_id'])].drop_duplicates(
    subset=['id']).reset_index(drop=True)

# Extract relevant cast and crew information
movie_contribution = pd.DataFrame({
    'id': movie_credits['id'],
    'actors': movie_credits['cast'].apply(lambda x: [d['name'] for d in x]),
    'actors_popularity': movie_credits['cast'].apply(lambda x: [d['popularity'] for d in x]),
    'directors': movie_credits['crew'].apply(
        lambda x: [d['name'] for d in x if d['known_for_department'] == 'Directing']),
    'writers': movie_credits['crew'].apply(lambda x: [d['name'] for d in x if d['known_for_department'] == 'Writing'])
})

# Merge the movie attributes and contribution data on 'id'
tmdb_movies = pd.merge(movie_attributes_selected, movie_contribution, on='id', how='inner')
tmdb_movies.shape

(4524, 12)

In [6]:
ground_truth_path = work_directory + 'Data/groundtruth.json'
api_key = "2d1b8795af328fd67395ac695c841792"

# Evaluation Functions

In [7]:
def precision_at_k(testset_df, recommender_object, k=10, relevance_threshold=5, verbose=1, users_to_monitor=100):
    """
    Computes the precision at K for a recommender system over a test set.

    Precision at K measures how many of the top K recommended items are
    relevant, where relevance is determined by a threshold on the true ratings
    (i.e., items rated higher than the threshold are considered relevant).

    Parameters:
    -----------
    testset_df : pandas.DataFrame
        The test set containing actual ratings from users. The DataFrame should
        have the following columns:
        - 'UserID': User IDs
        - 'id': Item IDs (e.g., movie IDs)
        - 'Rarings': True ratings given by the user (ground truth).

    recommender_object : object
        The recommender system object. This object should have a
        `recommend(user_id, k)` method, which returns a list of the top K
        recommended items for the given user.

    k : int, optional (default=10)
        The number of top items to consider for calculating precision. For
        example, if K=10, precision will be calculated for the top 10
        recommendations.

    relevance_threshold : int or float, optional (default=5)
        The rating threshold above which items are considered relevant. For
        instance, if the true rating (r_ui) for an item is greater than or
        equal to `relevance_threshold`, the item is considered relevant.

    verbose : int, optional (default=1)
        Controls whether to print progress information during the computation.
        - If verbose=1, prints progress updates.
        - If verbose=0, no output will be printed.

    Returns:
    --------
    mean_precision : float
        The average precision at K across all users in the test set.

    Notes:
    ------
    - Precision at K is calculated for each user by dividing the number of
      relevant items in the top K recommendations by K (or fewer, if fewer than K
      items are recommended).
    - The precision scores for all users are averaged to compute the final
      result.
    """

    test_users = testset_df['UserID'].unique()
    n_users = test_users.shape[0]

    relevant_movies = testset_df[testset_df['Ratings'] > relevance_threshold].groupby('UserID')['id'].apply(list).to_dict()
    precision_values = []

    # variables to monitor the time required for the processing
    counter = 0
    avg_processing_time = 0
    start_time = time.time()

    # Iterate over each user in the test set
    for user_id in test_users:

        counter += 1
        if verbose > 0 and counter % users_to_monitor == 0:
            elapsed_time = time.time() - start_time
            avg_processing_time = elapsed_time / (counter + 1)
            estimated_time = (n_users - counter) * avg_processing_time
            print(f"{counter:5d} users processed in {elapsed_time//60:2.0f}m {elapsed_time%60:2.0f}s. "
                  f"Estimated time to completion: {estimated_time//60:2.0f}m {estimated_time%60:2.0f}s")

        # Get a list  of all the movies the user has not rated
        # Consider only movies rated above a rating threshold
        user_relevant_movies = relevant_movies.get(user_id, [])

        if not user_relevant_movies:
            continue

        # Predict the ratings for the new movies
        try:
            top_k_recommendations = recommender_object.recommend(user_id, k)
        except Exception as e:
            print(f"Error recommending for user {user_id}: {e}")
            continue
        # Calculate precision for the user
        precision = len(set(top_k_recommendations) & set(user_relevant_movies)) / max(1, min(k, len(top_k_recommendations)))
        precision_values.append(precision)


    # Average precision over all users
    mean_precision = np.mean(precision_values)

    return mean_precision


# a simpler way to compute precision;
# just pass the list of recommendations and relevant
# movies for each user in the testset
def precision(recommendations, relevants):
    """
        recommendations: a list f recommended items for each user
        relevants: a list of relevant items for each user
    """
    precision = 0
    if len(recommendations) > 0 and len(relevants) > 0:
        precision = len(set(recommendations) & set(relevants)) / len(recommendations)

    return precision

# CF Recommender System

## Data Preparation for CF

In [8]:
# Merge the user_rating with the tmdb_movie_id and keep only the TMDB_movie_id as unique identifier of the movie
user_rating_tmdb = pd.merge(
    data_ratings,
    tmdb_imdb_association,
    left_on='MovieID',
    right_on='imdb_id'
).drop(['MovieID', 'imdb_id'], axis=1)

# Keep only the data needed for the CF recommender system
flattened_URM = user_rating_tmdb[['UserID', 'tmdb_id', 'Ratings']]
flattened_URM = flattened_URM.rename(columns={'tmdb_id': 'id'})

# Merge tmdb_movies and the user_ratings
movie_user_data = pd.merge(flattened_URM, tmdb_movies, on='id')

In [9]:
from sklearn.model_selection import train_test_split

train_set, validation_set = train_test_split(flattened_URM, test_size=0.2, random_state=RANDOM_SEED, shuffle=True, stratify=flattened_URM['UserID'])

## Most popular Recommender

A recommender system that suggest the most highly rated movies in the dataset (movie with the highest average rating).


We are testing the precision function on a recsys which always suggest the top rated movies in the DB.

In [None]:
class TopPopRecommender:

    def __init__(self, flattened_URM, correcting_factor=20, correcting_factor_metric=None):
        """
        Initializes the class with a flattened User-Rating Matrix (URM) and a correcting factor.

        Parameters:
        -----------
        flattened_URM : pandas.DataFrame
            A flattened URM containing user ratings for various items (e.g., movies, products).
            The DataFrame should have at least two columns: 'id' (the item ID) and 'Ratings' (the user ratings).
        correcting_factor : int, optional (default=20)
            A factor used to adjust the importance of items with few ratings. Defaults to the 20.
        """

        ratings_count = flattened_URM.groupby('id').size()

        # a correcting factor to lower the importance of movies
        # with few ratings
        if correcting_factor_metric == 'avg':
            correcting_factor = ratings_count.mean()
        elif correcting_factor_metric == 'median':
            correcting_factor = ratings_count.median()

        mean_rating_per_movie = flattened_URM.groupby('id')['Ratings'].apply(lambda x: sum(x)/(len(x) + correcting_factor))

        self.mean_rating_per_movie = mean_rating_per_movie.sort_values(ascending=False)

    def recommend(self, user_id, k):
        # the user_id is required as input to conform to the reccomender_object
        # used in the precision function, but is not functional to how the
        # suggestion are produced. The recommendations are the same for every user
        return self.mean_rating_per_movie.head(k).keys().tolist()

In [None]:
# check that top_pop_recsys works
user = 1
top_pop_recsys = TopPopRecommender(train_set)
top5 = top_pop_recsys.recommend(user, 5)

print('\nMost Popular Recommendations:')
print("\n".join(f"{i + 1}.\t{tmdb_movies[tmdb_movies['id'] == id]['title'].values[0]}" for i, id in enumerate(top5)))
print("\n")


Most Popular Recommendations:
1.	Interstellar
2.	Whiplash
3.	Hacksaw Ridge
4.	Coco
5.	Inside Out




The ouput seems quite reasonable, all quite popular movies.

Next, we will adjust the correction factor to observe its impact on the recommendations.

In [None]:
# experiment with the correcting factor
top_pop_recsys = TopPopRecommender(train_set, correcting_factor=0)
top5 = top_pop_recsys.recommend(user, 5)

print('\nMost Popular Recommendations:')
print("\n".join(f"{i + 1}.\t{tmdb_movies[tmdb_movies['id'] == id]['title'].values[0]}" for i, id in enumerate(top5)))
print("\n")


Most Popular Recommendations:
1.	New Life
2.	Wizard Mode
3.	Negar
4.	Arvydas Sabonis 11
5.	Shock Room




If we do not use a correcting factor we get recommended unknown movies; The system is biased towards movies that have received the highest ratign by very few users.

Now, let's test different correction factors to evaluate their impact.

In [None]:
# experiment with the correcting factor
# use the mean as the correcting factor
top_pop_recsys = TopPopRecommender(train_set, correcting_factor_metric='avg')
top5 = top_pop_recsys.recommend(1, 5)

print('\nMost Popular Recommendations:')
print("\n".join(f"{i + 1}.\t{tmdb_movies[tmdb_movies['id'] == id]['title'].values[0]}" for i, id in enumerate(top5)))
print("\n")


Most Popular Recommendations:
1.	Interstellar
2.	Whiplash
3.	Hacksaw Ridge
4.	La La Land
5.	Inside Out




In [None]:
# experiment with the correcting factor
# use the median as the correcting factor
top_pop_recsys = TopPopRecommender(train_set, correcting_factor_metric='median')
top5 = top_pop_recsys.recommend(1, 5)

print('\nMost Popular Recommendations:')
print("\n".join(f"{i + 1}.\t{tmdb_movies[tmdb_movies['id'] == id]['title'].values[0]}" for i, id in enumerate(top5)))
print("\n")


Most Popular Recommendations:
1.	Interstellar
2.	Dangal
3.	Coco
4.	Whiplash
5.	Hacksaw Ridge




## Test precision function

First, we create a fictional test set where each user has at least one of the top 5 highest-rated movies as a favorite (rating above the threshold).

In [None]:
top5 = top_pop_recsys.recommend(1, 5)
threshold = 5

fict_test_set = validation_set.loc[validation_set['id'].isin(top5) & (validation_set['Ratings'] > threshold)]

The way fict_test_set is construted gurantees that the precision @ 5 is at least 0.2, as there is always one relevant movie (in the top5) for each user. Additionally, it's unlikely that all relevant movies for a user that are also in the top 5 would appear in the validation set and so we expect the precision to be close to 0.2.

In [None]:
precision = precision_at_k(fict_test_set, top_pop_recsys, 5)

print(f"\nPrecision @ 5: {precision:.4f}")

100 users processed in 0 seconds. Estimated time to completion: 0min - 0s
100 users processed in 0 seconds. Estimated time to completion: 0min - 0s
100 users processed in 0 seconds. Estimated time to completion: 0min - 0s
100 users processed in 0 seconds. Estimated time to completion: 0min - 0s
100 users processed in 0 seconds. Estimated time to completion: 0min - 0s
100 users processed in 0 seconds. Estimated time to completion: 0min - 0s

Precision @ 5: 0.2164


Using the same reasoning as for precision@5, the precision@10 should be approximately 0.1, since there is still one relevant movie for each user among the top 10 recommendations

In [None]:
precision = precision_at_k(fict_test_set, top_pop_recsys, 10)

print(f"\nPrecision @ 5: {precision:.4f}")

100 users processed in 0 seconds. Estimated time to completion: 0min - 1s
100 users processed in 0 seconds. Estimated time to completion: 0min - 0s
100 users processed in 0 seconds. Estimated time to completion: 0min - 0s
100 users processed in 0 seconds. Estimated time to completion: 0min - 0s
100 users processed in 0 seconds. Estimated time to completion: 0min - 0s
100 users processed in 0 seconds. Estimated time to completion: 0min - 0s

Precision @ 5: 0.1082


## TopPopRecommender baseline

We can proceed to look at the precion @ 5 and @ 10 on the whole validation set.

In [None]:
print(f"Precision @ 5: {precision_at_k(validation_set, top_pop_recsys, k=5, relevance_threshold=5, verbose=0):.4f}")
print(f"Precision @ 10: {precision_at_k(validation_set, top_pop_recsys, k=10, relevance_threshold=5, verbose=0):.4f}")

Precision @ 5: 0.0401
Precision @ 10: 0.0351


## SVD model - with Surprise libray

In [10]:
# Load the data into a Surprise Dataset object
reader = Reader(rating_scale=(1, 10))
surprise_train_set = Dataset.load_from_df(
    train_set[['UserID', 'id', 'Ratings']],
    reader
).build_full_trainset()

surprise_validation_set = Dataset.load_from_df(
    validation_set[['UserID', 'id', 'Ratings']],
    reader
).build_full_trainset().build_testset()

# Use SVD algorithm to train the model
# use surprise random predictor
algo = surprise.SVD()

# Predict a movie with the trained model
algo.fit(surprise_train_set)
predictions = algo.test(surprise_validation_set)

accuracy.rmse(predictions)
accuracy.fcp(predictions)

RMSE: 1.4184
FCP:  0.6975


0.6975303197451629

We create the recommender object to encapsulate the logic of the model trained with the surprise library.

In [11]:
class SurpriseModel:
    """
    A class to wrap a recommendation model and provide movie recommendations.

    This class uses a recommendation model (e.g., from the `surprise` library)
    to generate movie recommendations for a given user. It also requires a movie
    catalog, which is a list of all movies in the trainset.

    Attributes:
    -----------
    model : object
        The recommendation model used to predict ratings. It should have a
        `predict(user_id, item_id)` method.

    movie_catalog : list
        A list of all movie IDs available for recommendation. The model will
        generate predictions for these movies.

    Methods:
    --------
    recommend(user_id, k):
        Returns the top `k` movie recommendations for the specified user.
    """

    def __init__(self, model, train_set, relevance_threshold=5):
        self.model = model
        self.train_set = train_set
        self.movie_catalog = train_set['id'].unique()
        self.user_profile = train_set.groupby('UserID')['id'].apply(list)
        self.relevance_threshold = relevance_threshold

    def recommend(self, user_id, k):
        # Get a list of all the movies the user has not rated
        predictions = [self.model.predict(user_id, movie_id) for movie_id in self.movie_catalog if movie_id not in self.user_profile[user_id]]

        # Sort the predictions by estimated rating
        predictions.sort(key=lambda x: x.est, reverse=True)

        # Get the top
        top_recommendations = [prediction.iid for prediction in predictions[:k] if prediction.est > 5]

        return top_recommendations

In [12]:
def user_rated_movies_list_print(user, users_watched_list):
    """
    Prints a list of movies rated by a specific user, sorted by rating in descending order.

    :param int user: The UserID of the user whose rated movies are to be printed.
    :param pandas.DataFrame users_watched_list: DataFrame containing user-movie interactions, including 'UserID', 'id' (movie ID), 'title', and 'Ratings'.

    :return: None
    """
    # Print the user rated movies
    movies = users_watched_list.query('UserID == @user').sort_values(by='Ratings', ascending=False)
    print(f"User {user} has rated {len(movies)} movies:")
    print('\n'.join(f"·\t{movie['title']}\t({movie['Ratings']})" for _, movie in movies.iterrows()))

In [13]:
surprise_model = SurpriseModel(algo, train_set)

user_rated_movies_list_print(116, movie_user_data)
res = surprise_model.recommend(116, 10)

print('\nSurprise Recommendations:')
print("\n".join(f"{i + 1}.\t{tmdb_movies[tmdb_movies['id'] == id]['title'].values[0]}" for i, id in enumerate(res)))
print("\n")

User 116 has rated 55 movies:
·	Interstellar	(10)
·	Beasts of No Nation	(10)
·	Mad Max: Fury Road	(10)
·	Baby Driver	(10)
·	Arrival	(9)
·	Blade Runner 2049	(9)
·	Pirates of the Caribbean: Dead Men Tell No Tales	(9)
·	Captain Fantastic	(9)
·	Captain America: Civil War	(9)
·	Logan	(9)
·	Jumanji: Welcome to the Jungle	(9)
·	Zootopia	(9)
·	Fantastic Beasts and Where to Find Them	(9)
·	Spotlight	(9)
·	Three Billboards Outside Ebbing, Missouri	(9)
·	The Imitation Game	(9)
·	John Wick: Chapter 2	(8)
·	Wonder	(8)
·	The Shape of Water	(8)
·	The Killing of a Sacred Deer	(8)
·	Sicario	(8)
·	Brooklyn	(8)
·	Child 44	(8)
·	Creed	(8)
·	Murder on the Orient Express	(8)
·	The Theory of Everything	(8)
·	Manchester by the Sea	(8)
·	Love, Rosie	(8)
·	Spider-Man: Homecoming	(8)
·	Deepwater Horizon	(8)
·	Coco	(7)
·	Loving	(7)
·	Sing	(7)
·	The Disaster Artist	(7)
·	Logan Lucky	(7)
·	I, Tonya	(7)
·	Dunkirk	(7)
·	Silence	(7)
·	Thor: Ragnarok	(7)
·	The Hitman's Bodyguard	(7)
·	Wonder Woman	(7)
·	Jason Bourne	(7

In [14]:
print(f"\nPrecision @ 5: {precision_at_k(validation_set, surprise_model, k=5, relevance_threshold=5, users_to_monitor=100):.4f}")

  100 users processed in  0m  9s. Estimated time to completion:  5m  5s
  200 users processed in  0m 16s. Estimated time to completion:  4m 40s
  300 users processed in  0m 25s. Estimated time to completion:  4m 31s
  400 users processed in  0m 32s. Estimated time to completion:  4m 15s
  500 users processed in  0m 39s. Estimated time to completion:  4m  6s
  600 users processed in  0m 46s. Estimated time to completion:  3m 53s
  700 users processed in  0m 54s. Estimated time to completion:  3m 47s
  800 users processed in  1m  1s. Estimated time to completion:  3m 36s
  900 users processed in  1m  9s. Estimated time to completion:  3m 29s
 1000 users processed in  1m 16s. Estimated time to completion:  3m 19s
 1100 users processed in  1m 23s. Estimated time to completion:  3m 11s
 1200 users processed in  1m 30s. Estimated time to completion:  3m  2s
 1300 users processed in  1m 38s. Estimated time to completion:  2m 55s
 1400 users processed in  1m 44s. Estimated time to completion: 

In [15]:
print(f"\nPrecision @ 10: {precision_at_k(validation_set, surprise_model, k=10, relevance_threshold=5, users_to_monitor=100):.4f}")

  100 users processed in  0m  8s. Estimated time to completion:  4m 33s
  200 users processed in  0m 16s. Estimated time to completion:  4m 29s
  300 users processed in  0m 23s. Estimated time to completion:  4m 16s
  400 users processed in  0m 32s. Estimated time to completion:  4m 14s
  500 users processed in  0m 38s. Estimated time to completion:  3m 59s
  600 users processed in  0m 47s. Estimated time to completion:  3m 55s
  700 users processed in  0m 54s. Estimated time to completion:  3m 46s
  800 users processed in  1m  2s. Estimated time to completion:  3m 38s
  900 users processed in  1m  9s. Estimated time to completion:  3m 27s
 1000 users processed in  1m 17s. Estimated time to completion:  3m 21s
 1100 users processed in  1m 23s. Estimated time to completion:  3m 11s
 1200 users processed in  1m 31s. Estimated time to completion:  3m  4s
 1300 users processed in  1m 37s. Estimated time to completion:  2m 54s
 1400 users processed in  1m 45s. Estimated time to completion: 

# CB Recommender System

##  Prepare the data for a simple CB recommender system

In [16]:
data_movies.info()
data_movies.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7252 entries, 0 to 7251
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   MovieID       7252 non-null   int64 
 1   Genre         7252 non-null   object
 2   Title         7252 non-null   object
 3   Release_year  7252 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 226.8+ KB


Unnamed: 0,MovieID,Genre,Title,Release_year
0,315642,"[Action, Crime, Drama, Mystery, Thriller]",Wazir,2016
1,337926,[Biography],"Chatô, O Rei do Brasil",2015
2,339736,[Horror],The Evil Within,2017
3,365907,"[Action, Crime, Drama, Thriller]",A Walk Among the Tombstones,2014
4,369610,"[Action, Adventure, Sci-Fi]",Jurassic World,2015


In [17]:
tmdb_movies.info()
tmdb_movies.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4524 entries, 0 to 4523
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    4524 non-null   int64  
 1   title                 4524 non-null   object 
 2   runtime               4524 non-null   int64  
 3   original_language     4524 non-null   object 
 4   popularity            4524 non-null   float64
 5   budget                4524 non-null   int64  
 6   production_companies  4524 non-null   object 
 7   production_countries  4524 non-null   object 
 8   actors                4524 non-null   object 
 9   actors_popularity     4524 non-null   object 
 10  directors             4524 non-null   object 
 11  writers               4524 non-null   object 
dtypes: float64(1), int64(3), object(8)
memory usage: 424.2+ KB


Unnamed: 0,id,title,runtime,original_language,popularity,budget,production_companies,production_countries,actors,actors_popularity,directors,writers
0,275269,Wazir,103,hi,7.585,5200000,"[Getaway Films Private Limited, Vinod Chopra F...",[India],"[Amitabh Bachchan, Farhan Akhtar, Aditi Rao Hy...","[28.096, 15.315, 10.861, 24.625, 8.707, 13.033...",[Bejoy Nambiar],"[Abhijeet Deshpande, Vikram Chandra, Abhijat J..."
1,444193,The Evil Within,98,en,12.447,4000000,"[Writers Studio, The, Supernova LLC]",[United States of America],"[Frederick Koehler, Sean Patrick Flanery, Bria...","[11.371, 15.647, 22.645, 65.655, 13.096, 2.956...",[Jim Simone],"[Andrew Getty, Andrew Getty, Robert Stark Hick..."
2,169917,A Walk Among the Tombstones,114,en,40.245,28000000,"[Traveling Picture Show Company, Jersey Films,...",[United States of America],"[Liam Neeson, Dan Stevens, David Harbour, Boyd...","[75.3, 57.58, 38.414, 25.569, 12.312, 5.169, 1...","[Renee Burke, Stephen Apicella, Justin Ritson,...","[Scott Frank, Scott Frank, Lawrence Block, Mar..."
3,135397,Jurassic World,124,en,86.419,150000000,"[Amblin Entertainment, Universal Pictures]",[United States of America],"[Chris Pratt, Bryce Dallas Howard, Irrfan Khan...","[53.735, 28.139, 19.374, 27.158, 41.574, 27.85...","[Steven Spielberg, Colin Trevorrow, Frank Mars...","[Amanda Silver, Rick Jaffa, Michael Crichton, ..."
4,326285,American Pastoral,108,en,11.251,10000000,"[TIK Films, Lakeshore Entertainment, Lionsgate]","[Hong Kong, United States of America]","[Ewan McGregor, Jennifer Connelly, Dakota Fann...","[45.045, 85.93, 50.103, 29.583, 10.107, 7.021,...","[Nilo Otero, Wilma Garscadden-Gahret, Richard ...","[John Romano, Philip Roth, Susan Kim, Josh Sin..."


In [18]:
tmdb_imdb_association.info()
tmdb_imdb_association.head()

<class 'pandas.core.frame.DataFrame'>
Index: 4524 entries, 0 to 4523
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   tmdb_id  4524 non-null   int64
 1   imdb_id  4524 non-null   int64
dtypes: int64(2)
memory usage: 106.0 KB


Unnamed: 0,tmdb_id,imdb_id
0,275269,315642
1,444193,339736
2,169917,365907
3,135397,369610
4,326285,376479


In [19]:
# Merge data movies with tmdb_associations
data_movies = pd.merge(data_movies, tmdb_imdb_association, left_on='MovieID', right_on='imdb_id').rename(
    columns={'tmdb_id': 'id'}).drop(columns=['MovieID', 'imdb_id'])

data_movies.info()
data_movies.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4322 entries, 0 to 4321
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Genre         4322 non-null   object
 1   Title         4322 non-null   object
 2   Release_year  4322 non-null   int64 
 3   id            4322 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 135.2+ KB


Unnamed: 0,Genre,Title,Release_year,id
0,"[Action, Crime, Drama, Mystery, Thriller]",Wazir,2016,275269
1,[Horror],The Evil Within,2017,444193
2,"[Action, Crime, Drama, Thriller]",A Walk Among the Tombstones,2014,169917
3,"[Action, Adventure, Sci-Fi]",Jurassic World,2015,135397
4,"[Crime, Drama]",American Pastoral,2016,326285


In [20]:
# Merge tmdb_movies_MovieID and data_movies to get the genres
data_movies = pd.merge(data_movies, tmdb_movies, on='id').drop('Title', axis=1)

data_movies.info()
data_movies.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4322 entries, 0 to 4321
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Genre                 4322 non-null   object 
 1   Release_year          4322 non-null   int64  
 2   id                    4322 non-null   int64  
 3   title                 4322 non-null   object 
 4   runtime               4322 non-null   int64  
 5   original_language     4322 non-null   object 
 6   popularity            4322 non-null   float64
 7   budget                4322 non-null   int64  
 8   production_companies  4322 non-null   object 
 9   production_countries  4322 non-null   object 
 10  actors                4322 non-null   object 
 11  actors_popularity     4322 non-null   object 
 12  directors             4322 non-null   object 
 13  writers               4322 non-null   object 
dtypes: float64(1), int64(4), object(9)
memory usage: 472.8+ KB


Unnamed: 0,Genre,Release_year,id,title,runtime,original_language,popularity,budget,production_companies,production_countries,actors,actors_popularity,directors,writers
0,"[Action, Crime, Drama, Mystery, Thriller]",2016,275269,Wazir,103,hi,7.585,5200000,"[Getaway Films Private Limited, Vinod Chopra F...",[India],"[Amitabh Bachchan, Farhan Akhtar, Aditi Rao Hy...","[28.096, 15.315, 10.861, 24.625, 8.707, 13.033...",[Bejoy Nambiar],"[Abhijeet Deshpande, Vikram Chandra, Abhijat J..."
1,[Horror],2017,444193,The Evil Within,98,en,12.447,4000000,"[Writers Studio, The, Supernova LLC]",[United States of America],"[Frederick Koehler, Sean Patrick Flanery, Bria...","[11.371, 15.647, 22.645, 65.655, 13.096, 2.956...",[Jim Simone],"[Andrew Getty, Andrew Getty, Robert Stark Hick..."
2,"[Action, Crime, Drama, Thriller]",2014,169917,A Walk Among the Tombstones,114,en,40.245,28000000,"[Traveling Picture Show Company, Jersey Films,...",[United States of America],"[Liam Neeson, Dan Stevens, David Harbour, Boyd...","[75.3, 57.58, 38.414, 25.569, 12.312, 5.169, 1...","[Renee Burke, Stephen Apicella, Justin Ritson,...","[Scott Frank, Scott Frank, Lawrence Block, Mar..."
3,"[Action, Adventure, Sci-Fi]",2015,135397,Jurassic World,124,en,86.419,150000000,"[Amblin Entertainment, Universal Pictures]",[United States of America],"[Chris Pratt, Bryce Dallas Howard, Irrfan Khan...","[53.735, 28.139, 19.374, 27.158, 41.574, 27.85...","[Steven Spielberg, Colin Trevorrow, Frank Mars...","[Amanda Silver, Rick Jaffa, Michael Crichton, ..."
4,"[Crime, Drama]",2016,326285,American Pastoral,108,en,11.251,10000000,"[TIK Films, Lakeshore Entertainment, Lionsgate]","[Hong Kong, United States of America]","[Ewan McGregor, Jennifer Connelly, Dakota Fann...","[45.045, 85.93, 50.103, 29.583, 10.107, 7.021,...","[Nilo Otero, Wilma Garscadden-Gahret, Richard ...","[John Romano, Philip Roth, Susan Kim, Josh Sin..."


In [21]:
# We can create a single feature vector containing all features
selected_features = [
    'Genre', 'production_companies', 'actors', 'directors', 'writers', 'production_countries', 'original_language'
]

# Handling missing values and converting to strings
data_movies[selected_features] = (
    data_movies[selected_features]
    .fillna('')
    .applymap(lambda x: ' '.join(x if isinstance(x, list) else [str(x)]))
)

# Combining selected features into a single feature vector
# combined_features keeps a comprehensive representation of each movie's characteristics
# expand the features that are lists in string
combined_features = data_movies[selected_features].agg(' '.join, axis=1)
combined_features

  .applymap(lambda x: ' '.join(x if isinstance(x, list) else [str(x)]))


Unnamed: 0,0
0,Action Crime Drama Mystery Thriller Getaway Fi...
1,"Horror Writers Studio, The Supernova LLC Frede..."
2,Action Crime Drama Thriller Traveling Picture ...
3,Action Adventure Sci-Fi Amblin Entertainment U...
4,Crime Drama TIK Films Lakeshore Entertainment ...
...,...
4317,Documentary Edward Leung Nora Lam Hong Kong cn
4318,Short Comedy Bryan Michael Nunez Yoshi Sudars...
4319,Comedy Soo Film Plus M Entertainment Hong Film...
4320,Documentary Short Nicky Maas en


In [22]:
combined_features[0]

'Action Crime Drama Mystery Thriller Getaway Films Private Limited Vinod Chopra Films Rajkumar Hirani Films Vidhu Vinod Chopra Productions Amitabh Bachchan Farhan Akhtar Aditi Rao Hydari John Abraham Neil Nitin Mukesh Manav Kaul Prakash Belawadi Murali Sharma Nishigandha Wad Avtar Gill Seema Pahwa Vaidehi Parshurami Bejoy Nambiar Abhijeet Deshpande Vikram Chandra Abhijat Joshi Rajkumar Hirani Vidhu Vinod Chopra Supriya Kelkar Vidhu Vinod Chopra Suketu Mehta Vidhu Vinod Chopra Abhijat Joshi India hi'

In [23]:
# Converting Text Data to Feature Vectors
# Creating an instance of TfidfVectorizer
vectorizer = TfidfVectorizer()

# Converting the combined features into feature vectors
feature_vectors = vectorizer.fit_transform(combined_features)

# Printing the feature vectors
print("Feature vectors:\n", feature_vectors)

Feature vectors:
   (0, 20721)	0.05303185850307838
  (0, 22064)	0.04525936776543243
  (0, 31798)	0.09380594455882828
  (0, 46482)	0.11516289535806304
  (0, 24985)	0.09853895673044955
  (0, 46632)	0.09853895673044955
  (0, 23642)	0.15361636695025135
  (0, 253)	0.20600776523997102
  (0, 8783)	0.09380594455882828
  (0, 50624)	0.08014308060813509
  (0, 12202)	0.09032441809271238
  (0, 254)	0.10978244650833022
  (0, 34178)	0.09853895673044955
  (0, 4679)	0.10596495729690582
  (0, 36873)	0.11516289535806304
  (0, 49903)	0.11516289535806304
  (0, 36476)	0.09380594455882828
  (0, 43151)	0.09032441809271238
  (0, 17910)	0.07541006843651381
  (0, 3358)	0.10978244650833022
  (0, 51106)	0.11516289535806304
  (0, 35108)	0.11516289535806304
  (0, 43797)	0.06260361902032471
  (0, 33716)	0.08600612153628291
  (0, 4693)	0.09676701923574858
  :	:
  (4321, 23557)	0.10577819441654004
  (4321, 51412)	0.13662012977858856
  (4321, 11635)	0.12146142412543877
  (4321, 45716)	0.16897516846983865
  (4321, 31890)

In [24]:
feature_vectors.shape

(4322, 53904)

## CB using the sklearn library

In [25]:
# Create the similarity matrix
# Getting the similarity scores using cosine similarity
cosine_sim = cosine_similarity(feature_vectors)
print(type(cosine_sim))

print(cosine_sim)

# Printing the shape of the similarity matrix
print("Shape of the similarity matrix:", cosine_sim.shape)

<class 'numpy.ndarray'>
[[1.         0.         0.02358518 ... 0.         0.         0.00487202]
 [0.         1.         0.02078431 ... 0.         0.0032264  0.02914208]
 [0.02358518 0.02078431 1.         ... 0.01106478 0.00180778 0.01485565]
 ...
 [0.         0.         0.01106478 ... 1.         0.         0.00204075]
 [0.         0.0032264  0.00180778 ... 0.         1.         0.00402293]
 [0.00487202 0.02914208 0.01485565 ... 0.00204075 0.00402293 1.        ]]
Shape of the similarity matrix: (4322, 4322)


In [26]:
def most_similar_word(x, l):
    """
    Finds the word in the list `l` that is most similar to the string `x` using the LCS algorithm.

    :param x: The string to compare (user's input movie title)
    :param l: List of strings to compare against (list of movie titles)
    :return: The word from the list `l` that has the highest similarity score to `x`
    """

    # Function to calculate the LCS ratio similarity between two strings
    def lcs_similarity(w1, w2):
        """
        Calculates the length of the longest common subsequence (LCS) between two strings.

        :param w1: First string (e.g., user's movie title)
        :param w2: Second string (e.g., movie title from the list)
        :return: Length of the LCS between the two strings
        """
        m = len(w1)
        n = len(w2)
        L = [[0 for x in range(n + 1)] for x in range(m + 1)]

        # Building the matrix in bottom-up way
        for i in range(m + 1):
            for j in range(n + 1):
                if i == 0 or j == 0:
                    L[i][j] = 0
                elif w1[i - 1] == w2[j - 1]:
                    L[i][j] = L[i - 1][j - 1] + 1
                else:
                    L[i][j] = max(L[i - 1][j], L[i][j - 1])

        index = L[m][n]

        lcs_algo = [""] * (index + 1)
        lcs_algo[index] = ""

        i = m
        j = n
        while i > 0 and j > 0:
            if w1[i - 1] == w2[j - 1]:
                lcs_algo[index - 1] = w1[i - 1]
                i -= 1
                j -= 1
                index -= 1
            elif L[i - 1][j] > L[i][j - 1]:
                i -= 1
            else:
                j -= 1

        return len(lcs_algo)

    # Find the word in the list with the highest similarity score
    close_match = max(l, key=lambda t: lcs_similarity(x.lower(), t.lower()))
    return close_match

In [27]:
def cb_similar_movies_recommendation(similarity_mtrx, movies, suggest_n=5):
    """
    Recommends a list of movies similar to the user's favorite movie using content-based filtering.

    :param numpy.ndarray of float similarity_mtrx: 2D matrix representing similarity scores between all movies
    :param pandas.DataFrame movies: DataFrame containing movie data, including titles
    :param int suggest_n: Number of similar movies to recommend (default is 5)
    :return: Tuple containing the list of recommended movie titles and the movie closest to the user's input
    """
    # CB Movie Recommendation System

    # Prompting the user to enter their favorite movie name
    title = input('Enter your favorite movie name: ')

    # Creating a list with all the movie titles given in the dataset
    titles = movies['title'].tolist()

    # Finding the close match for the movie name given by the user

    movie = most_similar_word(title, titles)
    print(f'The closest match in the database to your favorite movie is: {movie}')

    # Finding the index of the movie with the closest match title
    ith_movie = movies[movies['title'] == movie].index.values[0]

    # Getting a list of similar movies based on similarity scores
    sim_scores = list(enumerate(similarity_mtrx[ith_movie]))

    # Sorting the movies based on their similarity score
    sim_movies = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get top similar movies
    print('\nWe recommend you the following movies:')
    to_suggest = []
    for i in range(1, suggest_n + 1):
        ith_movie = sim_movies[i][0]
        sim_movie = movies.iloc[ith_movie]['title']
        to_suggest.append(sim_movie)
        print(f"{i}. {sim_movie}")

    return to_suggest, movie

In [28]:
recommended_similar_movies, from_movie = cb_similar_movies_recommendation(
    similarity_mtrx=cosine_sim,
    movies=data_movies,
    suggest_n=10
)

Enter your favorite movie name: Wonder wmn
The closest match in the database to your favorite movie is: Wonder Woman

We recommend you the following movies:
1. Justice League
2. 300: Rise of an Empire
3. Batman v Superman: Dawn of Justice
4. Cold Moon
5. Mission: Impossible - Rogue Nation
6. Ant-Man
7. Edge of Tomorrow
8. Suicide Squad
9. Captain America: The Winter Soldier
10. Darkest Hour


# Hybrid Recommender - Two Recommender Ensemble

## CF recommender extension for hybrid systems

In [29]:
def recommend_movies(user, model, users_watched_list, suggest_n=10):
    """
    Recommends movies to a user based on collaborative filtering (CF) predictions from a model.

    :param int user: The UserID of the user for whom the movie recommendations are to be generated.
    :param surprise.SVD model: Trained recommendation model (e.g., using collaborative filtering) that predicts user ratings for movies.
    :param pandas.DataFrame users_watched_list: DataFrame containing user-movie interactions, including 'UserID', 'id' (movie ID), and 'title'.
    :param int suggest_n: Number of top recommendations to return (default is 10).
    :return: A list of recommended movie titles for the user.
    """
    # Get a list of all the movies the user has not rated
    movies = users_watched_list['id'].unique()
    watched = users_watched_list[users_watched_list['UserID'] == user]['id'].unique()
    to_watch = list(set(movies) - set(watched))

    # Predict the ratings for the new movies
    estimated_ratings = [model.predict(user, movie) for movie in to_watch]

    # Sort the predictions by estimated rating
    estimated_ratings.sort(key=lambda i: i.est, reverse=True)

    # Get the top 10 recommendations
    top_recommendations = [
        (p.iid, users_watched_list[users_watched_list['id'] == p.iid]['title'].values[0], p.est)
        for p in estimated_ratings[:suggest_n]
    ][:suggest_n]


    return top_recommendations


In [30]:
user_rated_movies_list_print(39, movie_user_data)
print("\nFor this user we recommend:")

recommend_movies(
    user=39,
    model=algo,
    users_watched_list=movie_user_data,
    suggest_n=10
)

User 39 has rated 33 movies:
·	Interstellar	(10)
·	Focus	(10)
·	Heist	(10)
·	Pitch Perfect 2	(10)
·	Kingsman: The Secret Service	(10)
·	The Wedding Ringer	(10)
·	Ronaldo	(10)
·	Southpaw	(9)
·	Mission: Impossible - Rogue Nation	(9)
·	Bridge of Spies	(9)
·	Mad Max: Fury Road	(9)
·	Edge of Tomorrow	(9)
·	The Interview	(9)
·	Horrible Bosses 2	(8)
·	Let's Be Cops	(8)
·	The Loft	(8)
·	Circle	(8)
·	Creed	(8)
·	The Hundred-Foot Journey	(8)
·	Insurgent	(8)
·	Neighbors	(8)
·	Magic in the Moonlight	(8)
·	Captain America: The Winter Soldier	(8)
·	The Little Death	(8)
·	The Drop	(7)
·	Get Hard	(7)
·	A Million Ways to Die in the West	(7)
·	The Hunger Games: Mockingjay - Part 1	(7)
·	RoboCop	(7)
·	Insidious: Chapter 3	(6)
·	The Night Before	(6)
·	Clown	(5)
·	The Gunman	(5)

For this user we recommend:


[(297222, 'PK', 9.7842902350962),
 (277216, 'Straight Outta Compton', 9.71336865379995),
 (355020, "Winter on Fire: Ukraine's Fight for Freedom", 9.632737966291382),
 (354912, 'Coco', 9.517393373492881),
 (359940, 'Three Billboards Outside Ebbing, Missouri', 9.504201680054646),
 (264644, 'Room', 9.488982490021302),
 (273248, 'The Hateful Eight', 9.487950321332093),
 (374720, 'Dunkirk', 9.456064678498166),
 (293660, 'Deadpool', 9.44545088763464),
 (269149, 'Zootopia', 9.438927833733043)]

## CB recommender extension for hybrid systems

### Auxiliary functions for CB


In [31]:
data_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4322 entries, 0 to 4321
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Genre                 4322 non-null   object 
 1   Release_year          4322 non-null   int64  
 2   id                    4322 non-null   int64  
 3   title                 4322 non-null   object 
 4   runtime               4322 non-null   int64  
 5   original_language     4322 non-null   object 
 6   popularity            4322 non-null   float64
 7   budget                4322 non-null   int64  
 8   production_companies  4322 non-null   object 
 9   production_countries  4322 non-null   object 
 10  actors                4322 non-null   object 
 11  actors_popularity     4322 non-null   object 
 12  directors             4322 non-null   object 
 13  writers               4322 non-null   object 
dtypes: float64(1), int64(4), object(9)
memory usage: 472.8+ KB


In [32]:
# Function to get movie recommendations based on a single movie
def get_movie_recommendations_by_movie(movie, similarity_mtrx, movies, suggest_n=10):

    movies_idx = movies[movies['id'] == movie].index[0]
    # Getting a list of similar movies based on similarity scores
    similarity_score = similarity_mtrx[movies_idx]

    # Sorting the movies based on their similarity score
    sorted_similar_movies = np.argsort(similarity_score)[::-1]

    # Get top similar movies (ignoring the first one since it's the input movie)
    recommended_movies = [
        (movies.iloc[movie_index]['id'],
        movies.iloc[movie_index]['title'],
        similarity_score[movie_index])
        for movie_index in sorted_similar_movies[1:suggest_n+1]
    ]

    return recommended_movies

In [33]:
interstellar_id = data_movies.loc[data_movies['title'] =='Interstellar']['id'].values[0]
get_movie_recommendations_by_movie(interstellar_id, cosine_sim, data_movies)

[(374720, 'Dunkirk', 0.24475472136975657),
 (394692, 'Paint It Black', 0.13747680449673788),
 (336592, 'The Science of Interstellar', 0.1324836959081931),
 (395991, 'Only the Brave', 0.12499001284869053),
 (335988, 'Transformers: The Last Knight', 0.11380074790280723),
 (209112, 'Batman v Superman: Dawn of Justice', 0.10186073643266119),
 (351964, 'The Escort', 0.10138951638910704),
 (87101, 'Terminator Genisys', 0.10126362337013205),
 (293632, 'Bro, What Happened?', 0.09725095972045848),
 (241239, 'A Most Violent Year', 0.09045245130035803)]

In [34]:
def user_rated_movies_list(user, users_watched_list):
    """
    Retrieves a list of movies rated by a specific user along with their corresponding ratings.

    :param int user: The UserID of the user whose rated movies are to be retrieved
    :param pandas.DataFrame users_watched_list: DataFrame containing user ratings and watched movies, including 'UserID', 'id' (movie ID), 'title', and 'Ratings'
    :return: A tuple containing two lists - the first list consists of movie titles, and the second list contains the corresponding ratings
    """
    # Get the movies rated by the user
    movies = []

    # Get the unique movie IDs watched by the user
    watched = users_watched_list[users_watched_list['UserID'] == user]['id'].unique()

    # Retrieve movie titles and ratings for each movie the user has watched
    movies = [
        (users_watched_list[users_watched_list['id'] == movie]['id'].values[0],
         users_watched_list[users_watched_list['id'] == movie]['title'].values[0],
         users_watched_list[users_watched_list['id'] == movie]['Ratings'].values[0])
        for movie in watched
    ]

    return movies

### CB user-based recommender system

In [35]:
def cb_recommendation_user_based(movie_ids, ratings, similarity_mtrx, movies, suggest_n=10, correcting_factor=0.1):

    from collections import defaultdict

    # Aggregate recommendations for all rated movies by the user
    movie_recommendations = defaultdict(list)  # Dictionary to store all movie recommendations and scores
    weighted_scores = defaultdict(float)       # To keep track of weighted scores of recommendations
    scores = {}

    # Loop over each rated movie and its rating
    for movie_id, rating in zip(movie_ids, ratings):
        recommendations = get_movie_recommendations_by_movie(
            movie_id, similarity_mtrx, movies, suggest_n
        )  # Get top recommendations for the movie

        # Add recommendations to dictionary, adjusting score by user's rating
        # recommendations has the structure: (movie_id, movie_title, similarity_score)
        for rec_movie, _, score in recommendations:
          if rec_movie not in movie_ids:
            movie_recommendations[rec_movie].append((score, rating))

            # Count how many times the same movie is recommended
            if rec_movie not in scores:
                scores[rec_movie] = score
                weighted_scores[rec_movie] = score*rating
            else:
                scores[rec_movie] += score
                weighted_scores[rec_movie] += score*rating

    # Compute final weighted score for each movie
    final_recommendations = []

    for movie, _ in scores.items():
        avg_rating_weight = weighted_scores[movie] / (scores[movie] + correcting_factor)
        final_recommendations.append((movie, movies.loc[movies['id'] == movie]['title'].values[0], avg_rating_weight))

    # Sort the recommendations based on the final weighted score
    final_recommendations.sort(key=lambda x: x[2], reverse=True)

    # Extract the top 'k' recommendations
    top_recommended_movies = final_recommendations[:suggest_n]

    return top_recommended_movies

In [36]:
user_rated_movies_list_print(39, movie_user_data)
print("\nFor this user we recommend:")

movies = user_rated_movies_list(39, movie_user_data)
movie_ids = [id for id, _, _ in movies]
ratings = [rating for _, _, rating in movies]

cb_recommendation_user_based(
    movie_ids,
    ratings,
    cosine_sim,
    data_movies,
    suggest_n=10,
    correcting_factor=0.2
)

User 39 has rated 33 movies:
·	Interstellar	(10)
·	Focus	(10)
·	Heist	(10)
·	Pitch Perfect 2	(10)
·	Kingsman: The Secret Service	(10)
·	The Wedding Ringer	(10)
·	Ronaldo	(10)
·	Southpaw	(9)
·	Mission: Impossible - Rogue Nation	(9)
·	Bridge of Spies	(9)
·	Mad Max: Fury Road	(9)
·	Edge of Tomorrow	(9)
·	The Interview	(9)
·	Horrible Bosses 2	(8)
·	Let's Be Cops	(8)
·	The Loft	(8)
·	Circle	(8)
·	Creed	(8)
·	The Hundred-Foot Journey	(8)
·	Insurgent	(8)
·	Neighbors	(8)
·	Magic in the Moonlight	(8)
·	Captain America: The Winter Soldier	(8)
·	The Little Death	(8)
·	The Drop	(7)
·	Get Hard	(7)
·	A Million Ways to Die in the West	(7)
·	The Hunger Games: Mockingjay - Part 1	(7)
·	RoboCop	(7)
·	Insidious: Chapter 3	(6)
·	The Night Before	(6)
·	Clown	(5)
·	The Gunman	(5)

For this user we recommend:


[(343668, 'Kingsman: The Golden Circle', 6.513718312736541),
 (353616, 'Pitch Perfect 3', 5.664412343071305),
 (335988, 'Transformers: The Last Knight', 5.635867690252383),
 (325133, 'Neighbors 2: Sorority Rising', 5.559913151358221),
 (307663, 'Vice', 5.530505628748193),
 (374720, 'Dunkirk', 5.503139362207565),
 (371638, 'The Disaster Artist', 5.213163192440067),
 (131634, 'The Hunger Games: Mockingjay - Part 2', 5.198698295860009),
 (346104, 'Faberge: A Life of Its Own', 5.189940250990407),
 (87101, 'Terminator Genisys', 5.180436887591047)]

In [37]:
class ContentBasedRecommender:

    def __init__(self, flattened_URM, similarity_matrix, data_movies, correcting_factor=None):
        self.movie_user_data = movie_user_data
        self.similarity_matrix = similarity_matrix
        self.flattened_URM = flattened_URM
        self.data_movies = data_movies
        if correcting_factor is None:
            self.correcting_factor = 0.2
        else:
            self.correcting_factor = correcting_factor

    def recommend(self, user_id, k=5):
        # Get the movies rated by the user
        movie_user_indx = self.flattened_URM[self.flattened_URM['UserID'] == user_id].index

        user_rated_movies_ids = self.flattened_URM.loc[movie_user_indx]['id']
        user_rated_movies_rating = self.flattened_URM.loc[movie_user_indx]['Ratings']

        # Get the recommended movies
        user_movies = cb_recommendation_user_based(
            user_rated_movies_ids,
            user_rated_movies_rating,
            self.similarity_matrix,
            self.data_movies,
            suggest_n=k,
            correcting_factor=self.correcting_factor
        )

        movies_ids = [id for id, _, _ in user_movies]

        return movies_ids

In [38]:
cb_recommender = ContentBasedRecommender(train_set, cosine_sim, data_movies)

In [None]:
p5 = precision_at_k(
        validation_set,
        cb_recommender,
        k=5,
        relevance_threshold=5,
        users_to_monitor=100
    )

print(f"\nPrecision @ 5: {p5:.4f}")

100 users processed in 34s. Estimated time to completion: 19m 40s
200 users processed in 67s. Estimated time to completion: 18m 60s
300 users processed in 98s. Estimated time to completion: 17m 58s
400 users processed in 130s. Estimated time to completion: 17m 29s
500 users processed in 157s. Estimated time to completion: 16m 22s
600 users processed in 190s. Estimated time to completion: 15m 57s
700 users processed in 219s. Estimated time to completion: 15m 12s
800 users processed in 244s. Estimated time to completion: 14m 23s
900 users processed in 268s. Estimated time to completion: 13m 33s
1000 users processed in 292s. Estimated time to completion: 12m 45s
1100 users processed in 317s. Estimated time to completion: 12m 7s
1200 users processed in 340s. Estimated time to completion: 11m 27s
1300 users processed in 361s. Estimated time to completion: 10m 45s
1400 users processed in 385s. Estimated time to completion: 10m 12s
1500 users processed in 402s. Estimated time to completion: 9

In [None]:
p10 = precision_at_k(
        validation_set,
        cb_recommender,
        k=10,
        relevance_threshold=5,
        users_to_monitor=100
    )

print(f"\nPrecision @ 10: {p10:.4f}")

100 users processed in 51 seconds. Estimated time to completion: 30min - 15s
100 users processed in 50 seconds. Estimated time to completion: 28min - 60s
100 users processed in 47 seconds. Estimated time to completion: 27min - 25s
100 users processed in 50 seconds. Estimated time to completion: 26min - 41s
100 users processed in 42 seconds. Estimated time to completion: 25min - 1s
100 users processed in 50 seconds. Estimated time to completion: 24min - 22s
100 users processed in 44 seconds. Estimated time to completion: 23min - 17s
100 users processed in 40 seconds. Estimated time to completion: 22min - 1s
100 users processed in 38 seconds. Estimated time to completion: 20min - 49s
100 users processed in 39 seconds. Estimated time to completion: 19min - 46s
100 users processed in 38 seconds. Estimated time to completion: 18min - 44s
100 users processed in 36 seconds. Estimated time to completion: 17min - 41s
100 users processed in 33 seconds. Estimated time to completion: 16min - 39s
1

### CB function with a filtering on best rating

In [39]:
def cb_recommendation_user_based_rating_filtering(movie_ids, ratings, similarity_mtrx, movies, suggest_n=10):
    from collections import defaultdict

    # Aggregate recommendations for all rated movies by the user
    final_scores = defaultdict(float)  # To keep track of weighted scores of recommendations
    avg_final_score = defaultdict(float)
    repeated_counts = {}

    # Loop over each rated movie and its rating
    for idx, (movie, rating) in enumerate(zip(movie_ids, ratings)):
        if rating > 7:
            recommendations = get_movie_recommendations_by_movie(
                movie=movie,
                similarity_mtrx=similarity_mtrx,
                movies=movies,
                suggest_n=suggest_n * 2 # TODO: why *2?
            )

            for rec_movie, _, score in recommendations:
                if rec_movie not in movie_ids:
                    # Compute the average weighted score iteratively
                    if rec_movie not in repeated_counts:
                        repeated_counts[rec_movie] = 1
                        final_scores[rec_movie] = score
                    else:
                        repeated_counts[rec_movie] += 1
                        final_scores[rec_movie] += score

    # Compute final weighted score for each movie
    final_recommendations = []

    for movie, _ in repeated_counts.items():
        avg_final_score[movie] = final_scores[movie] / repeated_counts[movie]
        final_recommendations.append(
            (movie, movies[movies.id == movie]['title'].values[0] ,avg_final_score[movie])
        )

    # Sort the recommendations based on the final weighted score
    sorted_recommendations = sorted(final_recommendations, key=lambda x: x[2], reverse=True)

    # Extract the top 'k' recommendations
    top_recommended_movies = sorted_recommendations[:suggest_n]

    return top_recommended_movies

In [40]:
user_rated_movies_list_print(39, movie_user_data)
rated_movies = user_rated_movies_list(39, movie_user_data)
print("\nFor this user we recommend:")

rated_movies_ids = [id for id, _, _ in rated_movies]
rated_movies_ratings = [rating for _, _, rating in rated_movies]

cb_recommendation_user_based_rating_filtering(
    rated_movies_ids,
    rated_movies_ratings,
    similarity_mtrx=cosine_sim,
    movies=data_movies,
    suggest_n=10
)

User 39 has rated 33 movies:
·	Interstellar	(10)
·	Focus	(10)
·	Heist	(10)
·	Pitch Perfect 2	(10)
·	Kingsman: The Secret Service	(10)
·	The Wedding Ringer	(10)
·	Ronaldo	(10)
·	Southpaw	(9)
·	Mission: Impossible - Rogue Nation	(9)
·	Bridge of Spies	(9)
·	Mad Max: Fury Road	(9)
·	Edge of Tomorrow	(9)
·	The Interview	(9)
·	Horrible Bosses 2	(8)
·	Let's Be Cops	(8)
·	The Loft	(8)
·	Circle	(8)
·	Creed	(8)
·	The Hundred-Foot Journey	(8)
·	Insurgent	(8)
·	Neighbors	(8)
·	Magic in the Moonlight	(8)
·	Captain America: The Winter Soldier	(8)
·	The Little Death	(8)
·	The Drop	(7)
·	Get Hard	(7)
·	A Million Ways to Die in the West	(7)
·	The Hunger Games: Mockingjay - Part 1	(7)
·	RoboCop	(7)
·	Insidious: Chapter 3	(6)
·	The Night Before	(6)
·	Clown	(5)
·	The Gunman	(5)

For this user we recommend:


[(325133, 'Neighbors 2: Sorority Rising', 0.44014215147460234),
 (343668, 'Kingsman: The Golden Circle', 0.3736771091408539),
 (157350, 'Divergent', 0.30607920067038175),
 (271110, 'Captain America: Civil War', 0.29684892008733865),
 (353616, 'Pitch Perfect 3', 0.26129848091153307),
 (307663, 'Vice', 0.24747790999898805),
 (99861, 'Avengers: Age of Ultron', 0.21345123733751367),
 (102899, 'Ant-Man', 0.20782265947618886),
 (326425, 'Extraction', 0.20013095126749855),
 (296099, 'Vacation', 0.1868063153162769)]

In [41]:
class ContentBasedWithFilteringRecommender:

    def __init__(self, flattened_URM, similarity_matrix, data_movies):
        self.movie_user_data = movie_user_data
        self.similarity_matrix = similarity_matrix
        self.flattened_URM = flattened_URM
        self.data_movies = data_movies


    def recommend(self, user_id, k=5):
        # Get the movies rated by the user
        movie_user_indx = self.flattened_URM[self.flattened_URM['UserID'] == user_id].index

        user_rated_movies_ids = self.flattened_URM.loc[movie_user_indx]['id']
        user_rated_movies_rating = self.flattened_URM.loc[movie_user_indx]['Ratings']

        # Get the recommended movies
        user_movies = cb_recommendation_user_based_rating_filtering(
            user_rated_movies_ids,
            user_rated_movies_rating,
            self.similarity_matrix,
            self.data_movies,
            suggest_n=k,
        )

        movies_ids = [id for id, _, _ in user_movies]

        return movies_ids

In [42]:
cb_w_filter_recommender = ContentBasedWithFilteringRecommender(train_set, cosine_sim, data_movies)

In [43]:
cb_w_filter_recommender.recommend(39, 5)

[325133, 343668, 157350, 271110, 353616]

In [None]:
p5 = precision_at_k(
        validation_set,
        cb_w_filter_recommender,
        k=5,
        relevance_threshold=5,
        users_to_monitor=100
    )

print(f"\nPrecision @ 5: {p5:.4f}")

100 users processed in 21 seconds. Estimated time to completion: 12min - 18s
100 users processed in 20 seconds. Estimated time to completion: 11min - 34s
100 users processed in 18 seconds. Estimated time to completion: 10min - 46s
100 users processed in 20 seconds. Estimated time to completion: 10min - 30s
100 users processed in 17 seconds. Estimated time to completion: 9min - 56s
100 users processed in 19 seconds. Estimated time to completion: 9min - 37s
100 users processed in 18 seconds. Estimated time to completion: 9min - 13s
100 users processed in 17 seconds. Estimated time to completion: 8min - 46s
100 users processed in 16 seconds. Estimated time to completion: 8min - 20s
100 users processed in 16 seconds. Estimated time to completion: 7min - 55s
100 users processed in 17 seconds. Estimated time to completion: 7min - 34s
100 users processed in 15 seconds. Estimated time to completion: 7min - 10s
100 users processed in 14 seconds. Estimated time to completion: 6min - 46s
100 user

## Mixed Hybrid Recommender System

In [44]:
def ensemble_recommendation_intersection_based(user, model, flattened_URM, similarity_mtrx, movies, users_watched_list, suggest_n=10):
    """
    Recommends movies to a user by combining collaborative filtering (CF) and content-based (CB) recommendations.
    The final list of recommendations is based on the intersection of movies recommended by both methods,
    or a union of the two if no common movies are found.

    :param int user: The UserID of the user for whom movie recommendations are being generated.
    :param surprise.SVD model: Trained CF recommendation model used to predict ratings for movies.
    :param numpy.ndarray of float similarity_mtrx: 2D matrix representing similarity scores between all movies.
    :param pandas.DataFrame movies: DataFrame containing movie data, including titles.
    :param pandas.DataFrame users_watched_list: DataFrame containing user-movie interactions, including 'UserID', 'id' (movie ID), and 'title'.
    :param int suggest_n: Number of top recommendations to return (default is 10).

    :return: A tuple with three lists: 1. A list of movies recommended based on the intersection or union of CF and CB recommendations. 2.
             A List of movie titles recommended by the content-based filtering system. 3. A list of movie titles recommended by the collaborative filtering system.
    """
    cf_recommended_movies = recommend_movies(
        user=user,
        model=model,
        users_watched_list=users_watched_list,
        suggest_n=suggest_n
    )
    # Extract only the ids
    cf_suggestions = [id for id, _, _ in cf_recommended_movies]

    # Get the movies rated by the user
    rated_movie_user_indx = flattened_URM[flattened_URM['UserID'] == user].index
    rated_movies_ids = flattened_URM.loc[rated_movie_user_indx]['id']
    rated_movies_ratings = flattened_URM.loc[rated_movie_user_indx]['Ratings']

    cb_recommended_movies = cb_recommendation_user_based(
        rated_movies_ids,
        rated_movies_ratings,
        similarity_mtrx=similarity_mtrx,
        movies=movies,
        suggest_n=suggest_n
    )
    # Extract only ids
    cb_suggestions = [id for id, _, _ in cb_recommended_movies]

    common_movies = list(set(cf_suggestions) & set(cb_suggestions))
    if len(common_movies) == 0:
        common_movies = list(set(cf_suggestions + cb_suggestions))
        # Order the result by similarity score
        common_movies = sorted(
            common_movies,
            key=lambda t: get_movie_recommendations_by_movie(
                movie=t,
                similarity_mtrx=similarity_mtrx,
                movies=movies,
                suggest_n=1
            )[0][2],
            reverse=True
        )[:suggest_n]
    else:
        if len(common_movies) < suggest_n:
            # Add to the result the union of the two results sorted by similarity
            result_union = list(set(cf_suggestions + cb_suggestions))
            # Order the result by similarity score
            result_union = sorted(
                result_union,
                key=lambda t:
                get_movie_recommendations_by_movie(
                    movie=t,
                    similarity_mtrx=similarity_mtrx,
                    movies=movies,
                    suggest_n=1
                )[0][1],
                reverse=True
            )
            # If in result_union we have values that are also in result we remove them from result_union
            result_union = [item for item in result_union if item not in common_movies]
            common_movies = common_movies + result_union[:suggest_n - len(common_movies)]

    return common_movies, cb_suggestions, cf_suggestions

In [46]:
# Use the random function to run the recommendations for another user id, instead of the user 39 used for our study case and qualitative analysis
# user_id = np.random.choice(movie_user_data['UserID'].unique())
user_id = 39
result, cb, cf = ensemble_recommendation_intersection_based(
    user=user_id,
    model=algo,
    flattened_URM=flattened_URM,
    similarity_mtrx=cosine_sim,
    movies=data_movies,
    users_watched_list=movie_user_data,
    suggest_n=10
)

result_titles = [data_movies[data_movies['id'] == id]['title'].values[0] for id in result]
cb_titles = [data_movies[data_movies['id'] == id]['title'].values[0] for id in cb]
cf_titles = [data_movies[data_movies['id'] == id]['title'].values[0] for id in cf]

user_rated_movies_list_print(user=user_id, users_watched_list=movie_user_data)
print('\nWe recommend:')
print("\n".join(f'{i + 1}.\t{title}' for i, title in enumerate(result_titles)))
print("\n")
print("Given that the two recommender systems recommended:")
print('CB:')
print("\n".join(f'{i + 1}.\t{title}' for i, title in enumerate(cb_titles)))
print('\nCF:')
print("\n".join(f'{i + 1}.\t{title}' for i, title in enumerate(cf_titles)))

User 39 has rated 33 movies:
·	Interstellar	(10)
·	Focus	(10)
·	Heist	(10)
·	Pitch Perfect 2	(10)
·	Kingsman: The Secret Service	(10)
·	The Wedding Ringer	(10)
·	Ronaldo	(10)
·	Southpaw	(9)
·	Mission: Impossible - Rogue Nation	(9)
·	Bridge of Spies	(9)
·	Mad Max: Fury Road	(9)
·	Edge of Tomorrow	(9)
·	The Interview	(9)
·	Horrible Bosses 2	(8)
·	Let's Be Cops	(8)
·	The Loft	(8)
·	Circle	(8)
·	Creed	(8)
·	The Hundred-Foot Journey	(8)
·	Insurgent	(8)
·	Neighbors	(8)
·	Magic in the Moonlight	(8)
·	Captain America: The Winter Soldier	(8)
·	The Little Death	(8)
·	The Drop	(7)
·	Get Hard	(7)
·	A Million Ways to Die in the West	(7)
·	The Hunger Games: Mockingjay - Part 1	(7)
·	RoboCop	(7)
·	Insidious: Chapter 3	(6)
·	The Night Before	(6)
·	Clown	(5)
·	The Gunman	(5)

We recommend:
1.	Dunkirk
2.	PK
3.	Transformers: The Last Knight
4.	Straight Outta Compton
5.	Vice
6.	Winter on Fire: Ukraine's Fight for Freedom
7.	The Hateful Eight
8.	Extraction
9.	Pitch Perfect 3
10.	Neighbors 2: Sorority Risin

In [45]:
class EnsembleRecommender:

    def __init__(self, algo, flattened_URM, similarity_mtrx, data_movies, users_watched_list):
        self.algo = algo
        self.flattened_URM = flattened_URM
        self.similarity_mtrx = similarity_mtrx
        self.data_movies = data_movies
        self.users_watched_list = users_watched_list


    def recommend(self, user_id, k=5):
        # Get the movies rated by the user
        movie_user_indx = self.flattened_URM[self.flattened_URM['UserID'] == user_id].index

        user_rated_movies_ids = self.flattened_URM.loc[movie_user_indx]['id']
        user_rated_movies_rating = self.flattened_URM.loc[movie_user_indx]['Ratings']

        # Get the recommended movies
        result, _, _ = ensemble_recommendation_intersection_based(
            user=user_id,
            model=self.algo,
            flattened_URM=self.flattened_URM,
            similarity_mtrx=self.similarity_mtrx,
            movies=self.data_movies,
            users_watched_list=self.users_watched_list,
            suggest_n=k
        )

        return result

In [None]:
ensemble_recommender = EnsembleRecommender(algo, flattened_URM, cosine_sim, data_movies, movie_user_data)

p5 = precision_at_k(
        validation_set,
        ensemble_recommender,
        k=5,
        relevance_threshold=5,
        users_to_monitor=100
    )

print(f"\nPrecision @ 5: {p5:.4f}")

  100 users processed in  0m 48s. Estimated time to completion: 27m 51s
  200 users processed in  1m 35s. Estimated time to completion: 27m  7s
  300 users processed in  2m 19s. Estimated time to completion: 25m 36s
  400 users processed in  3m  5s. Estimated time to completion: 24m 46s
  500 users processed in  3m 44s. Estimated time to completion: 23m 21s
  600 users processed in  4m 32s. Estimated time to completion: 22m 49s
  700 users processed in  5m 15s. Estimated time to completion: 21m 53s
  800 users processed in  5m 53s. Estimated time to completion: 20m 45s
  900 users processed in  6m 27s. Estimated time to completion: 19m 33s
 1000 users processed in  7m  4s. Estimated time to completion: 18m 33s
 1100 users processed in  7m 39s. Estimated time to completion: 17m 33s
 1200 users processed in  8m 13s. Estimated time to completion: 16m 37s
 1300 users processed in  8m 48s. Estimated time to completion: 15m 44s
 1400 users processed in  9m 23s. Estimated time to completion: 

## Meta-level Hybrid Recommendation System

In [47]:
def ensemble_recommendation_meta_level(user, model, similarity_mtrx, users_watched_list, movies, suggest_n=10):
    """
    Provides movie recommendations to a user using a meta-level ensemble approach that combines collaborative filtering
    (CF) recommendations with content-based (CB) recommendations.

    :param int user: The UserID of the user for whom recommendations are to be generated.
    :param surprise.SVD model: Trained collaborative filtering recommendation model used to predict ratings for movies.
    :param numpy.ndarray of float similarity_mtrx: 2D matrix representing similarity scores between all movies for content-based recommendations.
    :param pandas.DataFrame users_watched_list: DataFrame containing user-movie interactions, including 'UserID', 'id' (movie ID), and 'title'.
    :param pandas.DataFrame movies: DataFrame containing movie information, including titles.
    :param int suggest_n: Number of top recommendations to return (default is 10).

    :return: A list of tuples where each tuple contains a recommended movie title and its associated score, sorted by score in descending order.
    """
    ids = []
    titles = []
    scores = []

    cf_recs = recommend_movies(
        user=user,
        model=model,
        users_watched_list=users_watched_list,
        suggest_n=suggest_n
    )

    for id, _, _ in cf_recs:
        cb_recs = get_movie_recommendations_by_movie(
            movie=id,
            similarity_mtrx=similarity_mtrx,
            movies=movies,
            suggest_n=suggest_n
        )

        for cb_id, title, score in cb_recs:
            if cb_id not in ids:
                ids.append(cb_id)
                scores.append(score)
                titles.append(title)

    # Zip together scores and results
    recommendations = list(zip(ids, titles, scores))
    # Sort results_scores by score in descending order
    recommendations = sorted(recommendations, key=lambda x: x[2], reverse=True)

    return recommendations[:suggest_n]

In [48]:
user_rated_movies_list_print(user=user_id, users_watched_list=movie_user_data)
print('\nWe recommend:')

ensemble_recommendation_meta_level(
    user=user_id,
    model=algo,
    similarity_mtrx=cosine_sim,
    users_watched_list=movie_user_data,
    movies=data_movies,
    suggest_n=10
)

User 39 has rated 33 movies:
·	Interstellar	(10)
·	Focus	(10)
·	Heist	(10)
·	Pitch Perfect 2	(10)
·	Kingsman: The Secret Service	(10)
·	The Wedding Ringer	(10)
·	Ronaldo	(10)
·	Southpaw	(9)
·	Mission: Impossible - Rogue Nation	(9)
·	Bridge of Spies	(9)
·	Mad Max: Fury Road	(9)
·	Edge of Tomorrow	(9)
·	The Interview	(9)
·	Horrible Bosses 2	(8)
·	Let's Be Cops	(8)
·	The Loft	(8)
·	Circle	(8)
·	Creed	(8)
·	The Hundred-Foot Journey	(8)
·	Insurgent	(8)
·	Neighbors	(8)
·	Magic in the Moonlight	(8)
·	Captain America: The Winter Soldier	(8)
·	The Little Death	(8)
·	The Drop	(7)
·	Get Hard	(7)
·	A Million Ways to Die in the West	(7)
·	The Hunger Games: Mockingjay - Part 1	(7)
·	RoboCop	(7)
·	Insidious: Chapter 3	(6)
·	The Night Before	(6)
·	Clown	(5)
·	The Gunman	(5)

We recommend:


[(275269, 'Wazir', 0.39740042729302605),
 (558144, 'Deadpool: No Good Deed', 0.33325087328022407),
 (157336, 'Interstellar', 0.24475472136975657),
 (293299, 'Feast', 0.2417258056944418),
 (346034, '3 Idiotas', 0.22137579366342477),
 (177572, 'Big Hero 6', 0.20398902567395402),
 (286192, 'Lava', 0.19678050686341975),
 (399106, 'Piper', 0.15171637657727324),
 (421851, 'Michael Moore in TrumpLand', 0.15122778313012983),
 (157832, 'Calvary', 0.15113152172506605)]

In [48]:
class MetaEnsembleRecommender:

    def __init__(self, algo, similarity_mtrx, data_movies, users_watched_list):
        self.algo = algo
        self.flattened_URM = flattened_URM
        self.similarity_mtrx = similarity_mtrx
        self.data_movies = data_movies
        self.users_watched_list = users_watched_list


    def recommend(self, user_id, k=5):
        # Get the movies rated by the user
        movie_user_indx = self.flattened_URM[self.flattened_URM['UserID'] == user_id].index

        user_rated_movies_ids = self.flattened_URM.loc[movie_user_indx]['id']
        user_rated_movies_rating = self.flattened_URM.loc[movie_user_indx]['Ratings']

        # Get the recommended movies
        result = ensemble_recommendation_meta_level(
            user=user_id,
            model=self.algo,
            similarity_mtrx=self.similarity_mtrx,
            movies=self.data_movies,
            users_watched_list=self.users_watched_list,
            suggest_n=k
        )

        ids = [id for id, _, _ in result]
        return ids

In [49]:
meta_ensemble_recommender = MetaEnsembleRecommender(algo, cosine_sim, data_movies, movie_user_data)

In [None]:
p5 = precision_at_k(
        validation_set,
        meta_ensemble_recommender,
        k=5,
        relevance_threshold=5,
        users_to_monitor=100
    )

print(f"\nPrecision @ 5: {p5:.4f}")

  100 users processed in  0m  7s. Estimated time to completion:  3m 47s
  200 users processed in  0m 13s. Estimated time to completion:  3m 44s
  300 users processed in  0m 19s. Estimated time to completion:  3m 25s
  400 users processed in  0m 24s. Estimated time to completion:  3m 13s
  500 users processed in  0m 31s. Estimated time to completion:  3m 11s
  600 users processed in  0m 36s. Estimated time to completion:  3m  1s
  700 users processed in  0m 43s. Estimated time to completion:  2m 58s
  800 users processed in  0m 48s. Estimated time to completion:  2m 50s
  900 users processed in  0m 54s. Estimated time to completion:  2m 45s
 1000 users processed in  1m  0s. Estimated time to completion:  2m 39s
 1100 users processed in  1m  6s. Estimated time to completion:  2m 31s
 1200 users processed in  1m 13s. Estimated time to completion:  2m 27s
 1300 users processed in  1m 18s. Estimated time to completion:  2m 20s
 1400 users processed in  1m 25s. Estimated time to completion: 

## Hybrid recommender - changing CB adding overview information

Extract the overview from the dataset and create embeddings of the overview, to use it as a feature for the movie similarity. Let's see if this creates a more accurate model.

In [49]:
# We can create a single feature vector containing all features
selected_features = [
    'Genre', 'production_companies', 'actors', 'directors', 'writers', 'production_countries', 'original_language'
]

# Handling missing values and converting to strings
data_movies[selected_features] = (
    data_movies[selected_features]
    .fillna('')
    .applymap(lambda x: ' '.join(x if isinstance(x, list) else [str(x)]))
)

# Combining selected features into a single feature vector
# combined_features keeps a comprehensive representation of each movie's characteristics
# expand the features that are lists in string
combined_features_ovw = data_movies[selected_features].agg(' '.join, axis=1)

# Add movies_attributes['overview'] to combined_features
movie_attributes['overview'] = movie_attributes['overview'].fillna('')
movie_attributes['overview'] = movie_attributes['overview'].apply(lambda x: x if isinstance(x, list) else [str(x)])
movie_attributes['overview'] = movie_attributes['overview'].apply(lambda x: ' '.join(x))

combined_features_ovw = combined_features_ovw + ' ' + movie_attributes['overview']
# Drop rows with NaN values
combined_features_ovw = combined_features_ovw.dropna()
# Convert to lowercase
combined_features_ovw = combined_features_ovw.str.lower()

  .applymap(lambda x: ' '.join(x if isinstance(x, list) else [str(x)]))


In [50]:
combined_features_ovw[0]

"action crime drama mystery thriller getaway films private limited vinod chopra films rajkumar hirani films vidhu vinod chopra productions amitabh bachchan farhan akhtar aditi rao hydari john abraham neil nitin mukesh manav kaul prakash belawadi murali sharma nishigandha wad avtar gill seema pahwa vaidehi parshurami bejoy nambiar abhijeet deshpande vikram chandra abhijat joshi rajkumar hirani vidhu vinod chopra supriya kelkar vidhu vinod chopra suketu mehta vidhu vinod chopra abhijat joshi india hi 'wazir' is a tale of two unlikely friends, a wheelchair-bound chess grandmaster and a brave ats officer. brought together by grief and a strange twist of fate, the two men decide to help each other win the biggest games of their lives. but there's a mysterious, dangerous opponent lurking in the shadows, who is all set to checkmate them."

In [51]:
# Converting Text Data to Feature Vectors
# Converting the combined features into feature vectors
feature_vectors_ovw = vectorizer.fit_transform(combined_features_ovw)

feature_vectors_ovw.shape

(4322, 66974)

In [52]:
# Create the similarity matrix
# Getting the similarity scores using cosine similarity
cosine_sim_ovw = cosine_similarity(feature_vectors_ovw)

recommended_similar_movies_ovw, from_movie_ovw = cb_similar_movies_recommendation(
    similarity_mtrx=cosine_sim_ovw,
    movies=data_movies,
    suggest_n=10
)

Enter your favorite movie name: Wonder womn
The closest match in the database to your favorite movie is: Wonder Woman

We recommend you the following movies:
1. Justice League
2. Batman v Superman: Dawn of Justice
3. 300: Rise of an Empire
4. Ant-Man
5. Darkest Hour
6. Suicide Squad
7. Edge of Tomorrow
8. Mission: Impossible - Rogue Nation
9. Big Eyes
10. Captain America: The Winter Soldier


In [53]:
user_rated_movies_list_print(user=user_id, users_watched_list=movie_user_data)
print('\nWe recommend:')

ensemble_recommendation_meta_level(
    user=user_id,
    model=algo,
    similarity_mtrx=cosine_sim_ovw,
    users_watched_list=movie_user_data,
    movies=data_movies,
    suggest_n=10
)

User 39 has rated 33 movies:
·	Interstellar	(10)
·	Focus	(10)
·	Heist	(10)
·	Pitch Perfect 2	(10)
·	Kingsman: The Secret Service	(10)
·	The Wedding Ringer	(10)
·	Ronaldo	(10)
·	Southpaw	(9)
·	Mission: Impossible - Rogue Nation	(9)
·	Bridge of Spies	(9)
·	Mad Max: Fury Road	(9)
·	Edge of Tomorrow	(9)
·	The Interview	(9)
·	Horrible Bosses 2	(8)
·	Let's Be Cops	(8)
·	The Loft	(8)
·	Circle	(8)
·	Creed	(8)
·	The Hundred-Foot Journey	(8)
·	Insurgent	(8)
·	Neighbors	(8)
·	Magic in the Moonlight	(8)
·	Captain America: The Winter Soldier	(8)
·	The Little Death	(8)
·	The Drop	(7)
·	Get Hard	(7)
·	A Million Ways to Die in the West	(7)
·	The Hunger Games: Mockingjay - Part 1	(7)
·	RoboCop	(7)
·	Insidious: Chapter 3	(6)
·	The Night Before	(6)
·	Clown	(5)
·	The Gunman	(5)

We recommend:


[(275269, 'Wazir', 0.356045570773571),
 (346034, '3 Idiotas', 0.21258961822329492),
 (157336, 'Interstellar', 0.20960563057062434),
 (293299, 'Feast', 0.1972410954335849),
 (177572, 'Big Hero 6', 0.18320183711168878),
 (381044, 'The Bandit', 0.1804403600580155),
 (286192, 'Lava', 0.152760273325752),
 (157832, 'Calvary', 0.14398835064860785),
 (323426, 'Shamitabh', 0.13561517112776328),
 (331392, 'Eloise', 0.13253358129759285)]

In [54]:
result, cb, cf = ensemble_recommendation_intersection_based(
    user=user_id,
    model=algo,
    flattened_URM=flattened_URM,
    similarity_mtrx=cosine_sim_ovw,
    movies=data_movies,
    users_watched_list=movie_user_data,
    suggest_n=10
)

result_titles = [data_movies[data_movies['id'] == id]['title'].values[0] for id in result]
cb_titles = [data_movies[data_movies['id'] == id]['title'].values[0] for id in cb]
cf_titles = [data_movies[data_movies['id'] == id]['title'].values[0] for id in cf]

user_rated_movies_list_print(user=user_id, users_watched_list=movie_user_data)
print('\nWe recommend:')
print("\n".join(f'{i + 1}.\t{title}' for i, title in enumerate(result_titles)))
print("\n")
print("Given that the two recommender systems recommended:")
print('CB:')
print("\n".join(f'{i + 1}.\t{title}' for i, title in enumerate(cb_titles)))
print('\nCF:')
print("\n".join(f'{i + 1}.\t{title}' for i, title in enumerate(cf_titles)))

User 39 has rated 33 movies:
·	Interstellar	(10)
·	Focus	(10)
·	Heist	(10)
·	Pitch Perfect 2	(10)
·	Kingsman: The Secret Service	(10)
·	The Wedding Ringer	(10)
·	Ronaldo	(10)
·	Southpaw	(9)
·	Mission: Impossible - Rogue Nation	(9)
·	Bridge of Spies	(9)
·	Mad Max: Fury Road	(9)
·	Edge of Tomorrow	(9)
·	The Interview	(9)
·	Horrible Bosses 2	(8)
·	Let's Be Cops	(8)
·	The Loft	(8)
·	Circle	(8)
·	Creed	(8)
·	The Hundred-Foot Journey	(8)
·	Insurgent	(8)
·	Neighbors	(8)
·	Magic in the Moonlight	(8)
·	Captain America: The Winter Soldier	(8)
·	The Little Death	(8)
·	The Drop	(7)
·	Get Hard	(7)
·	A Million Ways to Die in the West	(7)
·	The Hunger Games: Mockingjay - Part 1	(7)
·	RoboCop	(7)
·	Insidious: Chapter 3	(6)
·	The Night Before	(6)
·	Clown	(5)
·	The Gunman	(5)

We recommend:
1.	Dunkirk
2.	PK
3.	Transformers: The Last Knight
4.	Straight Outta Compton
5.	Vice
6.	Winter on Fire: Ukraine's Fight for Freedom
7.	Three Billboards Outside Ebbing, Missouri
8.	Extraction
9.	Pitch Perfect 3
10.	Pad

In [None]:
ensemble_recommender = EnsembleRecommender(algo, flattened_URM, cosine_sim_ovw, data_movies, movie_user_data)

p5 = precision_at_k(
        validation_set,
        ensemble_recommender,
        k=5,
        relevance_threshold=5,
        users_to_monitor=100
    )

print(f"\nPrecision @ 5: {p5:.4f}")

  100 users processed in  0m 47s. Estimated time to completion: 27m  5s
  200 users processed in  1m 35s. Estimated time to completion: 26m 52s
  300 users processed in  2m 18s. Estimated time to completion: 25m 26s
  400 users processed in  3m  4s. Estimated time to completion: 24m 40s
  500 users processed in  3m 43s. Estimated time to completion: 23m 13s
  600 users processed in  4m 30s. Estimated time to completion: 22m 42s
  700 users processed in  5m 13s. Estimated time to completion: 21m 46s
  800 users processed in  5m 52s. Estimated time to completion: 20m 42s
  900 users processed in  6m 28s. Estimated time to completion: 19m 35s
 1000 users processed in  7m  3s. Estimated time to completion: 18m 31s
 1100 users processed in  7m 40s. Estimated time to completion: 17m 36s
 1200 users processed in  8m 15s. Estimated time to completion: 16m 41s
 1300 users processed in  8m 47s. Estimated time to completion: 15m 43s
 1400 users processed in  9m 23s. Estimated time to completion: 

## Get the ground-truth to evaluate the model

### Groundtruth for all movies

In [None]:
# Download ground-truth - the movies that should actually be recommended given a certain movie, according to TMDb
def download_ground_truth(key, users_watched_list):
    import json
    import time
    import os.path

    if not os.path.exists(ground_truth_path):
        wait_time_s = 1
        all_data = {}

        # Get from movie_user_data the id of the movies, and create a set of movie ids, without repetitions
        movie_list = list(set(users_watched_list['id']))

        for i in range(0, len(movie_list)):
            ith_movie = movie_list[i]

            # Set headers
            headers = {"accept": "application/json"}

            # Build URL
            url = f"https://api.themoviedb.org/3/movie/{ith_movie}/recommendations?api_key={key}"

            # Send API Call
            time.sleep(0.01)
            response = requests.get(url, headers)

            if response.status_code == 200:
                # store the Json data in a list:
                all_data[ith_movie] = response.json()
                print(f'Got: {ith_movie}')
            else:
                print(f'Error: {response.status_code}')

            time.sleep(wait_time_s)

        # write the list to file
        with open(ground_truth_path, 'w') as f_out:
            json.dump(all_data, f_out, indent=4)
        print('Done')
    else:
        print('File already exists')

In [None]:
download_ground_truth(api_key, users_watched_list=movie_user_data)

File already exists


In [None]:
ground_truth = pd.read_json(ground_truth_path).T

In [None]:
ground_truth_id = ground_truth.index
# Extract the values from results
ground_truth = pd.json_normalize(ground_truth['results'])
ground_truth['id'] = ground_truth_id

In [None]:
def get_ground_truth(attribute, expected_rec):
    """
    Extracts and compiles the ground truth attribute (e.g., 'id') from a DataFrame of expected recommendations.

    :param str attribute: The attribute to be extracted from each dictionary in the columns (e.g., 'id').
    :param pandas.DataFrame expected_rec: A DataFrame containing expected recommendations, where each column may contain dictionaries with the desired attribute.

    :return: A Series with a compressed list of the extracted attribute (e.g., 'id') for each row.
    """
    # Initialize an empty DataFrame to store the extracted attribute
    expected_rec_attribute = pd.DataFrame()

    # Iterate through each column, except 'id', to extract the desired attribute
    for i in range(0, len(expected_rec.drop('id', axis=1).columns)):
        # Check if the value is a dictionary and extract the attribute (e.g., 'id')
        expected_rec_attribute[i] = expected_rec[i].apply(lambda x: str(x[attribute]) if isinstance(x, dict) else None)

    # Compress the attribute columns into a single column by combining non-null values into lists
    expected_rec_attribute[attribute] = expected_rec_attribute.apply(lambda x: list(x.dropna()), axis=1)

    # Drop all the other columns and keep only the compressed attribute column
    expected_rec_attribute = expected_rec_attribute[attribute]

    # Return the final Series containing the ground truth data
    return expected_rec_attribute

In [None]:
ground_truth_id = get_ground_truth('id', ground_truth)
ground_truth_title = get_ground_truth('title', ground_truth)
ground_truth_release_date = get_ground_truth('release_date', ground_truth)

ground_truth_rec = pd.concat([ground_truth_id, ground_truth_title, ground_truth_release_date], axis=1)
ground_truth_rec['movie_id'] = ground_truth['id']

ground_truth_rec

Unnamed: 0,id,title,release_date,movie_id
0,"[9890, 332210, 839, 340666, 76203, 24428, 1573...","[The Stepford Wives, Storks, Duel, Nocturnal A...","[2004-06-10, 2016-09-22, 1971-11-13, 2016-11-0...",434178
1,"[660360, 264859, 811948, 762441, 1115657, 1211...","[Noryang: Deadly Sea, Six Swedish Girls in Alp...","[2023-12-20, 1983-07-22, 2021-12-01, 2024-06-2...",344066
2,"[413778, 478804, 601493, 773569, 340048, 45679...","[Lost in Paris, The Bold, the Corrupt and the ...","[2017-01-14, 2017-11-24, 2019-09-30, 2021-01-0...",475149
3,"[322125, 143567, 173185, 190880, 21861, 221667...","[A Perfect Man, Just Like Brothers, It Boy, Me...","[2015-03-18, 2012-11-21, 2013-03-06, 2013-11-2...",245775
4,"[542559, 326434, 298821, 10398, 18405, 10734, ...","[Last Summer, Manny Lewis, Fatal Acquittal, Do...","[2018-06-07, 2015-03-12, 2014-08-03, 1999-09-2...",352275
...,...,...,...,...
4317,"[260202, 334538, 263341, 76544, 252178, 424488...","[Turks & Caicos, Into the Forest, Crouching Ti...","[2014-03-20, 2016-06-03, 2016-02-18, 2013-07-0...",286709
4318,"[137321, 44458, 74639, 170657, 67793, 574894, ...","[Winter's Tale, Perhaps Love, Overheard 2, Jou...","[2014-02-13, 2005-12-01, 2011-08-18, 2013-02-0...",262137
4319,"[423093, 431185, 437, 157820, 193726, 449684, ...","[7 años, He's Out There, Cube 2: Hypercube, Mo...","[2016-10-28, 2018-10-25, 2002-04-15, 2013-01-1...",335866
4320,"[46845, 88870, 160118, 74544, 64124, 30301, 31...","[Man in the Wilderness, The Making of 'Psycho'...","[1971-11-24, 2005-10-26, 2013-03-08, 1967-02-0...",311291


### Evaluate the models based on the ground-truth

In [None]:
# Print the results of the CB recommender system for the chosen movie
print(f'For {from_movie}\n\nWe recommend:')

for i in range(len(recommended_similar_movies)):
    print(f"{i + 1}. {recommended_similar_movies[i]}")

For Interstellar

We recommend:
1. Dunkirk
2. Paint It Black
3. The Science of Interstellar
4. Only the Brave
5. Transformers: The Last Knight
6. Batman v Superman: Dawn of Justice
7. The Escort
8. Terminator Genisys
9. Bro, What Happened?
10. A Most Violent Year


In [None]:
movie_id = data_movies[data_movies['title'] == from_movie]['id'].values[0]

# From ground_truth_rec get the row with id as movie_id
truth = ground_truth_rec[ground_truth_rec['movie_id'] == movie_id]

In [None]:
# Extract the truth['title'] values as a NumPy array and flatten it
truth_list = truth['title'].values.flatten()

# Extract only the values inside the list
truth_list = [item for sublist in truth_list for item in sublist]

# Remove the appendices from each title of the truth_list
truth_list = [title.split(' (')[0] for title in truth_list]

truth_list

['Inception',
 'Guardians of the Galaxy',
 'The Martian',
 'Predestination',
 'The Dark Knight',
 'Gone Girl',
 'Back to the Future',
 'Mad Max: Fury Road',
 'Gravity',
 'Big Hero 6',
 'Fight Club',
 'Dunkirk',
 'The Imitation Game',
 'Star Wars',
 'Jurassic World',
 'John Wick',
 'Nightcrawler',
 'Deadpool',
 'The Hunger Games: Mockingjay - Part 1',
 'The Prestige']

In [None]:
# Find how many matches we have between recommended_similar_movies and the elements of the list of truth['title']
matches = [title for title in recommended_similar_movies if title in truth_list]
count = len(matches)

print(f'Our model recommended {count} movies from the ground-truth.')
print(matches)  # With Interstellar we have a match

Our model recommended 1 movies from the ground-truth.
['Dunkirk']
