## **📥 Environment Setup and Imports**


In [125]:
import requests
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error

## **🔍 Load and Explore Data**

In [None]:
# Get the films metadata
films_metadata = pd.read_csv('/content/Films_metadata.csv').drop(columns=['Unnamed: 0'])

In [None]:
# Observe the films metadata
films_metadata.head()

Unnamed: 0,movieID,title,genres,imdb_link,tmdb_link,users_avg_ratings_to_movie
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,https://www.imdb.com/title/tt0114709/,https://www.themoviedb.org/movie/862/,3.92093
1,2,Jumanji (1995),Adventure|Children|Fantasy,https://www.imdb.com/title/tt0113497/,https://www.themoviedb.org/movie/8844/,3.431818
2,3,Grumpier Old Men (1995),Comedy|Romance,https://www.imdb.com/title/tt0113228/,https://www.themoviedb.org/movie/15602/,3.259615
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,https://www.imdb.com/title/tt0114885/,https://www.themoviedb.org/movie/31357/,2.357143
4,5,Father of the Bride Part II (1995),Comedy,https://www.imdb.com/title/tt0113041/,https://www.themoviedb.org/movie/11862/,3.071429


In [None]:
# Get the Encoded Genres
genres_encoded = pd.read_csv('/content/Genres_encoded.csv').rename(columns={'Unnamed: 0':'movieId' })

In [None]:
# Observe the Encoded Genres
genres_encoded.head()

Unnamed: 0,movieId,Mystery,Action,IMAX,Adventure,Sci-Fi,War,Thriller,Western,Crime,...,Romance,Horror,Documentary,Musical,Comedy,Drama,(no genres listed),Fantasy,Children,Animation
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,1
1,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,2,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,1,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [None]:
# Get the Links of movies
links_data = pd.read_csv('/content/links.csv')

In [None]:
# Observe the Links of movies
links_data.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [None]:
# Get General Movie Data
movies_data = pd.read_csv('/content/movies.csv')

In [None]:
# Observe General Movie Data
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Get User-Movie Interaction Data
ratings_data = pd.read_csv('/content/ratings.csv')

In [None]:
# Observe User-Movie Interaction Data (ratings)
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Get User-Movie Interaction Data (tags)
tags_data = pd.read_csv('/content/tags.csv')

In [None]:
# Observe User-Movie Interaction Data (tags)
tags_data.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [None]:
# Observe the total number of movies we have
print('Number of movies we have:', links_data.shape[0])

Number of movies we have: 9742


## 🛠️ Functions

In [None]:
def get_features(movies):

    """
    Fetches various movie metadata features from the OMDb API for each movie in the given DataFrame.

    Args:
        movies (pd.DataFrame): A DataFrame containing at least a column 'imdbId' with IMDb IDs of movies.

    Returns:
        tuple: A tuple containing the following lists in order:
            - movie_id (pd.Series): The IMDb IDs of the movies.
            - years (list): Release years of the movies.
            - writers (list): Writers of the movies.
            - directors (list): Directors of the movies.
            - actors (list): Main actors of the movies.
            - languages (list): Languages of the movies.
            - countries (list): Countries of production.
            - box_office (list): Box office revenue data.
    """


    # Define the OMDb API keys
    api_keys = ["e3bb48da", "27f8364d", "370129a7", "d913a6c1", "f6b351eb", "a887e6bf", "c33453b3", "12c7997", "c056cd17", "5cdfcf53", "888be72f"]
    api_idx = 0
    api_key = api_keys[api_idx]

    # Extract the IMDb IDs from the DataFrame
    movies_id = movies['imdbId']

    # Initialize lists to store fetched movie features
    years, box_office = [], []
    countries, languages = [], []
    directors, actors, writers = [], [], []


    for movie_id in movies_id:

        # Check how many zeroes we need in our url to fetch the movie's data
        req_padding = 7 - len(str(movie_id))

        # Perform a GET request to OMDb API
        url = f"https://www.omdbapi.com/?i=tt{'0'*req_padding}{movie_id}&apikey={api_key}"
        get_response = requests.get(url)


        # If we exceeded the maximum usage for the current API Key
        if get_response.status_code == 401:

              print(f"Alternative API Key Used")

              api_idx += 1
              api_key = api_keys[api_idx]

              # Retry the request with the new API key
              url = f"https://www.omdbapi.com/?i=tt{'0'*req_padding}{movie_id}&apikey={api_key}"
              get_response = requests.get(url)


        # Check if the request was successful
        if get_response.status_code == 200:

            try:
                # Try to access movie data
                data = get_response.json()
                if data.get("Response") == "False":  # API error (e.g., incorrect IMDb ID)

                    print(f"OMDb API Error For Movie {movie_id}: {data.get('Error')}")
                    continue

                # Extract Movie related data from the json response
                years.append(data.get("Year", "Year not found"))

                directors.append(data.get("Director", "Director not found"))
                actors.append(data.get("Actors", "Actors not found"))
                writers.append(data.get("Writer", "Writer not found"))

                languages.append(data.get("Language", "Language not found"))
                countries.append(data.get("Country", "Country not found"))

                box_office.append(data.get("Box Office", "Box Office not found"))



            except requests.exceptions.JSONDecodeError:
                print(f"Error decoding JSON for movie ID: {movie_id}")

        # If we encountered a bad response, then warn me
        else:
              print(f"Error fetching movie {movie_id}: HTTP {get_response.status_code}")


    return movies_id, years, writers, directors, actors, languages, countries, box_office

In [None]:
def fill_missing(missing_ids, api_key, movies_id, years, writers, directors, actors, languages, countries, box_office):

    """
    Fills in missing metadata for specific movies using the OMDb API.

    For each movie ID in `missing_ids`, this function fetches the movie data from the OMDb API
    using the correct IMDb format (`tt0000000`). Then it inserts the retrieved information into
    the appropriate lists at the correct index, maintaining alignment with the main movie dataset.

    Args:
      missing_ids (list[int]): List of IMDb-style movie numeric IDs missing metadata.
      api_key (str): Valid OMDb API key for authentication.
      movies_id (list[int]): Complete list of movie IDs corresponding to dataset order.
      years (list): List to insert retrieved 'Year' values.
      writers (list): List to insert retrieved 'Writer' values.
      directors (list): List to insert retrieved 'Director' values.
      actors (list): List to insert retrieved 'Actors' values.
      languages (list): List to insert retrieved 'Language' values.
      countries (list): List to insert retrieved 'Country' values.
      box_office (list): List to insert retrieved 'Box Office' values.

    Returns:
      None: The passed-in lists are modified in place with missing movie metadata.
    """



    # Iterate over all movie IDs with missing information, i.e Get Data for missing films
    for movie_id in missing_ids:

        # Check how many zeroes we need in our url to fetch the movie's data
        req_padding = 7 - len(str(movie_id))

        # Get the url of the movie's data
        url = f"https://www.omdbapi.com/?i=tt{'0'*req_padding}{movie_id}&apikey={api_key}"
        get_response = requests.get(url)

        data = get_response.json()

        # Get the correct position to insert the missing values
        idx = list(movies_id).index(movie_id)

        # Insert retrieved metadata at the correct index (or placeholder if not found)
        years.insert(idx, data.get("Year", "Year not found"))

        writers.insert(idx, data.get("Writer", "Writer not found"))
        directors.insert(idx, data.get("Director", "Director not found"))
        actors.insert(idx, data.get("Actors", "Actors not found"))

        languages.insert(idx, data.get("Language", "Language not found"))
        countries.insert(idx, data.get("Country", "Country not found"))
        box_office.insert(idx, data.get("Box Office", "Box Office not found"))

In [None]:
def multi_ohe(df, column_name):

    """
    Performs multi-label one-hot encoding on a column with comma-separated string values.

    This is useful for columns where each row may contain multiple categorical labels
    (e.g. "Action, Adventure, Comedy"). The function splits these values, performs one-hot
    encoding, and returns a DataFrame with binary indicators for each unique category.

    Args:
        df (pd.DataFrame): The input DataFrame containing the multi-label column.
        column_name (str): The name of the column to apply one-hot encoding to.

    Returns:
        pd.DataFrame: A DataFrame where each row corresponds to the original row index,
                      and each column is a binary feature for a distinct category in the column.
    """

    # Split values on commas and strip spaces
    exploded = df[column_name].dropna().str.split(',').explode().str.strip()

    # Perform one-hot encoding on the exploded values
    ohe_df = pd.get_dummies(exploded)

    # Restore the original row index to group all values belonging to the same row
    ohe_df[column_name + "_index"] = exploded.index

    # Group by original row index and sum to combine multi-label encodings per row
    result = ohe_df.groupby(column_name + "_index").sum()

    return result

In [None]:
def extract_first_year(year_str):

    """
    Extracts the first year from a string that may contain a year range (e.g. '1989–1990').

    If the input is not in a valid format, returns None.

    Args:
        year_str (str): A string representing a year or year range (e.g. '1994' or '1989–1990').

    Returns:
        int or None: The first year as an integer, or None if parsing fails.
    """

    try:
        # Convert to string, split on the en dash, and take the first part as the starting year
        return int(str(year_str).split('–')[0])

    except:
        # In case of any unexpected format or conversion error
        return None

## 👷‍♀️ Feature Engineering

In [None]:
# Get features associated with each movie
movies_id, years, writers, directors, actors, languages, countries, box_office = get_features(links_data)

Alternative API Key Used
Alternative API Key Used
Alternative API Key Used
Alternative API Key Used
Alternative API Key Used
Alternative API Key Used
Alternative API Key Used
Alternative API Key Used
Alternative API Key Used


In [None]:
# Get the total number of movies
print('Number of Movies:', len(movies_id))

# Assertions to ensure alignment of movie-related lists
assert len(movies_id) == len(years), "Mismatch: movies_id and years"
assert len(movies_id) == len(languages), "Mismatch: movies_id and languages"
assert len(movies_id) == len(countries), "Mismatch: movies_id and countries"
assert len(movies_id) == len(box_office),    "Mismatch: movies_id and box_office"
assert len(movies_id) == len(writers),    "Mismatch: movies_id and writers"
assert len(movies_id) == len(directors),    "Mismatch: movies_id and directors"
assert len(movies_id) == len(actors),    "Mismatch: movies_id and actors"

Number of Movies: 9742


In [None]:
# Convert the features to a dataframe
features_df = pd.DataFrame({
                            "imdbId": movies_id,
                            "years": years,
                            "writers": writers,
                            "directors": directors,
                            "actors": actors,
                            "country": countries,
                            "language": languages,
                            "boxOffice": box_office
                          })

In [None]:
# Visualize the dataframe
features_df.head()

Unnamed: 0,imdbId,years,writers,directors,actors,country,language,boxOffice
0,114709,1995,"John Lasseter, Pete Docter, Andrew Stanton",John Lasseter,"Tom Hanks, Tim Allen, Don Rickles",United States,English,Box Office not found
1,113497,1995,"Jonathan Hensleigh, Greg Taylor, Jim Strain",Joe Johnston,"Robin Williams, Kirsten Dunst, Bonnie Hunt",United States,"English, French",Box Office not found
2,113228,1995,Mark Steven Johnson,Howard Deutch,"Walter Matthau, Jack Lemmon, Ann-Margret",United States,"English, Italian, German",Box Office not found
3,114885,1995,"Terry McMillan, Ron Bass",Forest Whitaker,"Whitney Houston, Angela Bassett, Loretta Devine",United States,English,Box Office not found
4,113041,1995,"Albert Hackett, Frances Goodrich, Nancy Meyers",Charles Shyer,"Steve Martin, Diane Keaton, Martin Short",United States,English,Box Office not found


In [None]:
# Save the features dataframe
features_df.to_csv('extracted_features.csv', index=False)

In [None]:
# Drop empty columns
features_df.drop(columns=['boxOffice'], inplace=True)

In [None]:
# Convert years to integer
features_df['years'] = features_df['years'].apply(extract_first_year)

In [None]:
# One Hot Encode Categorical Features
features_df = features_df.reset_index(drop=True)

actors_ohe = multi_ohe(features_df, "actors")
directors_ohe = multi_ohe(features_df, "directors")
writers_ohe = multi_ohe(features_df, "writers")
countries_ohe = multi_ohe(features_df, "country")
languages_ohe = multi_ohe(features_df, "language")

In [None]:
# Normalizing Numerical Features
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features_df.drop(columns=['imdbId',	'writers',	'actors', 'directors', 'language', 'country'], inplace=False))
year_df = pd.DataFrame(features_scaled, columns=['year'], index=features_df.index)

In [None]:
year_df

Unnamed: 0,year
0,0.801724
1,0.801724
2,0.801724
3,0.801724
4,0.801724
...,...
9737,0.991379
9738,0.991379
9739,0.991379
9740,1.000000


In [None]:
# Concatenate newly transformed features (OHE + Scaled)
combined_df = pd.concat([
                          features_df['imdbId'].copy(),
                          year_df,
                          actors_ohe,
                          directors_ohe,
                          writers_ohe,
                          countries_ohe,
                          languages_ohe,
                      ], axis=1)

In [None]:
# Visualize the combined features
combined_df

Unnamed: 0,imdbId,year,'Weird Al' Yankovic,50 Cent,7 Year Bitch,A-Trak,A. Belozorovich,A. Michael Baldwin,A.C. Abadie,A.J. Buckley,...,Urdu,Uzbek,Vietnamese,Washoe,Welsh,Wolof,Xhosa,Yiddish,Yoruba,Zulu
0,114709,0.801724,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,113497,0.801724,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,113228,0.801724,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,114885,0.801724,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,113041,0.801724,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,5476944,0.991379,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9738,5914996,0.991379,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9739,6397426,0.991379,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9740,8391976,1.000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Save the final features dataframe
combined_df.to_csv('ohe_extracted_features.csv', index=False)

In [None]:
# read the saved csv file
combined_df = pd.read_csv('/content/ohe_extracted_features.csv')

In [None]:
# Create the final dataframe, by concatenating the newly extracted features to the original features (genres)
films_features = links_data.merge(combined_df, on="imdbId", how="left")

# Dropping the unnecessary columns: 'imdbId' and 'tmdbId'
films_features = films_features.drop(columns=['imdbId', 'tmdbId'])

In [None]:
films_features

Unnamed: 0,movieId,year,'Weird Al' Yankovic,50 Cent,7 Year Bitch,A-Trak,A. Belozorovich,A. Michael Baldwin,A.C. Abadie,A.J. Buckley,...,Urdu,Uzbek,Vietnamese,Washoe,Welsh,Wolof,Xhosa,Yiddish,Yoruba,Zulu
0,1,0.801724,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0.801724,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0.801724,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0.801724,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0.801724,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,0.991379,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9738,193583,0.991379,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9739,193585,0.991379,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9740,193587,1.000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## **📊 Content-Based Filtering**

#### 1- Get the Cosine Similarity Between each 2 pair of Movies (Based on Movie Features Solely)

In [None]:
# Compute cosine similarity only on nonzero values (excluding movie_id)
films_features_exclude_film = films_features.drop(columns=['movieId'], axis=1)

# Convert the DataFrame to a sparse matrix format, storage efficient
films_features_sparse = csr_matrix(films_features_exclude_film)

# Compute cosine similarity on the sparse matrix (between each 2 pair of items)
cos_sim = cosine_similarity(films_features_sparse, films_features_sparse)

In [None]:
# Convert to a DataFrame for readability
cos_sim_df = pd.DataFrame(cos_sim, index=films_features['movieId'], columns=films_features['movieId'])
cos_sim_df.fillna(0, inplace=True)

In [None]:
cos_sim_df

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.260874,0.274067,0.289488,0.274067,0.289488,0.249419,0.274067,0.289488,0.239352,...,0.080725,0.078456,0.088509,0.079736,0.317646,0.081010,0.085400,0.300293,0.167493,0.287361
2,0.260874,1.000000,0.260874,0.275553,0.260874,0.275553,0.327247,0.260874,0.275553,0.227829,...,0.076839,0.074680,0.084248,0.075898,0.302355,0.077110,0.081289,0.285837,0.159430,0.273528
3,0.274067,0.260874,1.000000,0.289488,0.274067,0.289488,0.249419,0.274067,0.289488,0.239352,...,0.080725,0.078456,0.088509,0.079736,0.317646,0.081010,0.085400,0.300293,0.167493,0.287361
4,0.289488,0.275553,0.289488,1.000000,0.289488,0.305777,0.263453,0.289488,0.305777,0.252820,...,0.085267,0.082871,0.093489,0.084223,0.335520,0.085568,0.090205,0.317190,0.176918,0.303530
5,0.274067,0.260874,0.274067,0.289488,1.000000,0.289488,0.249419,0.274067,0.289488,0.239352,...,0.080725,0.078456,0.088509,0.079736,0.317646,0.081010,0.085400,0.300293,0.167493,0.287361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.081010,0.077110,0.081010,0.085568,0.081010,0.085568,0.073724,0.081010,0.085568,0.070748,...,0.310684,0.296371,0.332319,0.297590,0.108412,1.000000,0.314989,0.103788,0.273309,0.082145
193583,0.085400,0.081289,0.085400,0.090205,0.085400,0.090205,0.077720,0.085400,0.090205,0.074583,...,0.327521,0.312432,0.350329,0.419497,0.114288,0.314989,1.000000,0.109412,0.288121,0.086597
193585,0.300293,0.285837,0.300293,0.317190,0.300293,0.317190,0.273286,0.300293,0.317190,0.262256,...,0.103422,0.100517,0.113395,0.102156,0.350971,0.103788,0.109412,1.000000,0.191804,0.314296
193587,0.167493,0.159430,0.167493,0.176918,0.167493,0.176918,0.152430,0.167493,0.176918,0.146277,...,0.284149,0.271072,0.303957,0.272196,0.202131,0.273309,0.288121,0.191804,1.000000,0.174077


In [None]:
# Save the cosine similarity scores for deployment purposes
cos_sim_df.to_csv('movies_cos_sim.csv')

#### 2- Inference (Mimicing the deployment Settings)

In [52]:
def get_user_predictions(user_id, ratings_data, cos_sim_df):

    """
    Predict ratings for all movies that a given user hasn't rated yet using content-based filtering.

    For the given user:
    - Retrieve all rated movies and corresponding ratings.
    - Compute the weighted average rating for each unrated movie using cosine similarity with the user's rated movies.
    - Rescale predicted ratings to the typical rating range (0.5 - 5.0).

    Args:
      user_id (int): The ID of the user for whom we want to generate predictions.
      ratings_data (pd.DataFrame): DataFrame containing user ratings. Must contain 'userId', 'movieId', and 'rating' columns.
      cos_sim_df (pd.DataFrame): Cosine similarity matrix between movies (movieId as both index and columns).

    Returns:
      scaled_predicted_ratings (dict): Dictionary where keys are movieIds the user hasn't rated,
                                       and values are predicted ratings (rescaled to range 0.5–5.0).
    """


    # Get all movies rated by user
    user_rated_movies_df = ratings_data[ratings_data['userId'] == user_id][['movieId', 'rating']]
    user_rated_movies_df.index = user_rated_movies_df['movieId']
    #print("User's Original Ratings:\n", user_rated_movies_df)

    # Find Movies Similar to the user's favourites (remove already user rated movies)
    cos_sim_subset = cos_sim_df.loc[cos_sim_df.index.isin(user_rated_movies_df['movieId'])].drop(columns=user_rated_movies_df['movieId'])
    #print("Cosine Similarity For Candidates:\n", cos_sim_subset)

    # Get possible rating for each non-rated movie
    n_unrated_movies = len(cos_sim_subset.columns)
    predicted_ratings_dict = {}

    for movie in cos_sim_subset.columns:

        # List of movies that user has rated (used as rows in the similarity matrix)
        user_rated_movies = cos_sim_subset.index.tolist()

        numerator = (cos_sim_subset[movie] * user_rated_movies_df.loc[user_rated_movies, 'rating']).sum()     # Weighted sum of similarities * ratings
        denominator = cos_sim_subset[movie].sum()+ 1e-8            # Sum of similarities (add epsilon to avoid division by zero)

        # Predicted rating for the current movie
        predicted_rating = numerator / denominator
        predicted_ratings_dict[movie] = predicted_rating


    # Sort the dictionary by predicted ratings (values) in descending order
    sorted_predicted_ratings = dict(sorted(predicted_ratings_dict.items(), key=lambda item: item[1], reverse=True))
    #print("\n\nThe predicted ratings for all the movies that the user didn't rate yet:\n", sorted_predicted_ratings)

    # Define the (min - max) range for our ratings
    min_rating = 0.5
    max_rating = 5

    min_pred = min(sorted_predicted_ratings.values())
    max_pred = max(sorted_predicted_ratings.values())

    scaled_predicted_ratings = {
                                movie_id: min_rating + (rating - min_pred) * (max_rating - min_rating) / (max_pred - min_pred + 1e-8)
                                for movie_id, rating in sorted_predicted_ratings.items()
                                }

    #print("\n\nThe rescaled predicted ratings for all the movies that the user didn't rate yet:\n", scaled_predicted_ratings)

    return scaled_predicted_ratings

In [None]:
# Read the Cosine similarity dataframe
cos_sim_df = pd.read_csv('movies_cos_sim.csv').set_index('movieId')

In [None]:
# Convert the index and columns back to integers (from strings)
cos_sim_df.columns = cos_sim_df.columns.astype(int)
cos_sim_df.index = cos_sim_df.index.astype(int)

In [53]:
cos_sim_df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.260874,0.274067,0.289488,0.274067,0.289488,0.249419,0.274067,0.289488,0.239352,...,0.080725,0.078456,0.088509,0.079736,0.317646,0.081010,0.085400,0.300293,0.167493,0.287361
2,0.260874,1.000000,0.260874,0.275553,0.260874,0.275553,0.327247,0.260874,0.275553,0.227829,...,0.076839,0.074680,0.084248,0.075898,0.302355,0.077110,0.081289,0.285837,0.159430,0.273528
3,0.274067,0.260874,1.000000,0.289488,0.274067,0.289488,0.249419,0.274067,0.289488,0.239352,...,0.080725,0.078456,0.088509,0.079736,0.317646,0.081010,0.085400,0.300293,0.167493,0.287361
4,0.289488,0.275553,0.289488,1.000000,0.289488,0.305777,0.263453,0.289488,0.305777,0.252820,...,0.085267,0.082871,0.093489,0.084223,0.335520,0.085568,0.090205,0.317190,0.176918,0.303530
5,0.274067,0.260874,0.274067,0.289488,1.000000,0.289488,0.249419,0.274067,0.289488,0.239352,...,0.080725,0.078456,0.088509,0.079736,0.317646,0.081010,0.085400,0.300293,0.167493,0.287361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.081010,0.077110,0.081010,0.085568,0.081010,0.085568,0.073724,0.081010,0.085568,0.070748,...,0.310684,0.296371,0.332319,0.297590,0.108412,1.000000,0.314989,0.103788,0.273309,0.082145
193583,0.085400,0.081289,0.085400,0.090205,0.085400,0.090205,0.077720,0.085400,0.090205,0.074583,...,0.327521,0.312432,0.350329,0.419497,0.114288,0.314989,1.000000,0.109412,0.288121,0.086597
193585,0.300293,0.285837,0.300293,0.317190,0.300293,0.317190,0.273286,0.300293,0.317190,0.262256,...,0.103422,0.100517,0.113395,0.102156,0.350971,0.103788,0.109412,1.000000,0.191804,0.314296
193587,0.167493,0.159430,0.167493,0.176918,0.167493,0.176918,0.152430,0.167493,0.176918,0.146277,...,0.284149,0.271072,0.303957,0.272196,0.202131,0.273309,0.288121,0.191804,1.000000,0.174077


In [None]:
# Get predicted movies' scores for a specific user
user_id = 200
user_scaled_predicted_ratings = get_user_predictions(user_id, ratings_data, cos_sim_df)

User's Original Ratings:
          movieId  rating
movieId                 
1              1     3.5
5              5     4.0
10            10     4.5
19            19     3.5
34            34     2.5
...          ...     ...
60074      60074     3.5
61024      61024     4.0
61323      61323     4.0
62299      62299     3.5
63433      63433     4.0

[334 rows x 2 columns]
Cosine Similarity For Candidates:
            2         3         4         6         7         8         9       \
movieId                                                                         
1        0.260874  0.274067  0.289488  0.289488  0.249419  0.274067  0.289488   
5        0.260874  0.274067  0.289488  0.289488  0.249419  0.274067  0.289488   
10       0.227829  0.239352  0.252820  0.348485  0.217826  0.239352  0.252820   
19       0.275553  0.289488  0.305777  0.305777  0.263453  0.289488  0.305777   
34       0.237412  0.249419  0.263453  0.263453  0.226988  0.249419  0.263453   
...           ...      

## 🔮 User-Item Predictions


In [54]:
def get_all_users_predictions(ratings_df, cos_sim_df):

    """
    Generate a user-item matrix of predicted ratings using content-based filtering
    and cosine similarity between movies.

    For each user:
    - Already rated (seen) movies are set to 0.
    - Unseen movies are predicted using cosine similarity.

    Args:
      ratings_df (pd.DataFrame): DataFrame containing user ratings. Must include 'userId' and 'movieId' columns.
      cos_sim_df (pd.DataFrame): Cosine similarity DataFrame between movies, indexed and columned by 'movieId'.

    Returns:
      final_prediction_df (pd.DataFrame): DataFrame with users as rows, movies as columns, values as either 0 (seen) or predicted rating (unseen).
    """

    # Get all unique users and movieIds
    unique_users = ratings_df['userId'].unique()
    all_movie_ids = cos_sim_df.columns.tolist()

    user_predictions = {}

    # Get predictions for each user
    for user in unique_users:

        # Movies the user has rated
        rated_movies = ratings_df[ratings_df['userId'] == user]['movieId'].tolist()
        rated_dict = {movie_id: 0 for movie_id in rated_movies}

        # Predicted ratings for unseen movies
        predicted_ratings = get_user_predictions(user, ratings_df, cos_sim_df)

        # Combine both into one dictionary
        full_user_row = {movie_id: predicted_ratings.get(movie_id, rated_dict.get(movie_id, float('nan')))
                         for movie_id in all_movie_ids}

        user_predictions[user] = full_user_row


    # Convert to DataFrame (users as rows, movies as columns)
    final_prediction_df = pd.DataFrame.from_dict(user_predictions, orient='index')

    # Sort by userId and movieId
    final_prediction_df.sort_index(inplace=True)
    final_prediction_df = final_prediction_df[sorted(final_prediction_df.columns)]

    # Save to CSV
    final_prediction_df.to_csv('Content_based_predictions_matrix.csv')

    return final_prediction_df

In [55]:
# Get predictions for all users-movies pairs (excluding seen movies)
final_prediction_df = get_all_users_predictions(ratings_data, cos_sim_df)

In [56]:
# Observe the shape of the final predictions
final_prediction_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,0.000000,1.219871,0.000000,1.261826,1.242525,0.000000,1.539146,1.193359,1.261826,1.558521,...,1.052152,1.050919,1.050522,1.050132,1.249903,1.049372,1.049372,1.248771,1.342651,1.264351
2,3.805510,3.712612,3.837187,3.805510,3.805510,3.844348,3.697028,3.805510,3.805510,3.426570,...,3.689955,3.688391,3.687887,3.687392,4.052942,3.686424,3.686425,3.794751,3.644321,3.807616
3,2.878805,2.715783,2.804403,2.875632,2.933616,2.907951,2.665687,2.933616,2.881303,2.912994,...,3.704271,3.699941,3.698545,3.697173,3.110815,3.694495,3.694495,2.966801,3.315335,2.927160
4,3.503755,3.296314,3.419446,3.667907,3.781866,3.612372,3.617601,3.667907,3.667907,3.324577,...,2.251389,2.264601,2.268862,2.273054,3.676732,2.281238,2.281238,3.618048,3.250817,3.677623
5,0.000000,2.870044,2.681221,2.540873,2.540873,2.812615,2.942619,2.540873,2.540873,2.798069,...,2.937004,2.924092,2.919937,2.915854,2.535971,2.907895,2.907895,2.466601,2.663657,2.541918
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.000000,1.012899,1.111137,0.755156,0.742615,0.714083,0.000000,0.753476,0.759791,0.924331,...,1.530858,1.522701,1.520069,1.517478,0.793801,1.512419,1.512419,0.786009,1.214459,0.752516
607,0.000000,2.317297,2.595925,2.366972,2.389057,2.615587,2.238710,2.368535,2.389057,2.460519,...,1.673071,1.686660,1.691039,1.695345,2.380644,1.703749,1.703749,2.414149,2.085101,2.390841
608,0.000000,0.000000,0.000000,0.900619,0.862969,1.159557,1.153222,0.873345,0.883398,0.000000,...,0.886459,0.895853,0.898882,0.901863,0.944026,0.907684,0.907684,0.924454,0.846664,0.880631
609,0.000000,1.575457,1.311270,1.156570,1.156570,1.493366,1.632088,1.090826,1.156570,0.000000,...,1.189298,1.201374,1.205265,1.209091,1.183141,1.216555,1.216555,1.185646,0.987130,1.150901


### **Map the Columns of the movie ID to reflect the movie title**

In [62]:
# Get Movie Id & Title in separate lists
movies_ids = movies_data['movieId'].tolist()
movies_titles = movies_data['title'].tolist()

In [64]:
# Check for any size mismatch
assert len(movies_ids) == len(movies_titles), "Size Mismatch in Name Mapping"

In [66]:
# Get mapped movie IDs
mapped_predictions_df = final_prediction_df.rename(columns={movieId: title for movieId, title in zip(movies_ids, movies_titles)})

In [67]:
# Save mapped dataframe to CSV
mapped_predictions_df.to_csv('Mapped_Content_based_predictions_matrix.csv')

## 📝 Evaluate The Final Content-Based Model

In [80]:
# Get unique users
users = ratings_data['userId'].unique()

# Split users to train & test sets
train_users, test_users = train_test_split(users, train_size=0.8)

# Observe size of train & test sets
print('Length of training data:', len(train_users))
print('Length of test data:', len(test_users))

Length of training data: 488
Length of test data: 122


In [79]:
# Get Rated Movies, for testing
test_dataframe = ratings_data[ratings_data['userId'].isin(test_users)]

In [126]:
def test_model(test_users, test_dataframe, cos_sim_df):

    """
    Predicts ratings for movies already rated by test users using item-item collaborative filtering,
    and compares the predicted ratings to the actual ratings.

    The function evaluates a content-based recommendation model by iterating over each test user,
    and for every movie they have rated, it predicts the rating based on the weighted average
    of ratings they gave to other similar movies. Cosine similarity is used as a measure of similarity
    between movies.

    Args:
        test_users (list): List of user IDs to evaluate the model on.
        test_dataframe (pd.DataFrame): DataFrame containing 'userId', 'movieId', and 'rating' columns.
        cos_sim_df (pd.DataFrame): Item-item cosine similarity matrix with movieId as both index and columns.

    Returns:
        actual_ratings (list): List of true user ratings.
        predicted_ratings (list): List of predicted ratings by the model.
    """


    actual_ratings = []
    predicted_ratings = []


    for user in test_users:

        # Get all movies rated by user
        user_rated_movies_df = test_dataframe[test_dataframe['userId'] == user][['movieId', 'rating']]
        user_rated_movies_df = user_rated_movies_df.set_index('movieId')

        # Get user rated movies
        user_rated_movies = user_rated_movies_df.index.tolist()


        # Get possible rating for each already-rated movie
        for movie in user_rated_movies:

              # Use all other movies rated by user except the target movie to predict its rating
              rated_others = [mid for mid in user_rated_movies if mid != movie]

              # Similarities between the target movie and other rated movies
              similarities = cos_sim_df.loc[rated_others, movie]

              numerator = (similarities * user_rated_movies_df.loc[rated_others, 'rating']).sum()     # Weighted sum of similarities * ratings
              denominator = similarities.sum() + 1e-8            # Sum of similarities (add epsilon to avoid division by zero)

              # Predicted rating for the current movie
              predicted_rating = numerator / denominator

              # Actual rating from the user
              actual_rating = user_rated_movies_df.loc[movie, 'rating']

              # Store the results
              predicted_ratings.append(predicted_rating)
              actual_ratings.append(actual_rating)


    return actual_ratings, predicted_ratings

In [128]:
# Get predictions for the test data
actual_ratings, predicted_ratings = test_model(test_users, test_dataframe, cos_sim_df)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

# Show metrics
print(f"RMSE: {rmse}")

RMSE: 0.8265501910852342
