---
# Setup

In [1]:
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from time import sleep
import sqlite3
import requests
import json

In [2]:
# development database: small sample for quick experiments (100,000 rows)
db_dev_path = 'netflix_dev.db'
db_dev_conn = f'sqlite://{db_dev_path}'

# production database: the full dataset (~100,000,000 rows)
db_prod_path = 'netflix.db'
db_prod_conn = f'sqlite://{db_prod_path}'

# both tables are read from the development database;
# pass db_prod_conn instead to work on the full dataset
netflix_data = pl.read_database("SELECT * FROM netflix_data", db_dev_conn)
movie_titles = pl.read_database("SELECT * FROM movie_titles", db_dev_conn)

---
# Functions

Implement your recommendation engine logic in `predict_rating()`. Then, call
`full_evaluation()` to evaluate your engine.

| Function | Description |
| --- | --- |
| `get_user_ratings(user_id)` | Returns a DataFrame of all ratings for a given user. |
| `ratings_to_vector(ratings, placeholder=-1)` | Transforms a DataFrame of ratings into a vector. |
| `mask_ratings(ratings, split)` | Randomly splits a user's rating DataFrame into masked and unmasked ratings. Works like a training/test-set split. |
| `predict_rating(user_id, movie_id)` | Predict how a user would rate a movie. **IMPLEMENT YOUR ENGINE HERE**|
| `rate_all(user_id)` | Predict ratings of all movies for a given user |
| `get_top_rated(ratings, split)` | Get the top rated movies for a user. |
| `evaluate(masked_ratings, top_ratings)` | Evaluate the accuracy of the predictions. |
| `full_evaluation(user_id, split_mask, split_top)` | Like `evaluate()`, but runs all the steps (masking, rating, top selection, evaluation) for a given user. |

In [33]:
# look up every rating a single user has given
def get_user_ratings(user_id):
    """Return all ratings of one user

    Args:
        user_id (int): User ID

    Returns:
        pl.DataFrame: Rows of `netflix_data` whose "user" column equals `user_id`
    """
    belongs_to_user = pl.col("user") == user_id
    return netflix_data.filter(belongs_to_user)

In [202]:
# ratings to vector
def ratings_to_vector(ratings, placeholder=-1):
    """
    Convert a DataFrame of ratings to a vector of ratings

    The vector has one slot per row of `movie_titles`. The rating of the film
    with id `f` is stored at index `f - 1`; every other slot keeps the
    placeholder. This assumes film ids run from 1 to len(movie_titles).

    Args:
        ratings (pl.DataFrame): Ratings with a "film" and a "rating" column
        placeholder (int, optional): Placeholder value for missing ratings. Defaults to -1.

    Returns:
        list: One entry per film, either the rating or the placeholder

    Raises:
        IndexError: If a film id is not in the range 1..len(movie_titles)
    """

    n_films = len(movie_titles)
    ratings_list = [placeholder] * n_films

    # read the columns by name instead of by position, so that frames with a
    # different column order (e.g. the output of rate_all) are handled correctly
    for film_id, rating in zip(ratings["film"], ratings["rating"]):

        # an id below 1 would otherwise turn into a negative index and
        # silently overwrite a slot at the end of the list
        if not 1 <= film_id <= n_films:
            raise IndexError(f"film id {film_id} is outside 1..{n_films}")

        # film id starts at 1, but list index starts at 0, so subtract 1
        ratings_list[film_id - 1] = rating

    return ratings_list


In [54]:
# randomly divide a user's ratings into a masked part and an unmasked part
def mask_ratings(ratings, split):
    """Split a user's ratings into masked and unmasked ratings

    The rows are shuffled first, so which ratings end up masked is random.

    Args:
        ratings (pl.DataFrame): A user's ratings
        split (float): Fraction of the ratings to mask (rounded down to whole rows)

    Returns:
        tuple[pl.DataFrame, pl.DataFrame]: Masked ratings, unmasked ratings
    """

    shuffled = ratings.sample(fraction=1, shuffle=True)

    total = len(shuffled)
    n_hidden = int(total * split)

    # the first n_hidden shuffled rows are masked, the remaining rows stay visible
    hidden = shuffled.head(n_hidden)
    visible = shuffled.tail(total - n_hidden)

    return hidden, visible

In [123]:
# predict how a user would rate a movie
import random


def predict_rating(user_id, movie_id):
    """Predict the rating a user would give a movie

    This is the placeholder engine: it ignores both arguments and draws a
    random rating between 1 and 5.

    Args:
        user_id (int): User ID
        movie_id (int): Movie ID

    Returns:
        float: Predicted rating
    """

    ### PUT YOUR PREDICTION CODE HERE ###

    lowest, highest = 1, 5
    return random.uniform(lowest, highest)

In [118]:
# rate all movies for a given user
def rate_all(user_id):
    """Predict a rating for every movie in `movie_titles` for a given user

    Args:
        user_id (int): User ID

    Returns:
        pl.DataFrame: One row per movie with the columns "user", "film" and "rating"
    """

    # one [user, film, rating] row per movie, in the order of movie_titles
    rows = [
        [user_id, movie, predict_rating(user_id, movie)]
        for movie in movie_titles["film"]
    ]

    # state the orientation explicitly: each inner list is a row, not a
    # column, so polars does not have to guess it from the data
    return pl.DataFrame(rows, schema=["user", "film", "rating"], orient="row")

In [119]:
# keep only the best-rated share of a ratings DataFrame
def get_top_rated(ratings, split):
    """Return the highest-rated part of a ratings DataFrame

    Args:
        ratings (pl.DataFrame): Ratings with a "rating" column
        split (float): Fraction of the rows to keep (rounded down to whole rows)

    Returns:
        pl.DataFrame: The rows with the highest ratings, best first
    """

    # best ratings first
    ranked = ratings.sort(by="rating", descending=True)

    # number of rows that make up the requested share
    keep = int(len(ranked) * split)

    return ranked.head(keep)

In [142]:
# evaluate how well a model performs
# for this, we compare how well the model predicts the top m% of ratings
# we calculate the intersection between the masked ratings and the top m% of ratings
def evaluate(masked_ratings, top_ratings):
    """Evaluate how well a model performs

    Args:
        masked_ratings (pl.DataFrame): Masked ratings
        top_ratings (pl.DataFrame): Top m% of ratings

    Returns:
        float: Fraction (0.0 to 1.0) of the masked ratings whose (user, film)
            pair also appears in the top ratings; 0.0 if there are no
            masked ratings
    """

    n_masked = len(masked_ratings)

    # nothing was masked (e.g. a user with a single rating and split 0.5):
    # report 0.0 instead of failing with a ZeroDivisionError
    if n_masked == 0:
        return 0.0

    # get intersection between masked ratings and top ratings
    intersection = masked_ratings.join(top_ratings, on=["user", "film"], how="inner")

    # share of the masked ratings that were found among the top ratings
    return len(intersection) / n_masked

In [140]:
# the whole pipeline for one user: mask, predict, pick the top, evaluate
def full_evaluation(user_id, split_mask, split_top):
    """Run the complete evaluation for a given user

    Args:
        user_id (int): User ID
        split_mask (float): Fraction of the user's ratings to mask
        split_top (float): Fraction of the predicted ratings to treat as top ratings

    Returns:
        float: Result of `evaluate()` for the masked ratings and the top predictions
    """

    user_ratings = get_user_ratings(user_id)

    # only the masked part is needed here; the unmasked part is discarded
    held_out, _ = mask_ratings(user_ratings, split_mask)

    predicted = rate_all(user_id)
    best_predicted = get_top_rated(predicted, split_top)

    return evaluate(held_out, best_predicted)

---
# Testing

In [157]:
# find the 5 users with the most ratings
# (count the rows per user, then sort the counts from largest to smallest)
user_rating_counts = netflix_data.groupby("user").count().sort(by="count", descending=True)
user_rating_counts.head(5)

user,count
i64,u32
387418,22
1461435,17
305344,15
2118461,15
2439493,14


In [213]:
# get all ratings for a user (here: the user with ID 7)
get_user_ratings(7)

film,user,rating,date
i64,i64,i64,str
13580,7,4,"""2005-05-23 """


In [None]:
# transform the ratings into a vector
# here, all missing ratings get the placeholder value 0
# (passed as the second argument, overriding the default of -1)
ratings_to_vector(get_user_ratings(7), 0)

In [214]:
# evaluate the model for a user: mask 50% of the user's ratings and compare
# them with the top 10% of the predicted ratings
# NOTE: the result changes between runs, because both the masking and the
# placeholder predictions are random
full_evaluation(2118461, 0.5, 0.1)

0.14285714285714285