# A Movie Recommendation Example

In this example, we use collaborative filtering to better understand which movies users will enjoy. We then try to increase the satisfaction of our user base by giving a subset of users a free movie coupon code for a movie that our model thinks they will enjoy.

## Collaborative Filtering Example using SVD

To address some of the limitations of content-based filtering, collaborative filtering uses similarities between users and items simultaneously to provide recommendations. This allows for serendipitous recommendations; that is, collaborative filtering models can recommend an item to user A based on the interests of a similar user B. Furthermore, the embeddings can be learned automatically, without relying on hand-engineering of features.


In [None]:
!pip install surprise

import pandas as pd
import numpy as np
from random import sample

from surprise import Reader
from surprise import SVD
from surprise import Dataset

In [None]:
# Load the movie list and the user ratings; both files are tab-separated.
MOVIE_LIST_URL = "https://storage.googleapis.com/arize-assets/tutorials/fixture_data/movielist-dataset.csv"
MOVIE_RATINGS_URL = "https://storage.googleapis.com/arize-assets/tutorials/fixture_data/movielens-dataset.csv"

movie_list_df = pd.read_csv(MOVIE_LIST_URL, sep="\t")
movie_ratings_df = pd.read_csv(MOVIE_RATINGS_URL, sep="\t")

# Display 10 random rating rows as a quick look at the data.
movie_ratings_df.sample(10)

In [None]:
# Loading from a DataFrame still requires a Reader, but only its
# rating_scale parameter matters here.
reader = Reader(rating_scale=(1, 5))

# Column order is significant: user id, item id, rating.
ratings_columns = ["userId", "movieId", "rating"]
dataset = Dataset.load_from_df(movie_ratings_df[ratings_columns], reader)

### Train/Build Model
Train a basic Matrix Factorization model as our Collaborative Filter 

In [None]:
# Use every available rating for training (no hold-out split).
trainingSet = dataset.build_full_trainset()

# Singular Value Decomposition (SVD) with 50 latent factors.
svd = SVD(n_factors=50)
svd.fit(trainingSet)

In [None]:
# Lookup table from movieId to movie name, used later to label
# recommendations. If a movieId appears on several rows, the name from the
# last such row is the one kept.
movie_mapping = dict(
    zip(movie_ratings_df["movieId"], movie_ratings_df["movie_names"])
)

In [None]:
# Define some useful functions

# This function builds an "anti-testset" for one user: every movie the user has NOT rated yet, each paired with the global mean rating as a placeholder label
def build_anti_testset(user_id):
    """Build the anti-testset for a single user.

    Args:
        user_id: Raw user id; it is converted to the trainset's inner id via
            ``trainingSet.to_inner_uid``.

    Returns:
        A list of ``(raw_user_id, raw_item_id, fill)`` tuples, one for every
        item in ``trainingSet`` that the user has no rating for. ``fill`` is
        the trainset's global mean rating, used as a placeholder label.
    """
    placeholder = trainingSet.global_mean
    inner_uid = trainingSet.to_inner_uid(user_id)
    raw_uid = trainingSet.to_raw_uid(inner_uid)

    # Inner ids of the items this user has already rated.
    rated = {inner_iid for inner_iid, _ in trainingSet.ur[inner_uid]}

    return [
        (raw_uid, trainingSet.to_raw_iid(inner_iid), placeholder)
        for inner_iid in trainingSet.all_items()
        if inner_iid not in rated
    ]


# This function returns the top n recommendations for a specific user
def rec_top_n_movies(user_id, num_recommendations=10, latest=False):
    """Recommend movies the given user has not rated yet.

    Every unrated movie is scored with the fitted ``svd`` model. The
    highest-scoring candidates form a pool of at least 20 movies (grown to
    ``num_recommendations`` when more than 20 are requested), and the result
    is cut from that pool.

    Args:
        user_id: Raw user id as it appears in the ratings data.
        num_recommendations: Maximum number of rows to return.
        latest: When truthy, order the pool by ``release_year`` (newest
            first) before truncating, so the result holds the most recent
            releases among the top-scored candidates instead of the
            top-scored candidates overall.

    Returns:
        A DataFrame with the columns ``movie_names`` and ``rating`` (the
        estimated rating), at most ``num_recommendations`` rows long.
    """
    # Score every movie the user has not rated yet.
    predictions = svd.test(build_anti_testset(user_id))

    # Keep (movie id, estimated rating) pairs, best estimate first.
    scored = sorted(
        (
            (int(movie_id), estimate)
            for _uid, movie_id, _actual, estimate, _details in predictions
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # The candidate pool used to be fixed at 20, which silently capped the
    # output when more than 20 recommendations were requested.
    pool = scored[: max(20, num_recommendations)]

    movie_dataframe = pd.DataFrame(
        {
            "movie_names": [movie_mapping[movie_id] for movie_id, _ in pool],
            "rating": [estimate for _, estimate in pool],
        }
    ).merge(
        movie_list_df[["movie_names", "release_year"]],
        on="movie_names",
        how="left",
    )

    if latest:
        return movie_dataframe.sort_values("release_year", ascending=False)[
            ["movie_names", "rating"]
        ].head(num_recommendations)

    return movie_dataframe.drop("release_year", axis=1).head(
        num_recommendations
    )

In [None]:
# Example: the 10 most recently released movies among the top collaborative
# filtering predictions for a single user.
user_id = 2341
recommendations = rec_top_n_movies(
    user_id=user_id,
    num_recommendations=10,
    latest=True,
)

# One print call with a newline separator emits the same two lines of output.
print("Top Movies for User: {}".format(user_id), recommendations, sep="\n")

In [None]:
# Send a free movie coupon code to a random subset of users in the customer
# base, based on each user's top-rated latest movies.
coupon_list = []

# Collect the distinct users and draw the lucky ones without replacement.
# The sample size is capped at the number of users so that `sample` cannot
# raise ValueError on a dataset with fewer than 100 users.
allUsers = list(movie_ratings_df["userId"].unique())
sample_size = min(100, len(allUsers))
luckyUsers = sample(allUsers, sample_size)

# Recommend up to 5 movies for each lucky user, best estimated rating first.
for lucky_user in luckyUsers:
    recommendations = rec_top_n_movies(
        user_id=lucky_user, num_recommendations=5, latest=True
    ).sort_values("rating", ascending=False)
    recommendations["userId"] = lucky_user
    # Rank by position. The length is taken from the frame itself because a
    # fixed [1, 2, 3, 4, 5] fails when fewer than 5 rows come back.
    recommendations["rank"] = list(range(1, len(recommendations) + 1))
    coupon_list.append(recommendations)

# ignore_index gives every row a unique 0..N-1 label; without it each
# per-user frame keeps its own labels and they repeat across users.
coupon_df = pd.concat(coupon_list, ignore_index=True)

# Scale the estimated rating (on a 1-5 scale) by 1/5 to get a score that is
# at most 1.
coupon_df["score"] = coupon_df["rating"] / 5

In [None]:
# Preview the first rows of the coupon recommendations.
coupon_df.head()

### Prepare Data for Arize

In [None]:
import uuid
from datetime import datetime, timedelta


# Prediction ID is required for all datasets
def generate_prediction_ids(X):
    """Return one random UUID4 string per row of ``X``.

    Args:
        X: A pandas object; its length sets the number of ids and its index
            is reused for the result.

    Returns:
        A Series of UUID strings sharing ``X``'s index.
    """
    prediction_ids = [str(uuid.uuid4()) for _ in range(len(X))]
    return pd.Series(prediction_ids, index=X.index)


# OPTIONAL: We can directly specify when inferences were made
def simulate_production_timestamps(X, days=30):
    """Spread simulated inference timestamps evenly over the last ``days`` days.

    Args:
        X: A pandas object; its length sets the number of timestamps and its
            index is reused for the result.
        days: Length of the look-back window, ending at the current time.

    Returns:
        A Series of epoch timestamps (float seconds) sharing ``X``'s index,
        evenly spaced from ``days`` days ago up to now.
    """
    now = datetime.now()
    window_end = now.timestamp()
    window_start = (now - timedelta(days=days)).timestamp()

    stamps = np.linspace(window_start, window_end, num=len(X))
    return pd.Series(stamps, index=X.index)

In [None]:
# let's simulate some labeled data and assume all of the free coupons were used and users rated the movie they watched
def bound(value):
    """Clamp ``value`` to the closed range [1, 5]."""
    capped = min(5, value)
    return max(1, capped)


# One simulated "actual" rating per coupon row: the predicted rating plus
# standard-normal noise, rounded to a whole number and clamped by bound().
actuals = pd.Series(
    [
        bound(round(predicted + np.random.normal(0, 1)))
        for predicted in coupon_df["rating"]
    ]
)

In [None]:
# Assemble data for logging.
# Reset the index before adding columns so every row has a unique 0..N-1
# label and the new columns line up with the rows by position.
coupon_df.reset_index(inplace=True, drop=True)
coupon_df["prediction_id"] = generate_prediction_ids(coupon_df["userId"])
coupon_df["prediction_ts"] = simulate_production_timestamps(coupon_df["userId"])
# Assign the simulated ratings by position. Assigning the Series itself
# aligns on index labels, which puts values on the wrong rows whenever
# coupon_df's labels are not exactly 0..N-1 in order.
coupon_df["actual_rating"] = actuals.to_numpy()
# Random 0/1 "watched" flag per row, stored as a string.
coupon_df["watched"] = np.random.choice([0, 1], size=len(coupon_df))
coupon_df["watched"] = coupon_df["watched"].astype(str)
coupon_df.head()

### Install and Initialize Arize Client

In [None]:
!pip install -q arize
from arize.pandas.logger import Client, Schema
from arize.utils.types import ModelTypes, Environments, Metrics

SPACE_KEY = "SPACE_KEY"
API_KEY = "API_KEY"

# Check for the placeholder values before building the client, so that a
# failed check does not leave behind an `arize_client` holding placeholder
# credentials.
if SPACE_KEY == "SPACE_KEY" or API_KEY == "API_KEY":
    raise ValueError("❌ NEED TO CHANGE SPACE AND/OR API_KEY")

arize_client = Client(space_key=SPACE_KEY, api_key=API_KEY)
print(
    "Step 2 ✅: Import and Setup Arize Client Done! Now we can start using Arize!"
)

### Log Data to Arize

In [None]:
# Log the production data to Arize.
# The schema tells Arize which coupon_df column plays which role.
# NOTE(review): `rating` is the model's estimated rating, while the simulated
# `actual_rating` column is not referenced by this schema. Confirm whether
# relevance_score_column_name should point at `actual_rating` instead.
schema = Schema(
    prediction_id_column_name="prediction_id",
    timestamp_column_name="prediction_ts",
    prediction_group_id_column_name="userId",
    rank_column_name="rank",
    prediction_score_column_name="score",
    relevance_score_column_name="rating",
    relevance_labels_column_name="watched",
    tag_column_names=["movie_names"],
)

# arize_client.log returns a Response object from the requests module.
response = arize_client.log(
    dataframe=coupon_df,
    schema=schema,
    environment=Environments.PRODUCTION,
    model_id="collaborative_filtering_movie_example",
    model_version="1.0",
    model_type=ModelTypes.RANKING,
    metrics_validation=[Metrics.RANKING],
    validate=True,
)

# A status_code of 200 means the server accepted the data.
if response.status_code == 200:
    print(
        f"Step 3 ✅: You have successfully logged {len(coupon_df)} data points to Arize!"
    )
else:
    print(
        f"❌ logging failed with response code {response.status_code}, {response.text}"
    )