In [None]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# A Movie Recommendation Example

In this example, we use collaborative filtering to better understand which movies our users will enjoy. To increase the satisfaction of our user base, we then give a subset of users a free movie coupon code for a movie that our model predicts they will enjoy.

### Collaborative Filtering Example using SVD

To address some of the limitations of content-based filtering, collaborative filtering uses similarities between users and items simultaneously to provide recommendations. This allows for serendipitous recommendations; that is, collaborative filtering models can recommend an item to user A based on the interests of a similar user B. Furthermore, the embeddings can be learned automatically, without relying on hand-engineering of features.


In [None]:
import pandas as pd
import numpy as np
from random import sample

from surprise import Reader
from surprise import SVD
from surprise import Dataset

In [None]:
# Load datasets
# Both files are read as tab-separated text (sep='\t').
# movie_list_df is used later to look up each movie's release_year by movie_names.
movie_list_df = pd.read_csv("https://storage.googleapis.com/arize-assets/tutorials/fixture_data/movielist-dataset.csv", sep='\t')
# movie_ratings_df supplies the userId / movieId / rating columns the model is trained on.
movie_ratings_df = pd.read_csv("https://storage.googleapis.com/arize-assets/tutorials/fixture_data/movielens-dataset.csv", sep='\t')
# Display 10 randomly chosen rows as a quick sanity check (different rows on every run).
movie_ratings_df.sample(10)

Unnamed: 0,userId,movieId,rating,movie_names,genres,release_year
684524,4090,1957,4,Chariots of Fire (1981),Drama,1981
405981,2436,2463,4,Ruthless People (1986),Comedy,1986
759471,4510,1086,3,Dial M for Murder (1954),Mystery|Thriller,1954
842869,5065,497,4,Much Ado About Nothing (1993),Comedy|Romance,1993
170545,1086,2193,4,Willow (1988),Action|Adventure|Fantasy,1988
835631,5023,3606,3,On the Town (1949),Musical,1949
951888,5749,149,4,Amateur (1994),Crime|Drama|Thriller,1994
349755,2054,761,2,"Phantom, The (1996)",Adventure,1996
568794,3490,319,4,Shallow Grave (1994),Thriller,1994
964221,5812,2916,4,Total Recall (1990),Action|Adventure|Sci-Fi|Thriller,1990


In [None]:
# A reader is still needed but only the rating_scale param is required.
# Here the scale is declared as 1 (lowest) to 5 (highest).
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
# Only these three columns are passed on; the other columns of movie_ratings_df are not used for training.
dataset = Dataset.load_from_df(movie_ratings_df[['userId', 'movieId', 'rating']], reader)

In [None]:
# Singular Value Decomposition (SVD)
# random_state is fixed so that training starts from the same random
# initialisation on every run; without it the recommendations below change
# each time the notebook is re-executed.
svd = SVD(n_factors=50, random_state=42)

# Train on every available rating: build_full_trainset() uses the whole
# dataset, so no ratings are held out for evaluation in this example.
trainingSet = dataset.build_full_trainset()
svd.fit(trainingSet)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f5cb65e3d50>

In [None]:
# Lookup table from movieId to movie title, used later to turn predicted
# item ids back into readable names. When a movieId appears on several rows,
# the title from the last such row is the one that is kept.
movie_mapping = dict(zip(movie_ratings_df['movieId'], movie_ratings_df['movie_names']))

In [None]:
# Define some useful functions
def build_anti_testset(user_id):
    """Build the list of (user, item, rating) triples for items the user has not rated.

    Args:
        user_id: Raw user id as it appears in the training data.

    Returns:
        A list of ``(raw_user_id, raw_item_id, fill)`` tuples, one for every
        item in ``trainingSet`` that the user has no rating for. ``fill`` is
        the global mean rating of the training set and acts as a placeholder
        for the unknown true rating.
    """
    placeholder_rating = trainingSet.global_mean
    inner_uid = trainingSet.to_inner_uid(user_id)

    # Inner ids of everything this user has already rated.
    already_rated = {inner_iid for inner_iid, _ in trainingSet.ur[inner_uid]}

    unseen = []
    for inner_iid in trainingSet.all_items():
        if inner_iid in already_rated:
            continue
        unseen.append(
            (
                trainingSet.to_raw_uid(inner_uid),
                trainingSet.to_raw_iid(inner_iid),
                placeholder_rating,
            )
        )
    return unseen



def rec_top_n_movies(user_id, num_recommendations=10, latest=False, candidate_pool=20):
    """Recommend movies the user has not rated, ranked by predicted rating.

    The model scores every movie the user has not rated. The
    highest-scoring candidates form a shortlist, and the result is taken from
    that shortlist.

    Args:
        user_id: Raw user id to recommend for.
        num_recommendations: Number of rows to return.
        latest: If False, return the shortlist's best-predicted movies.
            If True, order the shortlist by release year (newest first,
            ties broken by higher predicted rating) and return the top rows,
            i.e. the newest movies among the strongest candidates.
        candidate_pool: Minimum size of the shortlist. It is automatically
            enlarged to ``num_recommendations`` so a request for more rows
            than the pool size is not silently truncated.

    Returns:
        A DataFrame with columns ``movie_names`` and ``rating`` (the
        predicted rating), holding at most ``num_recommendations`` rows.
    """
    predictions = svd.test(build_anti_testset(user_id))

    # Each prediction unpacks into five fields; only the item id and the
    # estimated rating are needed here.
    ranked = sorted(
        ((int(movie_id), estimate) for _, movie_id, _, estimate, _ in predictions),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Never shortlist fewer candidates than the caller asked for.
    pool_size = max(candidate_pool, num_recommendations)
    shortlist = ranked[:pool_size]

    movie_dataframe = pd.DataFrame({
        'movie_names': [movie_mapping[movie_id] for movie_id, _ in shortlist],
        'rating': [estimate for _, estimate in shortlist],
    }).merge(movie_list_df[['movie_names', 'release_year']], on='movie_names', how='left')

    if latest:
        # Sorting on (release_year, rating) makes the order deterministic when
        # several shortlisted movies share a release year.
        newest_first = movie_dataframe.sort_values(
            ['release_year', 'rating'], ascending=[False, False]
        )
        return newest_first[['movie_names', 'rating']].head(num_recommendations)

    return movie_dataframe.drop('release_year', axis=1).head(num_recommendations)

In [None]:
# Example: recommend 10 movies for a single user based on collaborative filtering.
# With latest=True, rec_top_n_movies orders its highest-predicted candidates by
# release year (newest first), so the output is the 10 newest of those candidates
# rather than the 10 highest predicted ratings.
user_id = 2341

recommendations= rec_top_n_movies(user_id=user_id, num_recommendations=10, latest=True)
print("Top Movies for User: {}".format(user_id))
print(recommendations)

Top Movies for User: 2341
                                  movie_names    rating
13                           Gladiator (2000)  4.097442
9                          Matrix, The (1999)  4.142182
14                          Braveheart (1995)  4.096634
2            Shawshank Redemption, The (1994)  4.306787
7                        Fugitive, The (1993)  4.170188
16                    Schindler's List (1993)  4.085686
4           Terminator 2: Judgment Day (1991)  4.238146
19           Hunt for Red October, The (1990)  4.053246
15  Indiana Jones and the Last Crusade (1989)  4.085819
5                             Die Hard (1988)  4.237933


In [None]:
# Send a free movie coupon code to a subset of users in the customer base,
# each for the newest movie among their top-predicted candidates.
coupon_list = []

# get list of distinct users and choose sample size
sample_size = 1000
allUsers = list(movie_ratings_df['userId'].unique())
# sample() raises ValueError when asked for more items than the population
# holds, so cap the request at the number of distinct users.
# NOTE: no seed is set in this cell, so a different subset is drawn on each run.
luckyUsers = sample(allUsers, min(sample_size, len(allUsers)))

for lucky_user_id in luckyUsers:
    # Use a dedicated name so the loop does not overwrite the `recommendations`
    # frame produced by the single-user example.
    top_pick = rec_top_n_movies(user_id=lucky_user_id, num_recommendations=1, latest=True)
    if top_pick.empty:
        # Nothing left to recommend for this user (no unrated movies); skip them.
        continue
    coupon_list.append({
        'userId': str(lucky_user_id),
        'movie_names': top_pick["movie_names"].iloc[0],
        'rating': top_pick["rating"].iloc[0],
    })

# Passing the column names keeps the expected columns even if no coupons were generated.
coupon_df = pd.DataFrame(coupon_list, columns=['userId', 'movie_names', 'rating'])
print(coupon_df.head(100))

   userId              movie_names    rating
0    4289  Sixth Sense, The (1999)  4.989783
1    4308       Matrix, The (1999)  4.132553
2     718            Tarzan (1999)  4.510834
3    5184   Iron Giant, The (1999)  5.000000
4    3882        Roger & Me (1989)  4.744297
..    ...                      ...       ...
95   5876   Green Mile, The (1999)  4.324783
96   4888     High Fidelity (2000)  4.899386
97   5733             42 Up (1998)  5.000000
98   5854         Gladiator (2000)  4.586431
99   3629       Matrix, The (1999)  4.636219

[100 rows x 3 columns]


In [None]:
!pip install -q arize
from arize.pandas.logger import Client, Schema
from arize.utils.types import ModelTypes, Environments

# Placeholder credentials: replace both values with your own Arize space key and
# API key before running. Avoid committing real keys with the notebook.
SPACE_KEY = "SPACE_KEY"
API_KEY = "API_KEY"

# Client used further down to log the production data.
arize_client = Client(space_key=SPACE_KEY, api_key=API_KEY)

# Identifiers under which the logged records are sent.
model_id = "collaborative_filtering_movie_example"
model_version = "1.0"
model_type = ModelTypes.NUMERIC

# Fail early if either placeholder above was left unchanged.
if SPACE_KEY == "SPACE_KEY" or API_KEY == "API_KEY":
    raise ValueError("❌ NEED TO CHANGE SPACE AND/OR API_KEY")
else:
    print("Step 2 ✅: Import and Setup Arize Client Done! Now we can start using Arize!")

Step 2 ✅: Import and Setup Arize Client Done! Now we can start using Arize!


In [None]:
import uuid
from datetime import datetime, timedelta

# Prediction ID is required for all datasets
def generate_prediction_ids(X):
    """Create one random UUID4 string per row of ``X``.

    Args:
        X: A pandas object with a length and an ``index`` (Series or DataFrame).

    Returns:
        A Series of UUID strings that shares ``X``'s index.
    """
    fresh_ids = [str(uuid.uuid4()) for _ in range(len(X))]
    return pd.Series(fresh_ids, index=X.index)


# OPTIONAL: We can directly specify when inferences were made
def simulate_production_timestamps(X, days=30):
    """Spread simulated prediction timestamps evenly over the last ``days`` days.

    Args:
        X: A pandas object with a length and an ``index`` (Series or DataFrame).
        days: Size of the look-back window, in days, ending now.

    Returns:
        A Series sharing ``X``'s index whose values are epoch timestamps in
        seconds (floats), evenly spaced from ``days`` days ago (first row)
        up to the current time (last row).
    """
    now = datetime.now()
    latest_ts = now.timestamp()
    earliest_ts = (now - timedelta(days=days)).timestamp()
    return pd.Series(np.linspace(earliest_ts, latest_ts, num=len(X)), index=X.index)

In [None]:
# let's simulate some labeled data and assume all of the free coupons were used and users rated the movie they watched
def bound(value):
    """Clamp ``value`` to the rating range 1 to 5 (inclusive)."""
    # Cap at the top of the scale first, then raise to the bottom of the scale.
    capped = value if value < 5 else 5
    return capped if capped > 1 else 1

# Simulated "actual" ratings: the predicted rating plus standard-normal noise,
# rounded to the nearest whole star and clamped to the 1-5 scale.
# A fixed seed keeps the simulated labels identical across re-runs.
SIMULATION_SEED = 42
rng = np.random.default_rng(SIMULATION_SEED)

# One noise draw per coupon, computed for all rows at once instead of looping
# over the frame row by row.
rating_noise = rng.normal(loc=0, scale=1, size=len(coupon_df))
noisy_ratings = coupon_df['rating'].to_numpy() + rating_noise

# np.rint rounds halves to the nearest even integer, the same rule as Python's round().
actuals = pd.Series(np.clip(np.rint(noisy_ratings), 1, 5).astype(int))

In [None]:
# Assemble data for logging
production_dataset = coupon_df.join(
    pd.DataFrame(
        {
            "prediction_id": generate_prediction_ids(coupon_df['userId']),
            "prediction_ts": simulate_production_timestamps(coupon_df['userId']),
            "actual_rating": actuals
        }
    )
)
print(production_dataset.head(100))

   userId              movie_names    rating  \
0    4289  Sixth Sense, The (1999)  4.989783   
1    4308       Matrix, The (1999)  4.132553   
2     718            Tarzan (1999)  4.510834   
3    5184   Iron Giant, The (1999)  5.000000   
4    3882        Roger & Me (1989)  4.744297   
..    ...                      ...       ...   
95   5876   Green Mile, The (1999)  4.324783   
96   4888     High Fidelity (2000)  4.899386   
97   5733             42 Up (1998)  5.000000   
98   5854         Gladiator (2000)  4.586431   
99   3629       Matrix, The (1999)  4.636219   

                           prediction_id  prediction_ts  actual_rating  
0   8c704db6-b4ec-4b11-8d23-ae018e9188f2   1.653711e+09              5  
1   c4cf7cf5-78e3-4875-bd34-63da992b1ecd   1.653713e+09              4  
2   42ad585e-f86d-4cd5-b892-9668bd1199eb   1.653716e+09              3  
3   6e40cbc3-402f-4048-93bc-5b332c82c172   1.653719e+09              4  
4   6b970b43-fd53-4ad7-9f32-c3d59ae3bb71   1.653721e+09   

In [None]:
# log production data Arize
# Tell Arize which column of production_dataset plays which role.
# The predicted "rating" column is used as both the prediction score and the
# prediction label, and the simulated "actual_rating" column as both the actual
# score and the actual label.
production_schema = Schema(
    prediction_id_column_name="prediction_id",  # REQUIRED
    timestamp_column_name="prediction_ts",
    prediction_score_column_name="rating",
    prediction_label_column_name="rating",
    actual_score_column_name="actual_rating",
    actual_label_column_name= "actual_rating",
    feature_column_names=["userId", "movie_names"],
)



# arize_client.log returns a Response object from Python's requests module
# The whole production_dataset is sent in one call, tagged with the model id,
# version and type defined during client setup, as PRODUCTION environment data.
response = arize_client.log(
    dataframe=production_dataset,
    schema=production_schema,
    model_id=model_id,
    model_version=model_version,
    model_type=model_type,
    environment=Environments.PRODUCTION,
)

# If successful, the server will return a status_code of 200
# Any other status is reported together with the response body; no exception is raised here.
if response.status_code != 200:
    print(
        f"❌ logging failed with response code {response.status_code}, {response.text}"
    )
else:
    print(
        f"Step 3 ✅: You have successfully logged {len(production_dataset)} data points to Arize!"
    )

Success! Check out your data at https://app.arize.com/organizations/QWNjb3VudE9yZ2FuaXphdGlvbjoxNTY=/spaces/U3BhY2U6MTU2/models/modelName/collaborative_filtering_movie_example?selectedTab=dataIngestion
Step 3 ✅: You have successfully logged 1000 data points to Arize!
