In [1]:
# Mount Google Drive inside the Colab runtime so files stored there are reachable.
from google.colab import drive
drive.mount("/content/gdrive")
# Make the project folder the working directory: the data loaders below read
# from relative `data/ml-100k/...` paths, which resolve against this folder.
%cd /content/gdrive/MyDrive/Colab Notebooks/14 - Recommender system

Mounted at /content/gdrive
/content/gdrive/MyDrive/Colab Notebooks/14 - Recommender system


# Data Loading

In [5]:
import random
import numpy as np
import pandas as pd
from typing import *
from IPython.display import display, HTML, Markdown
import warnings
warnings.filterwarnings('ignore')

In [14]:
def display_best_and_worse_recommendations(recommendations: pd.DataFrame):
    """Render the 10 highest and the 10 lowest predicted ratings as two tables.

    Args:
        recommendations: Frame with exactly two columns, in this order: the
            predicted rating (named 'Estimated Prediction') and the movie
            title. The frame is left untouched.

    Both tables are ordered from highest to lowest prediction. With fewer
    than 20 rows the two tables overlap.
    """
    # Bind IPython's renderer locally: later cells in this notebook define
    # their own functions called `display`, which replace the global name.
    from IPython.display import HTML, display as render

    # sort_values returns a new frame, so the caller's data keeps its order.
    ranked = recommendations.sort_values('Estimated Prediction', ascending=False)

    # Copy each slice before relabelling so the new headers are set on an
    # independent frame rather than on a slice of `ranked`.
    top_recommendations = ranked.iloc[:10].copy()
    top_recommendations.columns = ['Prediction (sorted by best)', 'Movie Title']
    worse_recommendations = ranked.iloc[-10:].copy()
    worse_recommendations.columns = ['Prediction (sorted by worse)', 'Movie Title']

    render(HTML("<h1>Recommendations your user will love</h1>"))
    render(top_recommendations)
    render(HTML("<h1>Recommendations your user will hate</h1>"))
    render(worse_recommendations)
    
def load_movies_dataset(path: str = 'data/ml-100k/u.item') -> pd.DataFrame:
    """Load the MovieLens movie catalogue into a DataFrame indexed by movie_id.

    Args:
        path: Location of the pipe-separated, header-less, ISO-8859-1 encoded
            item file. Defaults to the copy under ``data/ml-100k``.

    Returns:
        One row per movie, indexed by ``movie_id``, with the title, release
        dates, url and one column per genre name. ``release_date`` is
        converted to datetime.
    """
    descriptive_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'url']
    genre_columns = [
        'unknown', 'Action', 'Adventure', 'Animation', "Children's",
        'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
        'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
        'War', 'Western',
    ]

    movie_data = pd.read_csv(
        path,
        sep='|',
        encoding="ISO-8859-1",
        header=None,  # the file has no header row; names are supplied here
        names=descriptive_columns + genre_columns,
        index_col='movie_id',
    )
    movie_data['release_date'] = pd.to_datetime(movie_data['release_date'])
    return movie_data

def load_ratings(path: str = 'data/ml-100k/u.data') -> pd.DataFrame:
    """Load the MovieLens ratings file.

    Args:
        path: Location of the tab-separated, header-less ratings file.
            Defaults to the copy under ``data/ml-100k``.

    Returns:
        A frame with the columns ``user_id``, ``movie_id`` and ``rating``;
        the file's fourth column (timestamp) is read and then dropped.
    """
    ratings_data = pd.read_csv(
        path,
        sep='\t',
        encoding="ISO-8859-1",
        header=None,  # the file has no header row; names are supplied here
        names=['user_id', 'movie_id', 'rating', 'timestamp'],
    )
    return ratings_data[['user_id', 'movie_id', 'rating']]

def load_movielens(random_state: Optional[int] = None) -> pd.DataFrame:
    """Build the shuffled (user_id, movie_title, rating) table used for training.

    Ratings are joined to the movie catalogue on ``movie_id`` so each rating
    carries the movie title, and user ids are rewritten as ``"User <id>"``.

    Args:
        random_state: Seed forwarded to ``DataFrame.sample`` for the final
            shuffle. The default ``None`` keeps the shuffle unseeded, so the
            row order differs between calls.

    Returns:
        A frame with the columns ``user_id``, ``movie_title`` and ``rating``,
        in shuffled row order with the pre-shuffle index labels retained.
    """
    ratings = load_ratings()
    movies = load_movies_dataset()

    # assign() builds a new frame instead of writing into the column subset
    # that load_ratings returned.
    labelled = ratings.assign(user_id=ratings['user_id'].map(lambda k: f"User {k}"))

    ratings_and_movies = (
        labelled
        .set_index('movie_id')
        .join(movies['title'])  # the catalogue is indexed by movie_id
        .reset_index()
        .rename(columns={'title': 'movie_title'})
    )
    return ratings_and_movies[['user_id', 'movie_title', 'rating']].sample(
        frac=1, random_state=random_state
    )

In [15]:
# Build the (user_id, movie_title, rating) table; load_movielens returns its rows shuffled.
movielens_df: pd.DataFrame = load_movielens()
movielens_df.head(5)

Unnamed: 0,user_id,movie_title,rating
95470,User 239,Flirting With Disaster (1996),5
62859,User 887,James and the Giant Peach (1996),4
25884,User 933,"Clockwork Orange, A (1971)",5
36830,User 269,Raising Arizona (1987),5
40568,User 436,"Hunt for Red October, The (1990)",3


In [16]:
# Remove movies with few ratings
MIN_RATINGS_PER_MOVIE = 50  # a movie is kept only if it has MORE than this many ratings

# Number of ratings each movie received, indexed by movie title.
ratings_per_movie = movielens_df.groupby('movie_title').size()
valid_movies = ratings_per_movie[ratings_per_movie > MIN_RATINGS_PER_MOVIE]

# Keep only the ratings of sufficiently rated movies, renumber the rows and
# shuffle them. Filtering with isin() avoids joining on the counts and then
# deleting the helper column by its implicit label.
movie_ratings = (
    movielens_df[movielens_df['movie_title'].isin(valid_movies.index)]
    .reset_index(drop=True)
    .sample(frac=1)
)
movielens_df = movie_ratings
# Last expression of the cell, so the preview is actually rendered.
movielens_df.head(5)

# SVD Training with Surprise

In [None]:
pip install surprise

In [19]:
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split

In [22]:
# A reader tells Surprise the lower and upper bound of the rating scale (1 to 5 here).
reader = Reader(rating_scale=(1, 5))
# Wrap the (user_id, movie_title, rating) DataFrame in a Surprise Dataset.
data = Dataset.load_from_df(movielens_df, reader)
# Hold out 25% of the ratings as a test set.
# NOTE(review): the next cell rebuilds reader/data and re-splits with
# test_size=.01, so this split is overwritten before the model is fitted.
trainset, testset = train_test_split(data, test_size=.25)

In [23]:
# Rebuild the dataset and split again, this time holding out only 1% of the
# ratings. These values replace reader/data/trainset/testset from the previous
# cell; this trainset is the one the model is fitted on.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(movielens_df, reader)
trainset, testset = train_test_split(data, test_size=.01)

In [24]:
# Fit an SVD model with 100 latent factors on the training split.
# NOTE(review): no random seed is passed here or to the split above, so
# results presumably differ between runs - confirm whether that is acceptable.
model = SVD(n_factors=100)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f1bddebaf10>

In [25]:
# Normalization
# Squared L2 norm of the first item vector before normalizing. This value is
# computed but not shown: only the cell's last expression is rendered.
pd.DataFrame(model.qi).iloc[0].pow(2).sum()
# Divide every row of model.qi by its own L2 norm, in place.
# NOTE(review): this overwrites the fitted item factors, so any later code
# that reads model.qi - presumably including model.predict - sees the
# normalized values. Confirm that this is intended.
model.qi /= np.linalg.norm(model.qi, ord=2, axis=1).reshape(-1, 1)
# Same check after normalizing; the first row should now have squared norm ~1.0.
pd.DataFrame(model.qi).iloc[0].pow(2).sum()

1.0000000000000002

# Evaluation

Surprise's SVD stores the item (product) latent-factor matrix in the `model.qi` attribute, with one row per item.

In [26]:
# Shape of the item matrix: (number of items, number of latent factors).
model.qi.shape

(596, 100)

In [27]:
def display(df: Dict[Any, int]):
    """Preview the first five entries of an item -> row-index mapping as a table.

    NOTE: this name replaces the `display` imported from IPython.display at
    the top of the notebook for every cell that runs afterwards.

    Args:
        df: Mapping from an item's raw id (here the movie title) to its row
            index in ``model.qi``. Despite the parameter name it is a
            dict-like object, not a DataFrame.

    Returns:
        A DataFrame indexed by 'Movie name' with one 'model.qi row idx'
        column, limited to the first five entries of the mapping.
    """
    # Read the mapping from the argument rather than from a notebook global,
    # so the function works for whatever mapping it is handed.
    item_to_row_idx_df = pd.DataFrame(
        list(df.items()),
        columns=['Movie name', 'model.qi row idx'],
    ).set_index('Movie name')
    return item_to_row_idx_df.head(5)

In [28]:
# Mapping from each item's raw id (here the movie title) to its row index in
# model.qi. `_raw2inner_id_items` is an underscore-prefixed attribute of the
# trainset, i.e. private by convention.
item_to_row_idx: Dict[Any, int] = model.trainset._raw2inner_id_items
display(item_to_row_idx)

Unnamed: 0_level_0,model.qi row idx
Movie name,Unnamed: 1_level_1
One Flew Over the Cuckoo's Nest (1975),0
Little Women (1994),1
Psycho (1960),2
"Christmas Carol, A (1938)",3
"Time to Kill, A (1996)",4


In [29]:
# Row of model.qi that belongs to Toy Story.
toy_story_row_idx : int = item_to_row_idx['Toy Story (1995)']

In [30]:
# The latent-feature vector the model holds for Toy Story.
model.qi[toy_story_row_idx]

array([-0.06274457, -0.14538326,  0.01785439,  0.01504469, -0.09774657,
       -0.12790422, -0.08524039,  0.03636335, -0.03776705, -0.08064767,
        0.03497614,  0.13893852,  0.07759735,  0.02020369, -0.06261343,
       -0.16228053,  0.15669325, -0.09713786,  0.06702559,  0.02062371,
        0.03657148,  0.10487977, -0.04839718,  0.05808868,  0.1583687 ,
        0.1576421 ,  0.10470761,  0.21773035, -0.10556477, -0.08734027,
       -0.24724256, -0.03598854,  0.1254    ,  0.11590666, -0.06103651,
       -0.09511706,  0.00527985, -0.14583881, -0.16645895, -0.04667747,
       -0.01329513,  0.07679354, -0.09954746,  0.12538007,  0.01487529,
       -0.04974565,  0.00196067, -0.02406448, -0.03335389, -0.08242389,
        0.05339436,  0.15309549,  0.13530726, -0.09910058,  0.06634623,
        0.04115311, -0.13431643,  0.00604299,  0.15142587,  0.02153177,
        0.07199174,  0.00956673,  0.06965009,  0.11301199,  0.24408217,
       -0.01073376, -0.03446417, -0.17406774, -0.08087413,  0.04

In [31]:
# The length of one row of model.qi is the number of latent features per product.
print(f"Every product has {model.qi[toy_story_row_idx].shape[0]} features")

Every product has 100 features


# Recommendations via Product based CF: Finding similarity between vectors

Two products are "similar" when the cosine distance between their vectors is close to 0.

In [32]:
from scipy.spatial.distance import cosine

def get_vector_by_movie_title(movie_title: str, trained_model: SVD) -> np.ndarray:
    """Return the latent-feature vector of a movie as a numpy array.

    Args:
        movie_title: Raw item id of the movie, i.e. its title exactly as it
            appears in the ratings data.
        trained_model: A fitted model exposing ``trainset`` and ``qi``.

    Returns:
        The row of ``trained_model.qi`` that belongs to the movie.

    Raises:
        KeyError: If the title is not an item of the model's training set.
    """
    title_to_row = trained_model.trainset._raw2inner_id_items
    try:
        movie_row_idx = title_to_row[movie_title]
    except KeyError:
        # Same exception type as the plain lookup, but with a message that
        # says what is missing and where it was looked up.
        raise KeyError(
            f"{movie_title!r} is not an item in the model's training set"
        ) from None
    return trained_model.qi[movie_row_idx]

def cosine_distance(vector_a: np.ndarray, vector_b: np.ndarray) -> float:
    """Returns the cosine distance between two vectors.

    This is a distance, not a similarity: the smaller the value, the more
    alike the two vectors are. The computation is delegated to
    scipy.spatial.distance.cosine.
    """
    return cosine(vector_a, vector_b)

In [33]:
# Fetch the vectors of "Toy Story" and "Wizard of Oz"
toy_story_vec = get_vector_by_movie_title('Toy Story (1995)', model)
wizard_of_oz_vec = get_vector_by_movie_title('Wizard of Oz, The (1939)', model)

# Calculate the cosine distance between the vectors. The smaller the number,
# the more similar the two movies are.
# NOTE: despite its name, `similarity_score` holds a distance, not a similarity.
similarity_score = cosine_distance(toy_story_vec, wizard_of_oz_vec)
similarity_score

0.8547674495722327

# Recommendations via Matrix Reconstruction

Predict a rating for any combination of a user and a product.

In [34]:
# Refresher: the ratings DataFrame, one row per (user, movie) rating.
movielens_df.head(2)

Unnamed: 0,user_id,movie_title,rating
7883,User 712,Bedknobs and Broomsticks (1971),4
3301,User 493,Amadeus (1984),4


In [35]:
# Ask the model for the rating of one (user, product) pair. The estimate is
# the `est` field of the returned Prediction.
a_user = "User 196"
a_product = "Toy Story (1995)"
model.predict(a_user, a_product)

Prediction(uid='User 196', iid='Toy Story (1995)', r_ui=None, est=4.087596452419607, details={'was_impossible': False})

# Recommendations via Item Similarity

- Clustering: find clusters of items that are similar (ex. Amazon)
- Item-based collaborative filtering

In [36]:
# Fetch the model.qi row indices for Star Wars, Return of the Jedi and Aladdin
starwars_idx = model.trainset._raw2inner_id_items['Star Wars (1977)']
roj_idx = model.trainset._raw2inner_id_items['Return of the Jedi (1983)']
aladdin_idx = model.trainset._raw2inner_id_items['Aladdin (1992)']

# Get the latent vector of each of the three movies
starwars_vector = model.qi[starwars_idx]
return_of_jedi_vector = model.qi[roj_idx]
aladdin_vector = model.qi[aladdin_idx]

In [37]:
# Cosine distance between Star Wars and Return of the Jedi (smaller = more similar)
cosine_distance(starwars_vector, return_of_jedi_vector)

0.33197445791803326

In [38]:
# Cosine distance between Star Wars and Aladdin (smaller = more similar)
cosine_distance(starwars_vector, aladdin_vector)

0.8425335686741092

In [39]:
def display(similarity_table, top_n: int = 4):
    """Return the closest entries of a (distance, title) table as a DataFrame.

    NOTE: this definition replaces any `display` defined or imported earlier
    in the notebook for every cell that runs afterwards.

    Args:
        similarity_table: Iterable of ``(cosine distance, movie title)`` pairs.
        top_n: Number of rows to keep after sorting. Defaults to 4, the
            limit that was previously hard-coded.

    Returns:
        A DataFrame with the columns 'vector cosine distance' and
        'movie title', sorted by ascending distance and cut to the first
        ``top_n`` rows. Index labels are the positions in the input.
    """
    similarity_table = pd.DataFrame(
        similarity_table,
        columns=['vector cosine distance', 'movie title']
    ).sort_values('vector cosine distance', ascending=True)
    return similarity_table.iloc[:top_n]

# Finding similar movies by ranking

In [40]:
def get_top_similarities(movie_title: str, model: SVD):
    """Rank every movie known to the model by cosine distance to `movie_title`.

    A (distance, title) pair is computed for each title in the model's
    training set (the query movie included). The pairs are sorted in
    ascending order and handed to `display`, whose return value is returned
    unchanged.
    """
    reference_vector = get_vector_by_movie_title(movie_title, model)

    # One (distance, title) tuple per movie the model knows about.
    scored_titles = [
        (
            cosine_distance(
                get_vector_by_movie_title(candidate_title, model),
                reference_vector,
            ),
            candidate_title,
        )
        for candidate_title in model.trainset._raw2inner_id_items.keys()
    ]

    # Smallest distance (closest match) first; equal distances fall back to
    # title order because whole tuples are compared.
    scored_titles.sort()
    return display(scored_titles)

In [41]:
# Movies whose vectors are closest to Star Wars; the query movie itself is listed first.
get_top_similarities('Star Wars (1977)', model)

Unnamed: 0,vector cosine distance,movie title
0,0.0,Star Wars (1977)
1,0.327088,"Empire Strikes Back, The (1980)"
2,0.331974,Return of the Jedi (1983)
3,0.534467,Raiders of the Lost Ark (1981)


In [42]:
# Movies whose vectors are closest to Pulp Fiction; the query movie itself is listed first.
get_top_similarities('Pulp Fiction (1994)', model)

Unnamed: 0,vector cosine distance,movie title
0,0.0,Pulp Fiction (1994)
1,0.601109,"Grifters, The (1990)"
2,0.601181,Taxi Driver (1976)
3,0.634143,GoodFellas (1990)
