<a href="https://colab.research.google.com/github/AISaturdaysLagos/cohort7_practicals/blob/main/11__Recommender_Systems/rec_sys_lab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Introduction to Recommender Systems
This is a practical introduction to recommender systems. Our focus will be on building a Matrix Factorization model.

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split

### Data
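We use the MovieLens dataset: `ratings.csv` holds the ratings users gave to movies, and `movies.csv` holds each movie's title and genres.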

In [None]:
# Load the user ratings and the movie metadata (title, genres) directly from the hosted CSV files.
ratings = pd.read_csv("https://s3-us-west-2.amazonaws.com/recommender-tutorial/ratings.csv")
movies = pd.read_csv("https://s3-us-west-2.amazonaws.com/recommender-tutorial/movies.csv")

In [None]:
ratings.shape, movies.shape

((100836, 4), (9742, 3))

In [None]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
user_one = ratings.loc[ratings['userId'] == 1]

In [None]:
user_one.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
user_one.shape

(232, 4)

In [None]:
def get_user_genres(ratings, movies, user_id):
    '''
        Returns a Counter object showing genres user has watched

        Parameters:
            ratings: DataFrame with at least 'userId' and 'movieId' columns
            movies: DataFrame with at least 'movieId' and 'genres' columns,
                    where 'genres' is a '|'-separated string
            user_id: the userId whose rated movies are counted

        Rated movies whose movieId is not present in `movies` are skipped
        instead of raising an IndexError. A user with no ratings yields an
        empty Counter.
    '''
    # One genres string per movieId; if an id is listed more than once keep the first row
    genres_by_movie = movies.drop_duplicates(subset='movieId').set_index('movieId')['genres']

    user_movie_ids = ratings.loc[ratings['userId'] == user_id, 'movieId']

    # Single vectorised lookup instead of scanning `movies` once per rating;
    # ids without a match (or without genres) come back as NaN and are dropped
    user_genres = user_movie_ids.map(genres_by_movie).dropna()

    return Counter(
        genre
        for genre_list in user_genres
        for genre in genre_list.split('|')
    )

In [None]:
get_user_genres(ratings, movies, 1).most_common()

[('Action', 90),
 ('Adventure', 85),
 ('Comedy', 83),
 ('Drama', 68),
 ('Thriller', 55),
 ('Fantasy', 47),
 ('Crime', 45),
 ('Children', 42),
 ('Sci-Fi', 40),
 ('Animation', 29),
 ('Romance', 26),
 ('War', 22),
 ('Musical', 22),
 ('Mystery', 18),
 ('Horror', 17),
 ('Western', 7),
 ('Film-Noir', 1)]

In [None]:
get_user_genres(ratings, movies, 5).most_common()

[('Drama', 25),
 ('Comedy', 15),
 ('Crime', 12),
 ('Romance', 11),
 ('Children', 9),
 ('Thriller', 9),
 ('Action', 9),
 ('Adventure', 8),
 ('Fantasy', 7),
 ('Animation', 6),
 ('Musical', 5),
 ('War', 3),
 ('IMAX', 3),
 ('Western', 2),
 ('Sci-Fi', 2),
 ('Mystery', 1),
 ('Horror', 1)]

## Matrix Factorization
<img src="https://developers.google.com/machine-learning/recommendation/images/Matrixfactor.svg">

In [None]:
# Build the user x movie rating matrix: one row per userId, one column per movieId.
# User/movie pairs without a rating are filled with 0.
user_rating_matrix = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
user_rating_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Singular Value Decomposition

In [None]:
# Hold out 1% of the interactions (stratified by user) as the test set.
# The training split is kept as `ratings_train` because get_movies_seen_by_user()
# reads it; discarding it made the notebook fail with a NameError on a fresh kernel.
ratings_train, ratings_test = train_test_split(ratings, stratify=ratings['userId'], test_size=0.01, random_state=42)

print('# interactions on Test set: %d' % len(ratings_test))

# interactions on Test set: 1009


In [None]:
# Overwrite the held-out (test) ratings with 0 so the model cannot see the values we want to predict.
# NOTE: this modifies `ratings` in place; re-run the data loading cell to get the original ratings back.
ratings.loc[ratings_test.index, 'rating'] = 0

In [None]:
# ratings_test.index holds index labels, so select by label with .loc
# (.iloc only gave the same rows because `ratings` has a default 0..n-1 index)
ratings.loc[ratings_test.index]

Unnamed: 0,userId,movieId,rating,timestamp
94026,599,5361,0.0,1519345226
16905,105,163925,0.0,1526208027
62778,414,1290,0.0,961514069
82391,522,89745,0.0,1388124779
55931,369,2539,0.0,1237082050
...,...,...,...,...
23948,166,2947,0.0,1189038129
17100,109,122,0.0,841109220
10079,66,1247,0.0,1093143995
10743,68,2338,0.0,1269122622


In [None]:
ratings_train_pivot = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)

In [None]:
ratings_train_pivot.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Per-user mean over ALL movie columns of the zero-filled pivot, so unrated movies count as 0 in the mean.
user_ratings_mean = np.mean(ratings_train_pivot.values, axis = 1)
# Centre each user's row by subtracting that user's mean (reshape to a column so it broadcasts per row).
ratings_train_matrix = ratings_train_pivot.values - user_ratings_mean.reshape(-1, 1)


# Truncated SVD keeping k = 15 singular values and their singular vectors.
U, sigma, Vt = svds(ratings_train_matrix, k = 15)

In [None]:
U.shape, Vt.shape

((610, 15), (15, 9724))

In [None]:
# Rebuild the low-rank approximation U * diag(sigma) * Vt and add back the per-user mean removed before the SVD.
predicted_ratings = np.dot(np.dot(U, np.diag(sigma)), Vt) + user_ratings_mean.reshape(-1, 1)

In [None]:
predicted_ratings = pd.DataFrame(predicted_ratings, columns = ratings_train_pivot.columns, index=ratings_train_pivot.index)
predicted_ratings.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.10448,0.961633,0.907879,0.010339,0.225166,1.112192,0.23157,0.028891,0.147899,1.571824,...,-0.002799,-0.001709,-0.003889,-0.003889,-0.002799,-0.003889,-0.002799,-0.002799,-0.002799,0.00483
2,0.217966,-0.03257,-0.011916,0.00466,0.023943,0.04074,0.012383,-0.013506,0.034726,-0.046487,...,0.013271,0.011961,0.014581,0.014581,0.013271,0.014581,0.013271,0.013271,0.013271,0.004103
3,0.054041,0.00793,0.017894,0.007109,-0.005888,0.034701,0.006563,0.011759,0.013164,0.022063,...,0.007966,0.007948,0.007984,0.007984,0.007966,0.007984,0.007966,0.007966,0.007966,0.00784
4,1.823606,0.191886,0.090925,0.046798,0.141628,0.518515,0.239999,-0.053092,-0.035893,-0.195736,...,-0.000913,-0.000568,-0.001258,-0.001258,-0.000913,-0.001258,-0.000913,-0.000913,-0.000913,0.0015
5,1.033936,0.768794,0.318519,0.112752,0.49534,0.641021,0.513965,0.128655,0.050766,0.945401,...,-0.003556,-0.003523,-0.00359,-0.00359,-0.003556,-0.00359,-0.003556,-0.003556,-0.003556,-0.003321
6,2.687415,2.740912,1.47396,0.413415,1.824009,1.741707,2.019106,0.513771,0.348225,3.552369,...,0.042435,0.04204,0.04283,0.04283,0.042435,0.04283,0.042435,0.042435,0.042435,0.039672
7,1.447702,0.678672,0.027718,-0.042532,0.050973,0.762468,0.012826,-0.033807,-0.001919,0.830505,...,-0.009722,-0.009141,-0.010303,-0.010303,-0.009722,-0.010303,-0.009722,-0.009722,-0.009722,-0.005653
8,1.434177,0.973836,0.443296,0.147661,0.66375,0.82922,0.675803,0.16918,0.077602,1.299896,...,-5.4e-05,2e-06,-0.000111,-0.000111,-5.4e-05,-0.000111,-5.4e-05,-5.4e-05,-5.4e-05,0.000342
9,0.207377,0.089351,-0.006241,-0.011827,-0.033083,0.032706,-0.0482,-0.023001,-0.007347,0.034445,...,0.002268,0.002297,0.00224,0.00224,0.002268,0.00224,0.002268,0.002268,0.002268,0.002467
10,0.892218,0.377996,-0.044727,0.021633,0.202763,-0.238078,0.17829,-0.034571,-0.014196,-0.081438,...,0.025996,0.024599,0.027392,0.027392,0.025996,0.027392,0.025996,0.025996,0.025996,0.016219


In [None]:
def get_movies_seen_by_user(user_id):
    '''
        Returns the 'movieId' values of the rows belonging to user_id

        Reads the module-level `ratings_train` DataFrame, which must be
        defined before this function is called.
    '''
    belongs_to_user = ratings_train['userId'] == user_id
    return ratings_train.loc[belongs_to_user, 'movieId']

In [None]:
def get_top_recommendations(model, user_id, cutoff_rating, top_n=10):
    '''
        Returns up to top_n [title, genres] pairs recommended for a user

        Parameters:
            model: DataFrame of predicted ratings, indexed by userId with
                   one column per movieId
            user_id: the user to recommend for
            cutoff_rating: only movies predicted at or above this value
                           are considered
            top_n: maximum number of recommendations returned (default 10)

        Movies the user has already seen (per get_movies_seen_by_user) are
        excluded. The rest are ordered by predicted rating, highest first.
        Movie ids that have no row in `movies` are skipped.
    '''
    user_scores = model.loc[user_id]
    candidates = user_scores[user_scores >= cutoff_rating]

    # Filter with a list: indexing a Series with a set is rejected by pandas 2.x
    seen = set(get_movies_seen_by_user(user_id))
    unseen_ids = [movie_id for movie_id in candidates.index if movie_id not in seen]

    # mergesort is stable, so ties keep the model's column order and the result is deterministic
    ranked_ids = candidates.loc[unseen_ids].sort_values(ascending=False, kind='mergesort').index

    recommendations = []
    for movie_id in ranked_ids:
        # Stop as soon as we have enough instead of looking up every candidate
        if len(recommendations) >= top_n:
            break
        movie = movies.loc[movies['movieId'] == movie_id]
        if movie.empty:
            continue
        recommendations.append([movie.iloc[0]['title'], movie.iloc[0]['genres']])
    return recommendations

In [None]:
get_top_recommendations(predicted_ratings, 1, 2)

[['Terminator 2: Judgment Day (1991)', 'Action|Sci-Fi'],
 ['Sixth Sense, The (1999)', 'Drama|Horror|Mystery'],
 ['Die Hard (1988)', 'Action|Crime|Thriller'],
 ['Aliens (1986)', 'Action|Adventure|Horror|Sci-Fi'],
 ['Godfather, The (1972)', 'Crime|Drama'],
 ['Shawshank Redemption, The (1994)', 'Crime|Drama'],
 ['Twelve Monkeys (a.k.a. 12 Monkeys) (1995)', 'Mystery|Sci-Fi|Thriller'],
 ['Blade Runner (1982)', 'Action|Sci-Fi|Thriller'],
 ['Jaws (1975)', 'Action|Horror'],
 ['Star Trek: First Contact (1996)', 'Action|Adventure|Sci-Fi|Thriller']]

## Metric
Average Precision @ k

In [None]:
def apk(actual, predicted, k=10):
    '''
        Average precision at k for a single list of recommendations

        Parameters:
            actual: collection of relevant (good) items; order is irrelevant
            predicted: ranked list of recommended items, best first
            k: only the first k predictions are scored

        Returns:
            The mean of precision@i over the ranks i (1-based, within the
            first k predictions) at which a relevant item appears, or 0.0
            if none of the first k predictions is relevant.

        The score is normalised by the number of hits, so misses that come
        after the last hit do not lower it.
    '''
    relevant = set(actual)
    already_counted = set()

    score = 0.0
    hits = 0
    # Iterate over the predictions themselves so a short `predicted` list cannot cause an IndexError
    for rank, item in enumerate(predicted[:k], start=1):
        # A hit is a relevant item, wherever it sits in `actual`; repeats are counted once
        if item in relevant and item not in already_counted:
            already_counted.add(item)
            hits += 1
            score += hits / rank

    if hits == 0:
        return 0.0
    return score / hits

##### NOTE: as implemented here, AP@k only penalises a list of recommendations when a bad recommendation appears *BEFORE* a good one; bad recommendations that come after the last good one do not lower the score.

In [None]:
y_true = [10, 21, 33, 41, 5]
y_scores = [10, 21, 33, 41, 5]
apk(y_true, y_scores, 5)

1.0

In [None]:
y_true = [10, 21, 33, 41, 5]
y_scores = [10, 21, 22, 32, 5]
apk(y_true, y_scores, 5)

0.8666666666666667

In [None]:
y_true = [10, 21, 33, 41, 5]
y_scores = [10, 21, 33, 25, 65]
apk(y_true, y_scores, 5)

1.0

Notice that the second list of recommendations scored 0.87 because the bad recommendations 22 and 32 appeared in the list before the good recommendation 5. Meanwhile, the third list scored 1.0 because all of its good recommendations [10, 21, 33] appeared before the bad recommendations [25, 65].