In [1]:
import numpy as np
import pandas as pd

from preprocessing import get_matrix, get_movie_map
from similarity import pearson_similarity, weighted_pearson_similarity, norm_weights
from predict import recommend_movies

In [2]:
# How many of the most similar users are kept as neighbours.
MAX_NEIGHBORS = 10
# Upper bound passed to recommend_movies as max_recommendations.
MAX_RECOMMENDATIONS = 10

In [3]:
# Dataset Preprocessing
# Load the two raw CSV files (ratings first, then movies).
ratings = pd.read_csv('./datasets/ratings.csv')
movies = pd.read_csv('./datasets/movies.csv')

# Derive the structures used by the rest of the notebook.
# NOTE(review): the rows of `matrix` are treated as users further down
# (its index is used as the list of user ids) -- confirm against get_matrix.
matrix = get_matrix(ratings, movies)
movie_map = get_movie_map(movies)

In [4]:
# Pick the input user at random among the users actually present in the matrix.
# A random *position* in the index is drawn instead of a number in 1..N, so the
# selection stays valid even when user ids are not contiguous or do not start
# at 1 (e.g. after users have been filtered out during preprocessing).
# NOTE: the draw is not seeded, so a different user is selected on each run.
user_ids = matrix.index.tolist()
input_user = user_ids[np.random.randint(len(user_ids))]

# Every remaining user is a candidate neighbour.
other_users = [u for u in user_ids if u != input_user]

print("Input User =", input_user)

Input User = 293


# First Execution $\rightarrow$ "Base" Pearson Similarity

In [5]:
# Similarity Computation
# Score every other user against the input user.
similarities = {
    other: pearson_similarity(matrix, input_user, other)
    for other in other_users
}

# Rank by similarity (highest first) and keep only the top MAX_NEIGHBORS entries.
similarities = dict(
    sorted(similarities.items(), key=lambda pair: pair[1], reverse=True)[:MAX_NEIGHBORS]
)

The "pearson_similarity" function computes the similarity between the Input User and the Other Users by exploiting this formula...
$$sim(i,o) = \displaystyle\frac{\sum_{p \in P}[(r_{i,p}-\overline{r}_i)(r_{o,p}-\overline{r}_o)]}{\sqrt{\sum_{p\in P}(r_{i,p}-\overline{r}_i)^2}\cdot\sqrt{\sum_{p \in P}(r_{o,p}-\overline{r}_o)^2}}$$

...where:
- "i": Input User
- "o": Other User
- "P": Set of Movies co-rated by "i" and "o"
- $r_{i,p}$ and $r_{o,p}$: ratings given by the Input User and the Other User to the movie "p"
- $\overline{r}_i$, $\overline{r}_o$: mean value for the ratings (Input User and Other User)

In [6]:
# List the retained neighbours in the order stored in `similarities`.
print("Most Similar Users:")
for neighbor, sim in similarities.items():
    print(f"User {neighbor} : Similarity = {sim:.5f}")

Most Similar Users:
User 8 : Similarity = 1.00000
User 9 : Similarity = 1.00000
User 29 : Similarity = 1.00000
User 36 : Similarity = 1.00000
User 58 : Similarity = 1.00000
User 65 : Similarity = 1.00000
User 80 : Similarity = 1.00000
User 81 : Similarity = 1.00000
User 83 : Similarity = 1.00000
User 97 : Similarity = 1.00000


In [7]:
# Recommendations
# Ask for at most MAX_RECOMMENDATIONS movies for the input user, based on the
# neighbours selected with the base Pearson similarity.
recommendations = recommend_movies(
    matrix,
    input_user,
    similarities,
    max_recommendations=MAX_RECOMMENDATIONS,
    movie_map=movie_map,
)

In [8]:
# Build the results table in a single constructor call from the (movie, score)
# pairs. Appending rows one at a time with `.loc[len(df)]` re-allocates the
# frame on every insertion and starts from object-typed columns; constructing
# it once is linear and lets pandas infer a numeric dtype for 'Score'.
df_recommendations = pd.DataFrame(
    list(recommendations.items()),
    columns=['Movie', 'Score'],
)

display(df_recommendations)

Unnamed: 0,Movie,Score
0,Waking Ned Devine (a.k.a. Waking Ned) (1998),4.485714
1,Never Cry Wolf (1983),4.485714
2,Almost Famous (2000),4.485714
3,"Producers, The (1968)",4.358178
4,Austin Powers in Goldmember (2002),4.358178
5,Adaptation (2002),4.358178
6,Roman Holiday (1953),4.309726
7,Amadeus (1984),4.309726
8,Fantasia 2000 (1999),4.309726
9,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",4.309726


# Second Execution $\rightarrow$ "Weighted" Pearson-Similarity

In [9]:
# Similarities
# Per-user weights, looked up by user id below.
norm_scores = norm_weights(input_user, matrix)

# Score every other user against the input user, passing along that user's
# weight (None when norm_scores has no entry for the user).
weighted_similarities = {
    other: weighted_pearson_similarity(matrix, input_user, other, norm_scores.get(other))
    for other in other_users
}

# Rank by weighted similarity (highest first) and keep the top MAX_NEIGHBORS entries.
weighted_similarities = dict(
    sorted(weighted_similarities.items(), key=lambda pair: pair[1], reverse=True)[:MAX_NEIGHBORS]
)

The "weighted_pearson_similarity" function computes the similarity between the Input User and the Other Users by exploiting this formula...
$$sim(i,o) = \text{norm\textunderscore score}\cdot\displaystyle\frac{\sum_{p \in P}[(r_{i,p}-\overline{r}_i)(r_{o,p}-\overline{r}_o)]}{\sqrt{\sum_{p\in P}(r_{i,p}-\overline{r}_i)^2}\cdot\sqrt{\sum_{p \in P}(r_{o,p}-\overline{r}_o)^2}}$$

...where:
- "i": Input User
- "o": Other User
- "P": Set of Movies co-rated by "i" and "o"
- $r_{i,p}$ and $r_{o,p}$: ratings given by the Input User and the Other User to the movie "p"
- $\overline{r}_i$, $\overline{r}_o$: mean value for the ratings (Input User and Other User)

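The "norm_score" is obtained by counting how many co-ratings exist for the Input User "i" and a given Other User "o", and dividing that count by the total number of co-ratings between "i" and every user "x" in the set "O" of Other Users.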
$$\text{norm\textunderscore score}(i,o) = \displaystyle\frac{coratings\text{\textunderscore }number(i,o)}{\sum_{x\in O}coratings\text{\textunderscore }number(i,x)}$$

In [10]:
# List the retained neighbours in the order stored in `weighted_similarities`.
print("Most Similar Users:")
for neighbor, sim in weighted_similarities.items():
    print(f"User {neighbor} : Similarity = {sim:.5f}")

Most Similar Users:
User 608 : Similarity = 0.00649
User 599 : Similarity = 0.00607
User 414 : Similarity = 0.00485
User 307 : Similarity = 0.00451
User 249 : Similarity = 0.00447
User 600 : Similarity = 0.00444
User 68 : Similarity = 0.00416
User 103 : Similarity = 0.00384
User 480 : Similarity = 0.00377
User 610 : Similarity = 0.00375


In [11]:
# Recommendations
# Same call as in the first execution, but driven by the neighbours selected
# with the weighted Pearson similarity.
weighted_recommendations = recommend_movies(
    matrix,
    input_user,
    weighted_similarities,
    max_recommendations=MAX_RECOMMENDATIONS,
    movie_map=movie_map,
)

In [12]:
# Build the results table in a single constructor call from the (movie, score)
# pairs. Appending rows one at a time with `.loc[len(df)]` re-allocates the
# frame on every insertion and starts from object-typed columns; constructing
# it once is linear and lets pandas infer a numeric dtype for 'Score'.
df_weighted_recommendations = pd.DataFrame(
    list(weighted_recommendations.items()),
    columns=['Movie', 'Score'],
)

display(df_weighted_recommendations)

Unnamed: 0,Movie,Score
0,Yojimbo (1961),4.976998
1,Double Indemnity (1944),4.976998
2,Nashville (1975),4.627567
3,Sullivan's Travels (1941),4.627567
4,Unfaithfully Yours (1948),4.627567
5,"Topo, El (1970)",4.627567
6,"Holy Mountain, The (Montaña sagrada, La) (1973)",4.627567
7,"Cat Returns, The (Neko no ongaeshi) (2002)",4.627567
8,Love and Death (1975),4.497309
9,Key Largo (1948),4.476998
