In [1]:
import numpy as np
import pandas as pd

from similarity import pearson_similarity, manhattan_similarity
from predict import recommend_movies

In [2]:
# Constants - tunable knobs shared by both similarity experiments below.
# Number of most-similar users kept as the neighbourhood of the input user.
MAX_NEIGHBORS = 50              # heuristic: ~ 2*np.sqrt(num_users)
# Forwarded to recommend_movies as max_recommendations
# (presumably an upper bound on the movies returned - confirm in predict.py).
MAX_RECOMMENDATIONS = 100       # heuristic: ~ np.sqrt(num_movies)

# Preprocessing

In [3]:
# Load the two raw tables; paths are relative to the notebook's working directory.
ratings = pd.read_csv('./datasets/ratings.csv')
movies = pd.read_csv('./datasets/movies.csv')

In [4]:
# Preview the first rows of the ratings table (one row per user/movie rating).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Preview the first rows of the movies table (movieId, title, genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Build the dense user x movie rating matrix.
#   rows    -> every userId that appears in `ratings` (first-appearance order)
#   columns -> every movieId listed in `movies` (file order), rated or not
#   values  -> the rating as float32, NaN where the user has not rated the movie
user_ids = ratings['userId'].unique().tolist()
movie_ids = movies['movieId'].unique().tolist()

# Vectorised construction: a single pivot replaces a Python loop that performed
# three `ratings.iloc[i]` row lookups plus one scalar `.at` write per rating.
matrix = (
    ratings
    # pivot() raises on duplicate (userId, movieId) pairs; keeping the last
    # occurrence mirrors the "last write wins" behaviour of a row-by-row fill.
    .drop_duplicates(subset=['userId', 'movieId'], keep='last')
    .pivot(index='userId', columns='movieId', values='rating')
    # Align to the full user/movie universe: adds all-NaN columns for unrated
    # movies and fixes row/column order.
    # NOTE(review): ratings whose movieId is missing from movies.csv are dropped
    # here - presumably there are none; verify if the datasets change.
    .reindex(index=user_ids, columns=movie_ids)
    .astype(np.float32)
)
# pivot() labels the axes 'userId' / 'movieId'; clear the labels so the frame
# looks like one built directly from plain index/columns lists.
matrix.index.name = None
matrix.columns.name = None

print("Matrix Shape:", matrix.shape)

Matrix Shape: (610, 9742)


In [7]:
# Preview the first rows of the user x movie matrix; NaN marks a movie the user has not rated.
matrix.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Lookup table from movieId (index) to its title (single 'title' column).
movie_map = pd.DataFrame(
    {'title': movies['title'].values},
    index=movies['movieId'].values,
)
movie_map.head()

Unnamed: 0,title
1,Toy Story (1995)
2,Jumanji (1995)
3,Grumpier Old Men (1995)
4,Waiting to Exhale (1995)
5,Father of the Bride Part II (1995)


In [9]:
# The user we generate recommendations for.
input_user = 4
# Every user in the matrix except the input user: the candidate neighbours.
other_users = matrix.index[matrix.index != input_user].tolist()

# First Experiment $\rightarrow$ Pearson Similarity

$$\text{pearson-sim}(i,x)= \displaystyle\frac{\sum_{p\in P}\left[(r_{i,p}-\overline{r}_i)\cdot(r_{x,p}-\overline{r}_x)\right]}{\sqrt{\sum_{p\in P}(r_{i,p}-\overline{r}_i)^2}\cdot\sqrt{\sum_{p\in P}(r_{x,p}-\overline{r}_x)^2}}$$

In [10]:
# user id -> Pearson similarity with the input user (filled in the next cell).
p_similarities = {}

In [11]:
# Score every candidate neighbour against the input user with Pearson similarity.
p_similarities.update(
    (other, pearson_similarity(matrix, input_user, other))
    for other in other_users
)

In [12]:
# Order neighbours by descending similarity and keep only the best MAX_NEIGHBORS.
p_similarities = dict(
    sorted(p_similarities.items(), key=lambda pair: pair[1], reverse=True)[:MAX_NEIGHBORS]
)

print(f"TOP 10 SIMILAR USERS TO USER {input_user}" + "\n")
# The dict is already sorted, so its first ten entries are the ten closest users.
for u, s in list(p_similarities.items())[:10]:
    print(f"Similarity between user {input_user} and user {u}: {s:.5f}")

TOP 10 SIMILAR USERS TO USER 4

Similarity between user 4 and user 245: 1.00000
Similarity between user 4 and user 556: 1.00000
Similarity between user 4 and user 107: 1.00000
Similarity between user 4 and user 158: 1.00000
Similarity between user 4 and user 291: 1.00000
Similarity between user 4 and user 431: 1.00000
Similarity between user 4 and user 537: 1.00000
Similarity between user 4 and user 544: 0.99661
Similarity between user 4 and user 396: 0.98722
Similarity between user 4 and user 378: 0.98545


In [13]:
# Ask the project helper for recommendations based on the Pearson neighbourhood,
# then order the returned mapping by descending score.
# NOTE(review): the keys print as movie titles, presumably resolved via movie_map - confirm in predict.py.
recommendations = recommend_movies(
    matrix,
    input_user,
    p_similarities,
    max_recommendations=MAX_RECOMMENDATIONS,
    movie_map=movie_map,
)
recommendations = dict(
    sorted(recommendations.items(), key=lambda pair: pair[1], reverse=True)
)

In [14]:
print(f"TOP 25 RECOMMENDED MOVIES TO USER {input_user}" + "\n")

# `recommendations` is sorted by score, so the first 25 entries are the best 25.
for movie, score in list(recommendations.items())[:25]:
    print(f"Movie -> {movie} | Score -> {score:.5f}")

TOP 25 RECOMMENDED MOVIES TO USER 4

Movie -> Amityville Horror, The (1979) | Score -> 5.98413
Movie -> Omen, The (1976) | Score -> 5.98413
Movie -> Plan 9 from Outer Space (1959) | Score -> 5.98413
Movie -> Pet Sematary (1989) | Score -> 5.98413
Movie -> Fright Night (1985) | Score -> 5.98413
Movie -> Legend of Drunken Master, The (Jui kuen II) (1994) | Score -> 5.41270
Movie -> Léon: The Professional (a.k.a. The Professional) (Léon) (1994) | Score -> 5.41270
Movie -> Cool Hand Luke (1967) | Score -> 5.41270
Movie -> Naked Gun 2 1/2: The Smell of Fear, The (1991) | Score -> 5.32529
Movie -> For a Few Dollars More (Per qualche dollaro in più) (1965) | Score -> 5.32367
Movie -> Troy (2004) | Score -> 5.27698
Movie -> First Daughter (2004) | Score -> 5.27698
Movie -> Education, An (2009) | Score -> 5.27698
Movie -> Intouchables (2011) | Score -> 5.27698
Movie -> Skyfall (2012) | Score -> 5.27698
Movie -> Spectre (2015) | Score -> 5.27698
Movie -> The Intern (2015) | Score -> 5.27698
Movi

# Second Experiment $\rightarrow$ Manhattan Similarity

$$\text{manhattan-sim}(i,x)= \displaystyle\frac{1}{1+\sum_{p\in P}(|r_{i,p}-r_{x,p}|)}$$

In [15]:
# user id -> Manhattan similarity with the input user (filled in the next cell).
m_similarities = {}

In [16]:
# Score every candidate neighbour against the input user with Manhattan similarity.
m_similarities.update(
    (other, manhattan_similarity(matrix, input_user, other))
    for other in other_users
)

In [17]:
# Order neighbours by descending similarity and keep only the best MAX_NEIGHBORS.
m_similarities = dict(
    sorted(m_similarities.items(), key=lambda pair: pair[1], reverse=True)[:MAX_NEIGHBORS]
)

print(f"TOP 10 SIMILAR USERS TO USER {input_user}" + "\n")
# The dict is already sorted, so its first ten entries are the ten closest users.
for u, s in list(m_similarities.items())[:10]:
    print(f"Similarity between user {input_user} and user {u}: {s:.5f}")

TOP 10 SIMILAR USERS TO USER 4

Similarity between user 4 and user 53: 1.00000
Similarity between user 4 and user 92: 1.00000
Similarity between user 4 and user 175: 1.00000
Similarity between user 4 and user 252: 1.00000
Similarity between user 4 and user 320: 1.00000
Similarity between user 4 and user 333: 1.00000
Similarity between user 4 and user 341: 1.00000
Similarity between user 4 and user 506: 1.00000
Similarity between user 4 and user 578: 1.00000
Similarity between user 4 and user 158: 0.66667


In [18]:
# Ask the project helper for recommendations based on the Manhattan neighbourhood,
# then order the returned mapping by descending score.
# NOTE(review): this rebinds `recommendations`, replacing the Pearson results from the first experiment.
recommendations = recommend_movies(
    matrix,
    input_user,
    m_similarities,
    max_recommendations=MAX_RECOMMENDATIONS,
    movie_map=movie_map,
)
recommendations = dict(
    sorted(recommendations.items(), key=lambda pair: pair[1], reverse=True)
)

In [19]:
print(f"TOP 25 RECOMMENDED MOVIES TO USER {input_user}" + "\n")
# `recommendations` is sorted by score, so the first 25 entries are the best 25.
for movie, score in list(recommendations.items())[:25]:
    print(f"Movie -> {movie} | Score -> {score:.5f}")

TOP 25 RECOMMENDED MOVIES TO USER 4

Movie -> Night of the Living Dead (1968) | Score -> 5.98413
Movie -> Amityville Horror, The (1979) | Score -> 5.98413
Movie -> Omen, The (1976) | Score -> 5.98413
Movie -> Scream (1996) | Score -> 5.98413
Movie -> Plan 9 from Outer Space (1959) | Score -> 5.98413
Movie -> Pet Sematary (1989) | Score -> 5.98413
Movie -> Fright Night (1985) | Score -> 5.98413
Movie -> Moonstruck (1987) | Score -> 5.98413
Movie -> Bossa Nova (2000) | Score -> 5.91556
Movie -> True Romance (1993) | Score -> 5.83056
Movie -> One Flew Over the Cuckoo's Nest (1975) | Score -> 5.51942
Movie -> 3-Iron (Bin-jip) (2004) | Score -> 5.41493
Movie -> Léon: The Professional (a.k.a. The Professional) (Léon) (1994) | Score -> 5.41270
Movie -> Legend of Drunken Master, The (Jui kuen II) (1994) | Score -> 5.41270
Movie -> Walk to Remember, A (2002) | Score -> 5.29946
Movie -> Tristan & Isolde (2006) | Score -> 5.29946
Movie -> Citizen Kane (1941) | Score -> 5.29469
Movie -> Adaptation