# Collaborative filtering with user based approach

In [1]:
import pandas as pd
import numpy as np

## Load Data

In [2]:
ratings_df = pd.read_csv('../dataset/ratings.csv')

In [3]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [4]:
# user-item matrix
user_item_matrix = ratings_df.pivot_table(index='userId', columns='movieId', values='rating')

In [5]:
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


## (b) Implement the user-based collaborative filtering approach, using the Pearson correlation function for computing similarities between users

## Person Correlation

$$sim(a, b) = \frac{\sum_{p \in P} (r_{a,p} - \bar{r_a})(r_{b,p} - \bar{r_b})}{\sqrt{\sum_{p \in P}(r_{a,p} - \bar{r_a})^2}\sqrt{\sum_{p \in P}(r_{b,p} - \bar{r_b})^2}}$$

In [6]:
from utils.similarity_metrics import sim_itr, pearson_correlation

In [7]:
def find_similar_users_with_pearson_sim(user_id, user_item_matrix, num_similar_users):
    # User ratings
    user_ratings = user_item_matrix.loc[user_id]
    #Compute the Pearson correlation between the target user and all other users
    correlations = user_item_matrix.apply(lambda row: pearson_correlation(user_ratings, row), axis=1)
    #correlations = correlations.drop(user_id)
    #Select the num_similar_users most similar users 
    similar_users = correlations.nlargest(num_similar_users)

    return similar_users

In [8]:
def find_similar_users_with_ITR_sim(user_id, user_item_matrix, num_similar_users):
    # User ratings
    user_ratings = user_item_matrix.loc[user_id]
    #Compute the Pearson correlation between the target user and all other users
    correlations = user_item_matrix.apply(lambda row: sim_itr(user_ratings, row), axis=1)
    #correlations = correlations.drop(user_id)
    #Select the num_similar_users most similar users 
    similar_users = correlations.nlargest(num_similar_users)

    return similar_users

## (c) Implement the prediction function presented in class for predicting movies scores.

## Prediction function version 1

$$pred(a,p)=\bar{r_a} + \frac{\sum_{b \in N}sim(a,b)*(r_{b,p}-\bar{r_b})}{\sum_{b \in N}sim(a,b)}$$

## Prediction function version 2

$$pred(a,p)=\bar{r_a} + \frac{\sum_{b \in N}sim(a,b)*(r_{b,p}-\bar{r_b})}{\sum_{b \in N}|sim(a,b)|}$$

In [9]:
from utils.predict_ratings import predict_rating,predict_rating_with_abs

## (d) Select a user from the dataset, and for this user, show the 10 most similar users and the 10 most relevant movies that the recommender suggests.

## Calculate the 10 users most similar to a given user (we consider user 1)

In [10]:
most_similar_users_ps = find_similar_users_with_pearson_sim(1, user_item_matrix,10)
most_similar_users_ps

userId
77     1.00000
1      1.00000
12     1.00000
85     1.00000
253    1.00000
291    1.00000
358    1.00000
388    1.00000
2      1.00000
146    0.99905
dtype: float64

In [11]:
most_similar_users_itr = find_similar_users_with_ITR_sim(1, user_item_matrix,10)
most_similar_users_itr

userId
1      0.500000
135    0.208966
220    0.194920
186    0.165334
282    0.157513
382    0.156432
119    0.155264
522    0.154891
562    0.154257
265    0.152466
dtype: float64

## Calculate similarity for each pair of users

In [12]:
from utils.similarity_metrics import compute_user_similarity_with_pearson_correlation_all_users,compute_user_similarity_with_ITR_all_users

In [13]:
# PEARSON CORRELATION
similarities_pearson_correlation = compute_user_similarity_with_pearson_correlation_all_users(user_item_matrix)

In [14]:
# ITR CORRELATION
similarities_itr = compute_user_similarity_with_ITR_all_users(user_item_matrix)

## EXAMPLE--> Prediction version 1 (without abs)

In [15]:
# Prediction pearson correlation
predicted_ratings = predict_rating(1,45,user_item_matrix,similarities_pearson_correlation,0)
print(predicted_ratings)

4.307273090618198


In [16]:
# Prediction ITR correlation
predicted_ratings = predict_rating(1,45,user_item_matrix,similarities_itr,0)
print(predicted_ratings)

3.779633924500209


In [17]:
# Prediction pearson correlation based on the most similar users (500)
predicted_ratings = predict_rating(1,45,user_item_matrix,similarities_pearson_correlation,1)
print(predicted_ratings)

4.202119635000858


In [18]:
# Prediction ITR correlation based on the most similar users (500)
predicted_ratings = predict_rating(1,45,user_item_matrix,similarities_itr,1)
print(predicted_ratings)

3.779633924500209


## EXAMPLE--> Prediction version 2 (with abs)

In [19]:
# Prediction pearson correlation
predicted_ratings = predict_rating_with_abs(1,45 ,user_item_matrix,similarities_pearson_correlation,0)
print(predicted_ratings)

4.344632581467062


In [20]:
# Prediction ITR correlation
predicted_ratings = predict_rating_with_abs(1,45 ,user_item_matrix,similarities_itr,0)
print(predicted_ratings)

3.779633924500209


In [21]:
# Prediction pearson correlation based on the most similar users (500)
predicted_ratings = predict_rating_with_abs(1,45,user_item_matrix,similarities_pearson_correlation,1)
print(predicted_ratings)

4.2314844582719235


In [22]:
# Prediction ITR correlation based on the most similar users (500)
predicted_ratings = predict_rating_with_abs(1,45,user_item_matrix,similarities_itr,1)
print(predicted_ratings)

3.779633924500209


## Show the 10 most relevant movies that the recommender system suggests

In [23]:
def top_recommended_movies(user_id, user_item_matrix, user_similarity, num_recommendations, most_similar):
    # Inizializza una lista per memorizzare i rating previsti per i film non valutati dall'utente
    predicted_ratings = []

    # Ottieni gli ID dei film che l'utente non ha ancora valutato
    unrated_movies = user_item_matrix.columns[user_item_matrix.loc[user_id].isnull()]

    # Calcola i rating previsti per ogni film non valutato dall'utente
    for item_id in unrated_movies:
        predicted_rating = predict_rating(user_id, item_id, user_item_matrix, user_similarity,most_similar)
        predicted_ratings.append((item_id, predicted_rating))

    # Ordina i rating previsti in ordine decrescente e seleziona i primi 10 film
    top_recommendations = sorted(predicted_ratings, key=lambda x: x[1], reverse=True)[:num_recommendations]

    return top_recommendations

In [24]:
def top_recommended_movies_abs(user_id, user_item_matrix, user_similarity, num_recommendations,most_similar):
    # Inizializza una lista per memorizzare i rating previsti per i film non valutati dall'utente
    predicted_ratings = []

    # Ottieni gli ID dei film che l'utente non ha ancora valutato
    unrated_movies = user_item_matrix.columns[user_item_matrix.loc[user_id].isnull()]

    # Calcola i rating previsti per ogni film non valutato dall'utente
    for item_id in unrated_movies:
        predicted_rating = predict_rating_with_abs(user_id, item_id, user_item_matrix, user_similarity,most_similar)
        predicted_ratings.append((item_id, predicted_rating))

    # Ordina i rating previsti in ordine decrescente e seleziona i primi 10 film
    top_recommendations = sorted(predicted_ratings, key=lambda x: x[1], reverse=True)[:num_recommendations]

    return top_recommendations

In [None]:
user_id = 1

In [25]:
recommendations = top_recommended_movies(user_id, user_item_matrix, similarities_pearson_correlation,10,0)
print("Top 10 movie recommended for the user ", user_id, ":")
for idx, (movie_id, predicted_rating) in enumerate(recommendations, start=1):
    print(f"{idx}. Movie ID: {movie_id}, Rating: {predicted_rating}")


Top 10 movie recommended for the user  1 :
1. Movie ID: 2149, Rating: 741.9235272304575
2. Movie ID: 112175, Rating: 594.2060362311458
3. Movie ID: 7937, Rating: 110.9038139020465
4. Movie ID: 1572, Rating: 110.4038139020465
5. Movie ID: 7820, Rating: 73.76696876551378
6. Movie ID: 2506, Rating: 49.328585517638814
7. Movie ID: 93721, Rating: 40.87049968693897
8. Movie ID: 167018, Rating: 39.81059426475158
9. Movie ID: 494, Rating: 37.15884434387721
10. Movie ID: 8405, Rating: 37.130123628981075


In [26]:
recommendations = top_recommended_movies_abs(user_id, user_item_matrix, similarities_pearson_correlation,10,0)
print("Top 10 movie recommended for the user ", user_id, ":")
for idx, (movie_id, predicted_rating) in enumerate(recommendations, start=1):
    print(f"{idx}. Movie ID: {movie_id}, Rating: {predicted_rating}")

Top 10 movie recommended for the user  1 :
1. Movie ID: 5105, Rating: 7.7909482758620685
2. Movie ID: 6967, Rating: 7.7909482758620685
3. Movie ID: 7114, Rating: 7.7909482758620685
4. Movie ID: 7742, Rating: 7.7909482758620685
5. Movie ID: 175475, Rating: 7.571603190941842
6. Movie ID: 184641, Rating: 7.571603190941842
7. Movie ID: 168712, Rating: 7.462533156498673
8. Movie ID: 3604, Rating: 7.3393522833178
9. Movie ID: 97024, Rating: 7.144950738916256
10. Movie ID: 40491, Rating: 7.120924764890281


In [27]:
recommendations = top_recommended_movies(user_id, user_item_matrix, similarities_pearson_correlation,10,1)
print("Top 10 movies recommended for user", user_id, ":")
for idx, (movie_id, predicted_rating) in enumerate(recommendations, start=1):
    print(f"{idx}. Movie ID: {movie_id}, Rating: {predicted_rating}")

Top 10 movies recommended for user 1 :
1. Movie ID: 4863, Rating: 21.284233286094768
2. Movie ID: 4466, Rating: 18.31042084194599
3. Movie ID: 302, Rating: 16.911547326967884
4. Movie ID: 1428, Rating: 13.48934049475781
5. Movie ID: 725, Rating: 12.974513706486515
6. Movie ID: 645, Rating: 10.820103131460371
7. Movie ID: 7883, Rating: 10.188643352865974
8. Movie ID: 8482, Rating: 10.188643352865974
9. Movie ID: 8521, Rating: 10.188643352865974
10. Movie ID: 4794, Rating: 9.215199701824469


In [28]:
recommendations = top_recommended_movies_abs(user_id, user_item_matrix, similarities_pearson_correlation,10,1)
print("Top 10 movies recommended for user", user_id, ":")
for idx, (movie_id, predicted_rating) in enumerate(recommendations, start=1):
    print(f"{idx}. Movie ID: {movie_id}, Rating: {predicted_rating}")

Top 10 movies recommended for user 1 :
1. Movie ID: 4517, Rating: 7.7909482758620685
2. Movie ID: 5105, Rating: 7.7909482758620685
3. Movie ID: 6967, Rating: 7.7909482758620685
4. Movie ID: 7114, Rating: 7.7909482758620685
5. Movie ID: 7742, Rating: 7.7909482758620685
6. Movie ID: 97024, Rating: 7.144950738916256
7. Movie ID: 40491, Rating: 7.120924764890281
8. Movie ID: 5746, Rating: 6.930481874447391
9. Movie ID: 5919, Rating: 6.930481874447391
10. Movie ID: 6835, Rating: 6.930481874447391


## (e) Design and implement a new similarity function for computing similarities between
## users. Explain why this similarity function is useful for the collaborative filtering approach.
## Hint: Exploiting ideas from related works are highly encouraged.

### In the paper: "Similarity measures for Collaborative Filtering-based Recommender Systems" ITR provides the best result on MovieLens datasets. This is the main reason that led me to choose to implement this metric.

##

In [29]:
similarities_ITR_correlation = similarities_itr
recommendations = top_recommended_movies(user_id, user_item_matrix, similarities_ITR_correlation,10,0)
print("Top 10 film raccomandati per l'utente", user_id, ":")
for idx, (movie_id, predicted_rating) in enumerate(recommendations, start=1):
    print(f"{idx}. Film ID: {movie_id}, Rating previsto: {predicted_rating}")

Top 10 film raccomandati per l'utente 1 :
1. Film ID: 6818, Rating previsto: 7.120924764890281
2. Film ID: 8477, Rating previsto: 7.120924764890281
3. Film ID: 40491, Rating previsto: 7.120924764890281
4. Film ID: 148881, Rating previsto: 7.120922747003081
5. Film ID: 3266, Rating previsto: 7.120099543444331
6. Film ID: 99764, Rating previsto: 7.111497128791824
7. Film ID: 5746, Rating previsto: 6.930481874447391
8. Film ID: 6835, Rating previsto: 6.930481874447391
9. Film ID: 7991, Rating previsto: 6.930477171065139
10. Film ID: 2851, Rating previsto: 6.930381115085179


In [30]:
similarities_ITR_correlation = similarities_itr
recommendations = top_recommended_movies(user_id, user_item_matrix, similarities_ITR_correlation,10,1)
print("Top 10 film raccomandati per l'utente", user_id, ":")
for idx, (movie_id, predicted_rating) in enumerate(recommendations, start=1):
    print(f"{idx}. Film ID: {movie_id}, Rating previsto: {predicted_rating}")

Top 10 film raccomandati per l'utente 1 :
1. Film ID: 6818, Rating previsto: 7.120924764890281
2. Film ID: 8477, Rating previsto: 7.120924764890281
3. Film ID: 40491, Rating previsto: 7.120924764890281
4. Film ID: 148881, Rating previsto: 7.120922747003081
5. Film ID: 3266, Rating previsto: 7.120099543444331
6. Film ID: 99764, Rating previsto: 7.111497128791824
7. Film ID: 5746, Rating previsto: 6.930481874447391
8. Film ID: 6835, Rating previsto: 6.930481874447391
9. Film ID: 7991, Rating previsto: 6.930477171065139
10. Film ID: 2851, Rating previsto: 6.930381115085179
