# Collaborative filtering with user based approach

In [215]:
import pandas as pd
import numpy as np
from tabulate import tabulate
import csv

## Load Data

In [5]:
ratings_df = pd.read_csv('../dataset/ratings.csv')

In [6]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [7]:
# user-item matrix
user_item_matrix = ratings_df.pivot_table(index='userId', columns='movieId', values='rating')

In [8]:
user_item_matrix.to_csv("user_item_matrix.csv")

In [9]:
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


## (b) Implement the user-based collaborative filtering approach, using the Pearson correlation function for computing similarities between users

## Person Correlation

$$sim(a, b) = \frac{\sum_{p \in P} (r_{a,p} - \bar{r_a})(r_{b,p} - \bar{r_b})}{\sqrt{\sum_{p \in P}(r_{a,p} - \bar{r_a})^2}\sqrt{\sum_{p \in P}(r_{b,p} - \bar{r_b})^2}}$$

In [10]:
# Pearson correlation
def pearson_correlation(user1_ratings, user2_ratings):
    # Exclude NaN values
    common_movies = user1_ratings.dropna().index.intersection(user2_ratings.dropna().index)
    if len(common_movies) == 0:
        return 0  # No common movies, return 0 correlation
    else:
        user1_common_ratings = user1_ratings[common_movies]
        user2_common_ratings = user2_ratings[common_movies]
        # Compute mean ratings
        mean_user1 = user1_ratings.mean()
        mean_user2 = user2_ratings.mean()
        # Compute numerator and denominators
        numerator = ((user1_common_ratings - mean_user1) * (user2_common_ratings - mean_user2)).sum()
        denominator1 = np.sqrt(((user1_common_ratings - mean_user1)**2).sum())
        denominator2 = np.sqrt(((user2_common_ratings - mean_user2)**2).sum())
        # Handle division by zero
        if denominator1 == 0 or denominator2 == 0:
            return 0
        else:
            return numerator / (denominator1 * denominator2)

In [99]:
def find_similar_users_with_pearson_sim(user_id, user_item_matrix, num_similar_users):
    # User ratings
    user_ratings = user_item_matrix.loc[user_id]
    #Compute the Pearson correlation between the target user and all other users
    correlations = user_item_matrix.apply(lambda row: pearson_correlation(user_ratings, row), axis=1)
    #correlations = correlations.drop(user_id)
    #Select the num_similar_users most similar users 
    similar_users = correlations.nlargest(num_similar_users)

    return similar_users

In [100]:
def find_similar_users_with_ITR_sim(user_id, user_item_matrix, num_similar_users):
    # User ratings
    user_ratings = user_item_matrix.loc[user_id]
    #Compute the Pearson correlation between the target user and all other users
    correlations = user_item_matrix.apply(lambda row: sim_itr(user_ratings, row), axis=1)
    #correlations = correlations.drop(user_id)
    #Select the num_similar_users most similar users 
    similar_users = correlations.nlargest(num_similar_users)

    return similar_users

## Compute pearson correlation with P = set of all users

In [13]:
def compute_user_similarity_with_pearson_correlation_all_users(user_item_matrix):
    # Inizializza la matrice di similarità
    user_similarity = {}

    # Itera su tutti gli utenti
    for user_id_a in user_item_matrix.index:
        user_similarity[user_id_a] = {}
        for user_id_b in user_item_matrix.index:
            # Calcola la similarità tra gli utenti a e b
            user_ratings_1 = user_item_matrix.loc[user_id_a]
            user_ratings_2 = user_item_matrix.loc[user_id_b]
            
            similarity = pearson_correlation(user_ratings_1, user_ratings_2)
            user_similarity[user_id_a][user_id_b] = similarity

    return user_similarity


In [38]:
def compute_user_similarity_with_ITR_all_users(user_item_matrix):
    # Inizializza la matrice di similarità
    user_similarity = {}

    # Itera su tutti gli utenti
    for user_id_a in user_item_matrix.index:
        user_similarity[user_id_a] = {}
        for user_id_b in user_item_matrix.index:
            # Calcola la similarità tra gli utenti a e b
            user_ratings_1 = user_item_matrix.loc[user_id_a]
            user_ratings_2 = user_item_matrix.loc[user_id_b]
            
            similarity = sim_itr(user_ratings_1, user_ratings_2)
            user_similarity[user_id_a][user_id_b] = similarity

    return user_similarity

## (c) Implement the prediction function presented in class for predicting movies scores.

## Prediction function version 1

$$pred(a,p)=\bar{r_a} + \frac{\sum_{b \in N}sim(a,b)*(r_{b,p}-\bar{r_b})}{\sum_{b \in N}sim(a,b)}$$

In [193]:
# Version 1
# most_similar = 0 --> consider all users
# most_similar = 1 --> consider only the most similar users

def predict_rating(user_id, item_id, user_item_matrix,user_similarity,most_similar):
    # Calcola la media dei rating dell'utente a
    mean_rating_a = user_item_matrix.loc[user_id].mean()

    # Inizializza il numeratore e il denominatore della formula
    numerator = 0
    denominator = 0
    dict_intern = user_similarity[user_id]

    if(most_similar == 1):
        #sort values
        dict_intern = {k: v for k, v in sorted(dict_intern.items(), key=lambda item: item[1], reverse=True)}
        users = list(dict_intern.keys())
        users = users[:500]
    else:
        users = dict_intern.keys()
    # Itera su tutti gli utenti
    for other_user_id in users:
        # Verifica se l'utente b ha effettivamente valutato l'item
        if not pd.isnull(user_item_matrix.loc[other_user_id, item_id]):
            # Calcola la media dei rating dell'utente b
            mean_rating_b = user_item_matrix.loc[other_user_id].mean()
            # Calcola il rating di b per l'item p
            rating_b_p = user_item_matrix.loc[other_user_id, item_id]
            # Aggiorna il numeratore e il denominatore
            similarity = dict_intern[other_user_id]
            numerator += similarity * (rating_b_p - mean_rating_b)
            denominator += similarity
    # Calcola il rating predetto
    if denominator != 0:
        predicted_rating = mean_rating_a + (numerator / denominator)
    else:
        predicted_rating = mean_rating_a  # Evita la divisione per zero


    return predicted_rating


## Prediction function version 2

$$pred(a,p)=\bar{r_a} + \frac{\sum_{b \in N}sim(a,b)*(r_{b,p}-\bar{r_b})}{\sum_{b \in N}|sim(a,b)|}$$

In [194]:
# Version 2 (abs at the denominator):
# most_similar = 0 --> consider all users
# most_similar = 1 --> consider only the most similar users

def predict_rating_with_abs(user_id, item_id, user_item_matrix,user_similarity,most_similar):
    # Calcola la media dei rating dell'utente a
    mean_rating_a = user_item_matrix.loc[user_id].mean()

    # Inizializza il numeratore e il denominatore della formula
    numerator = 0
    denominator = 0
    dict_intern = user_similarity[user_id]

    if(most_similar == 1):
        #sort for value
        dict_intern = {k: v for k, v in sorted(dict_intern.items(), key=lambda item: item[1], reverse=True)}
        users = list(dict_intern.keys())
        users = users[:500]
    else:
        users = dict_intern.keys()
    # Itera su tutti gli utenti
    for other_user_id in users:
        # Verifica se l'utente b ha effettivamente valutato l'item
        if not pd.isnull(user_item_matrix.loc[other_user_id, item_id]):
            # Calcola la media dei rating dell'utente b
            mean_rating_b = user_item_matrix.loc[other_user_id].mean()
            # Calcola il rating di b per l'item p
            rating_b_p = user_item_matrix.loc[other_user_id, item_id]
            # Aggiorna il numeratore e il denominatore
            similarity = dict_intern[other_user_id]
            numerator += similarity * (rating_b_p - mean_rating_b)
            denominator += abs(similarity)
            
    # Calcola il rating predetto
    if denominator != 0:
        predicted_rating = mean_rating_a + (numerator / denominator)
    else:
        predicted_rating = mean_rating_a  # Evita la divisione per zero


    return predicted_rating


## Experiments

In [17]:
user_ratings_1 = user_item_matrix.loc[1]
user_ratings_1

movieId
1         4.0
2         NaN
3         4.0
4         NaN
5         NaN
         ... 
193581    NaN
193583    NaN
193585    NaN
193587    NaN
193609    NaN
Name: 1, Length: 9724, dtype: float64

In [18]:
user_ratings_2 = user_item_matrix.loc[2]
user_ratings_2

movieId
1        NaN
2        NaN
3        NaN
4        NaN
5        NaN
          ..
193581   NaN
193583   NaN
193585   NaN
193587   NaN
193609   NaN
Name: 2, Length: 9724, dtype: float64

In [19]:
common_movies = user_ratings_1.dropna().index.intersection(user_ratings_2.dropna().index)
common_movies

Index([333, 3578], dtype='int64', name='movieId')

In [20]:
user1_common_ratings = user_ratings_1[common_movies]
user1_common_ratings

movieId
333     5.0
3578    5.0
Name: 1, dtype: float64

In [21]:
user2_common_ratings = user_ratings_2[common_movies]
user2_common_ratings

movieId
333     4.0
3578    4.0
Name: 2, dtype: float64

In [22]:
pearson_correlation(user_ratings_1,user_ratings_2)

0.9999999999999998

## (d) Select a user from the dataset, and for this user, show the 10 most similar users and the 10 most relevant movies that the recommender suggests.

## Calculate the 10 users most similar to a given user

In [148]:
most_similar_users_ps = find_similar_users_with_pearson_sim(1, user_item_matrix,50)
most_similar_users_ps

userId
77     1.000000
1      1.000000
12     1.000000
85     1.000000
253    1.000000
291    1.000000
358    1.000000
388    1.000000
2      1.000000
146    0.999050
278    0.971061
550    0.950065
13     0.947879
127    0.940691
333    0.937923
472    0.929352
157    0.901775
139    0.890342
401    0.871321
511    0.865582
473    0.840747
366    0.835276
258    0.832050
65     0.823571
487    0.822095
90     0.821422
180    0.812324
499    0.800989
430    0.792673
207    0.772252
49     0.752669
535    0.746514
114    0.743985
154    0.712220
162    0.693989
210    0.684283
398    0.665736
505    0.658193
324    0.650791
206    0.637864
478    0.636878
369    0.628122
421    0.613877
375    0.609272
44     0.608688
467    0.601959
297    0.583680
431    0.582863
445    0.581774
72     0.579855
dtype: float64

## Calculate similarity for each pair of users

In [149]:
similarities_pearson_correlation = compute_user_similarity_with_pearson_correlation_all_users(user_item_matrix)

In [166]:
similarities_pearson_correlation[1]

{1: 1.0,
 2: 0.9999999999999998,
 3: 0.011225168459991653,
 4: 0.2104232878167512,
 5: 0.19472139278694267,
 6: -0.3020187823310266,
 7: -0.046641563830548446,
 8: 0.4265612329426459,
 9: 0.35473856827532246,
 10: -0.15121755406386428,
 11: -0.4499572307079907,
 12: 1.0,
 13: 0.9478788458420679,
 14: 0.2253535048415333,
 15: 0.31204846354219923,
 16: 0.049634084292101116,
 17: 0.1984417997849796,
 18: 0.23106109729152335,
 19: 0.22574845026891868,
 20: 0.5256863271011928,
 21: 0.08648178161396773,
 22: -0.1302706228212957,
 23: -0.26068498223507336,
 24: 0.05176459800704183,
 25: 0.018122806444241503,
 26: -0.14770835432617932,
 27: 0.18060517781572202,
 28: 0.009072639414787094,
 29: -0.06324805671604673,
 30: 0.05197519878093253,
 31: 0.0524049522319235,
 32: 0.2700743905723179,
 33: 0.10531918357689701,
 34: 0.1000962747953274,
 35: 0.31319277272972756,
 36: 0.44197925638731167,
 37: -0.4190433919782271,
 38: 0.058071462757481596,
 39: -0.2254567309877416,
 40: -0.3552245041463019,


In [47]:
similarities_itr = compute_user_similarity_with_ITR_all_users(user_item_matrix)

In [167]:
similarities_itr[1]

{1: 0.5,
 2: 0.0,
 3: 8.184766168184817e-09,
 4: 0.029614344910469922,
 5: 0.0,
 6: 0.03929902143869455,
 7: 0.10825568618106429,
 8: 0.0,
 9: 6.759407636620702e-13,
 10: 0.09852714646119755,
 11: 1.5197962624155409e-15,
 12: 0.0,
 13: 0.0,
 14: 6.068150993706834e-16,
 15: 0.003349776618450829,
 16: 9.040048951590336e-10,
 17: 5.196109603903446e-12,
 18: 4.94245712822406e-14,
 19: 0.0,
 20: 0.03954114582368971,
 21: 1.151844575397715e-07,
 22: 0.00029779058240923416,
 23: 5.263967477439779e-05,
 24: 3.163014406386995e-07,
 25: 0.0,
 26: 0.0,
 27: 4.155243264074284e-05,
 28: 1.1629578432479561e-13,
 29: 2.0579943940565604e-14,
 30: 0.0,
 31: 0.0,
 32: 1.1735093299479186e-08,
 33: 0.0087300920440915,
 34: 1.2893800724932063e-08,
 35: 0.0,
 36: 5.365696099239448e-07,
 37: 0.0,
 38: 1.34075122721099e-09,
 39: 1.4671530439017137e-14,
 40: 7.659518113300148e-08,
 41: 0.0025272240346607302,
 42: 9.125401375523729e-10,
 43: 1.4369500220239196e-09,
 44: 1.1555447962727685e-15,
 45: 4.7510511479

## Prediction version 1

In [81]:
predicted_ratings = predict_rating(1,8,user_item_matrix,similarities_pearson_correlation,0)
print(predicted_ratings)

-4.710054039358212


In [121]:
predicted_ratings = predict_rating(1,10,user_item_matrix,similarities_itr,0)
print(predicted_ratings)

4.171956466514737


In [195]:
predicted_ratings = predict_rating(1,190,user_item_matrix,similarities_pearson_correlation,1)
print(predicted_ratings)

3.142883144195417


In [196]:
predicted_ratings = predict_rating(1,8,user_item_matrix,similarities_itr,1)
print(predicted_ratings)

2.843277730534422


## Prediction version 2

In [197]:
predicted_ratings = predict_rating_with_abs(1,8 ,user_item_matrix,similarities_pearson_correlation,0)
print(predicted_ratings)

3.8877407930657384


In [199]:
predicted_ratings = predict_rating_with_abs(1,8 ,user_item_matrix,similarities_itr,0)
print(predicted_ratings)

2.8432777305344223


In [204]:
predicted_ratings = predict_rating_with_abs(20,10,user_item_matrix,similarities_pearson_correlation,1)
print(predicted_ratings)

3.4537068239979325


In [205]:
predicted_ratings = predict_rating_with_abs(20,10,user_item_matrix,similarities_itr,1)
print(predicted_ratings)

3.3791105525316727


## Show the 10 most relevant movies that the recommender suggests

In [139]:
def top_recommended_movies(user_id, user_item_matrix, user_similarity, num_recommendations=10):
    # Inizializza una lista per memorizzare i rating previsti per i film non valutati dall'utente
    predicted_ratings = []

    # Ottieni gli ID dei film che l'utente non ha ancora valutato
    unrated_movies = user_item_matrix.columns[user_item_matrix.loc[user_id].isnull()]

    # Calcola i rating previsti per ogni film non valutato dall'utente
    for item_id in unrated_movies:
        predicted_rating = predict_rating(user_id, item_id, user_item_matrix, user_similarity,0)
        predicted_ratings.append((item_id, predicted_rating))

    # Ordina i rating previsti in ordine decrescente e seleziona i primi 10 film
    top_recommendations = sorted(predicted_ratings, key=lambda x: x[1], reverse=True)[:num_recommendations]

    return top_recommendations

In [140]:
def top_recommended_movies_abs(user_id, user_item_matrix, user_similarity, num_recommendations=10):
    # Inizializza una lista per memorizzare i rating previsti per i film non valutati dall'utente
    predicted_ratings = []

    # Ottieni gli ID dei film che l'utente non ha ancora valutato
    unrated_movies = user_item_matrix.columns[user_item_matrix.loc[user_id].isnull()]

    # Calcola i rating previsti per ogni film non valutato dall'utente
    for item_id in unrated_movies:
        predicted_rating = predict_rating_with_abs(user_id, item_id, user_item_matrix, user_similarity,0)
        predicted_ratings.append((item_id, predicted_rating))

    # Ordina i rating previsti in ordine decrescente e seleziona i primi 10 film
    top_recommendations = sorted(predicted_ratings, key=lambda x: x[1], reverse=True)[:num_recommendations]

    return top_recommendations

In [141]:
user_id = 1
recommendations = top_recommended_movies(user_id, user_item_matrix, similarities_pearson_correlation)
print("Top 10 film raccomandati per l'utente", user_id, ":")
for idx, (movie_id, predicted_rating) in enumerate(recommendations, start=1):
    print(f"{idx}. Film ID: {movie_id}, Rating previsto: {predicted_rating}")


Top 10 film raccomandati per l'utente 1 :
1. Film ID: 2149, Rating previsto: 741.9235272304575
2. Film ID: 112175, Rating previsto: 594.2060362311458
3. Film ID: 7937, Rating previsto: 110.9038139020465
4. Film ID: 1572, Rating previsto: 110.4038139020465
5. Film ID: 7820, Rating previsto: 73.76696876551378
6. Film ID: 2506, Rating previsto: 49.328585517638814
7. Film ID: 93721, Rating previsto: 40.87049968693897
8. Film ID: 167018, Rating previsto: 39.81059426475158
9. Film ID: 494, Rating previsto: 37.15884434387721
10. Film ID: 8405, Rating previsto: 37.130123628981075


In [34]:
user_id = 1
recommendations = top_recommended_movies_abs(user_id, user_item_matrix, similarities_pearson_correlation)
print("Top 10 film raccomandati per l'utente", user_id, ":")
for idx, (movie_id, predicted_rating) in enumerate(recommendations, start=1):
    print(f"{idx}. Film ID: {movie_id}, Rating previsto: {predicted_rating}")

Top 10 film raccomandati per l'utente 1 :
1. Film ID: 5105, Rating previsto: 7.7909482758620685
2. Film ID: 6967, Rating previsto: 7.7909482758620685
3. Film ID: 7114, Rating previsto: 7.7909482758620685
4. Film ID: 7742, Rating previsto: 7.7909482758620685
5. Film ID: 175475, Rating previsto: 7.571603190941842
6. Film ID: 184641, Rating previsto: 7.571603190941842
7. Film ID: 168712, Rating previsto: 7.462533156498673
8. Film ID: 3604, Rating previsto: 7.3393522833178
9. Film ID: 97024, Rating previsto: 7.144950738916256
10. Film ID: 40491, Rating previsto: 7.120924764890281


In [295]:
user_id = 1
recommendations = top_recommended_movies(user_id, user_item_matrix, similarities_pearson_correlation_most_similar)
print("Top 10 movies recommended for user", user_id, ":")
for idx, (movie_id, predicted_rating) in enumerate(recommendations, start=1):
    print(f"{idx}. Movie ID: {movie_id}, Rating predicted: {predicted_rating}")

Top 10 movies recommended for user 1 :
1. Movie ID: 319, Rating predicted: 6.769157088122605
2. Movie ID: 3567, Rating predicted: 6.726379310344827
3. Movie ID: 555, Rating predicted: 6.641379310344828
4. Movie ID: 913, Rating predicted: 6.252742946708464
5. Movie ID: 55276, Rating predicted: 6.252742946708464
6. Movie ID: 30803, Rating predicted: 6.225754310344827
7. Movie ID: 3972, Rating predicted: 6.223522167487685
8. Movie ID: 27611, Rating predicted: 6.223522167487685
9. Movie ID: 5066, Rating predicted: 6.110281749369218
10. Movie ID: 42728, Rating predicted: 6.110281749369218


In [296]:
user_id = 1
recommendations = top_recommended_movies_abs(user_id, user_item_matrix, similarities_pearson_correlation_most_similar)
print("Top 10 movies recommended for user", user_id, ":")
for idx, (movie_id, predicted_rating) in enumerate(recommendations, start=1):
    print(f"{idx}. Movie ID: {movie_id}, Rating predicted: {predicted_rating}")

Top 10 movies recommended for user 1 :
1. Movie ID: 319, Rating predicted: 6.769157088122605
2. Movie ID: 3567, Rating predicted: 6.726379310344827
3. Movie ID: 555, Rating predicted: 6.641379310344828
4. Movie ID: 913, Rating predicted: 6.252742946708464
5. Movie ID: 55276, Rating predicted: 6.252742946708464
6. Movie ID: 30803, Rating predicted: 6.225754310344827
7. Movie ID: 3972, Rating predicted: 6.223522167487685
8. Movie ID: 27611, Rating predicted: 6.223522167487685
9. Movie ID: 5066, Rating predicted: 6.110281749369218
10. Movie ID: 42728, Rating predicted: 6.110281749369218


## (e) Design and implement a new similarity function for computing similarities between
## users. Explain why this similarity function is useful for the collaborative filtering approach.
## Hint: Exploiting ideas from related works are highly encouraged.

### In the paper: "Similarity measures for Collaborative Filtering-based Recommender Systems" ITR provides the best result for the 3 evaluation metrics on MovieLens datasets. For instance, ITR gets the best values on MovieLens1M: 0.9428 (RMSE). This is the main reason that led me to choose to implement this metric.

In [46]:
def sim_triangle(user_u_ratings, user_v_ratings, union_items):
    
    if len(union_items) == 0:
        return 0.0
    
    u_ratings = user_u_ratings.loc[union_items]
    v_ratings = user_v_ratings.loc[union_items]
    
    u_ratings = np.nan_to_num(u_ratings, nan=0)
    v_ratings = np.nan_to_num(v_ratings, nan=0)
    
    numerator = np.sqrt(np.sum((u_ratings - v_ratings) ** 2))
    
    denominator = np.sqrt(np.sum(u_ratings ** 2)) + np.sqrt(np.sum(v_ratings ** 2))
    
    similarity = 1 - (numerator / denominator)
    
    return similarity


In [42]:
def sim_urp(mean_rating_u,mean_rating_v,std_dev_u,std_dev_v):
    similarity_urp = 1 - (1 / (1 + np.exp(-np.abs(mean_rating_u - mean_rating_v) * np.abs(std_dev_u - std_dev_v))))
    return similarity_urp

In [41]:
def compute_user_mean_and_std(user_ratings,union_items):
    # Seleziona le valutazioni degli elementi comuni per entrambi gli utenti
    user_ratings_common = user_ratings.loc[union_items]
    
    n = len(user_ratings_common)
    m = len(user_ratings.dropna())

    mean_rating = np.sum(user_ratings.dropna())/n

    std_dev = np.sqrt((np.sum(user_ratings.dropna() - mean_rating))** 2 /m)

    return mean_rating,std_dev


In [40]:
def sim_itr(user_u_ratings, user_v_ratings):
    union_items = np.union1d(user_u_ratings.dropna().index, user_v_ratings.dropna().index)
    
    if len(union_items) == 0:
        return 0.0

    mean_rating_u, std_dev_u = compute_user_mean_and_std(user_u_ratings,union_items)
    mean_rating_v, std_dev_v = compute_user_mean_and_std(user_v_ratings,union_items)

    similarity_triangle = sim_triangle(user_u_ratings, user_v_ratings, union_items)
    similarity_urp = sim_urp(mean_rating_u, mean_rating_v, std_dev_u, std_dev_v)
    
    return similarity_triangle * similarity_urp

In [43]:
def compute_user_similarity_with_ITR_correlation(user_item_matrix):
    # Inizializza la matrice di similarità
    user_similarity = {}

    # Itera su tutti gli utenti
    for user_id_a in user_item_matrix.index:
        user_similarity[user_id_a] = {}
        for user_id_b in user_item_matrix.index:
            # Calcola la similarità tra gli utenti a e b
            user_ratings_1 = user_item_matrix.loc[user_id_a]
            user_ratings_2 = user_item_matrix.loc[user_id_b]
            
            similarity = sim_itr(user_ratings_1, user_ratings_2)
            user_similarity[user_id_a][user_id_b] = similarity

    return user_similarity


In [44]:
def find_similar_users_ITR(user_id, user_item_matrix, num_similar_users=50):
    # User ratings
    user_ratings = user_item_matrix.loc[user_id]
    #Compute the Pearson correlation between the target user and all other users
    correlations = user_item_matrix.apply(lambda row: sim_itr(user_ratings, row), axis=1)
    #correlations = correlations.drop(user_id)
    #Select the num_similar_users most similar users 
    similar_users = correlations.nlargest(num_similar_users)

    return similar_users

In [90]:
similar_users = find_similar_users_ITR(1,user_item_matrix)
print(similar_users)

userId
1      0.500000
135    0.208966
220    0.194920
186    0.165334
282    0.157513
382    0.156432
119    0.155264
522    0.154891
562    0.154257
265    0.152466
280    0.143875
167    0.139392
331    0.138485
63     0.137149
317    0.135808
279    0.135427
266    0.135192
484    0.128951
187    0.128114
304    0.124893
586    0.124677
495    0.121423
367    0.120144
564    0.120109
233    0.119979
246    0.119766
141    0.116622
594    0.111406
7      0.108256
520    0.105845
332    0.104632
290    0.101838
117    0.100048
10     0.098527
607    0.098046
452    0.096388
195    0.089261
432    0.088543
434    0.088266
325    0.087122
169    0.085232
104    0.083525
334    0.079434
330    0.078231
354    0.075143
84     0.071239
132    0.069458
346    0.069072
263    0.067129
230    0.065969
dtype: float64


##

In [100]:
user_id = 1
similarities_ITR_correlation = user_similarity
recommendations = top_recommended_movies(user_id, user_item_matrix, similarities_ITR_correlation)
print("Top 10 film raccomandati per l'utente", user_id, ":")
for idx, (movie_id, predicted_rating) in enumerate(recommendations, start=1):
    print(f"{idx}. Film ID: {movie_id}, Rating previsto: {predicted_rating}")

Top 10 film raccomandati per l'utente 1 :
1. Film ID: 6818, Rating previsto: 7.120924764890281
2. Film ID: 8477, Rating previsto: 7.120924764890281
3. Film ID: 40491, Rating previsto: 7.120924764890281
4. Film ID: 148881, Rating previsto: 7.120922747003081
5. Film ID: 3266, Rating previsto: 7.120099543444331
6. Film ID: 99764, Rating previsto: 7.111497128791824
7. Film ID: 5746, Rating previsto: 6.930481874447391
8. Film ID: 6835, Rating previsto: 6.930481874447391
9. Film ID: 7991, Rating previsto: 6.930477171065139
10. Film ID: 2851, Rating previsto: 6.930381115085179


## Evaluation: Pearson similarity and ITR similarity

In [230]:
rated_items = user_item_matrix.loc[1].dropna()
rated_items

movieId
1       4.0
3       4.0
6       4.0
47      5.0
50      5.0
       ... 
3744    4.0
3793    5.0
3809    4.0
4006    4.0
5060    5.0
Name: 1, Length: 232, dtype: float64

In [232]:
ratings = []
for rating in rated_items:
    ratings.append(rating)
ratings

[4.0,
 4.0,
 4.0,
 5.0,
 5.0,
 3.0,
 5.0,
 4.0,
 5.0,
 5.0,
 5.0,
 5.0,
 3.0,
 5.0,
 4.0,
 5.0,
 3.0,
 3.0,
 5.0,
 4.0,
 4.0,
 5.0,
 4.0,
 3.0,
 4.0,
 5.0,
 4.0,
 3.0,
 5.0,
 4.0,
 4.0,
 5.0,
 4.0,
 4.0,
 4.0,
 5.0,
 5.0,
 3.0,
 5.0,
 3.0,
 4.0,
 3.0,
 3.0,
 4.0,
 5.0,
 5.0,
 5.0,
 4.0,
 5.0,
 3.0,
 5.0,
 5.0,
 5.0,
 5.0,
 3.0,
 5.0,
 5.0,
 4.0,
 5.0,
 4.0,
 5.0,
 5.0,
 5.0,
 4.0,
 5.0,
 5.0,
 4.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 4.0,
 5.0,
 5.0,
 4.0,
 2.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 3.0,
 4.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 4.0,
 3.0,
 3.0,
 3.0,
 3.0,
 4.0,
 4.0,
 5.0,
 4.0,
 5.0,
 3.0,
 5.0,
 5.0,
 4.0,
 5.0,
 3.0,
 3.0,
 5.0,
 4.0,
 4.0,
 5.0,
 4.0,
 4.0,
 5.0,
 5.0,
 4.0,
 4.0,
 5.0,
 4.0,
 5.0,
 4.0,
 5.0,
 4.0,
 5.0,
 4.0,
 5.0,
 5.0,
 5.0,
 3.0,
 5.0,
 4.0,
 4.0,
 4.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 4.0,
 5.0,
 4.0,
 4.0,
 2.0,
 4.0,
 4.0,
 5.0,
 5.0,
 2.0,
 5.0,
 4.0,
 5.0,
 2.0,
 5.0,
 4.0,
 3.0,
 5.0,
 4.0,
 5.0,
 5.0,
 4.0,
 4.0,
 5.0,
 3.0,
 5.0,
 5.0,
 5.0

## Prediction accuracy of Predict_rating without abs and considering all users in the prediction formula 

In [247]:
results = []
i = 0
score_ps = 0
score_itr = 0
for item_id in rated_items.index:
    true_value = ratings[i]
    i += 1
    # Prediction with Pearson similarity
    predicted_rating_pearson = predict_rating(1, item_id, user_item_matrix, similarities_pearson_correlation,0)
    # Prediction with ITR similarity
    predicted_rating_itr = predict_rating(1, item_id, user_item_matrix, similarities_itr,0)
    
    results.append([item_id,true_value, predicted_rating_pearson, predicted_rating_itr])

    diff_pearson = abs(predicted_rating_pearson - true_value)
    diff_itr = abs(predicted_rating_itr - true_value)

    if diff_pearson < diff_itr:
        score_ps += 1
    if diff_itr < diff_pearson:
        score_itr += 1
        
print("Score person similarity: ",score_ps)
print("Score itr similarity: ",score_itr)
print(tabulate(results, headers=["Item ID", "True Value", "Pearson Similarity Prediction", "ITR Similarity Prediction"], tablefmt="grid"))


Score person similarity:  101
Score itr similarity:  130
+-----------+--------------+---------------------------------+-----------------------------+
|   Item ID |   True Value |   Pearson Similarity Prediction |   ITR Similarity Prediction |
|         1 |            4 |                         4.43837 |                     4.56465 |
+-----------+--------------+---------------------------------+-----------------------------+
|         3 |            4 |                         3.35874 |                     4.20467 |
+-----------+--------------+---------------------------------+-----------------------------+
|         6 |            4 |                         4.47425 |                     4.46715 |
+-----------+--------------+---------------------------------+-----------------------------+
|        47 |            5 |                         5.202   |                     4.57481 |
+-----------+--------------+---------------------------------+-----------------------------+
|        50 |

In [258]:
file_name = "predictions.csv"

# Open the file
with open(file_name, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    #header
    writer.writerow(["Item ID", "Pearson Similarity Prediction", "ITR Similarity Prediction"])
    
    # write results
    for row in results:
        writer.writerow(row)

print(f"I risultati sono stati salvati nel file '{file_name}'.")


I risultati sono stati salvati nel file 'predictions.csv'.


## Prediction accuracy of Predict_rating without abs and considering most similar users in the prediction formula (neighborhood)

In [249]:
results = []
i = 0
score_ps = 0
score_itr = 0
for item_id in rated_items.index:
    true_value = ratings[i]
    i += 1
    # Prediction with Pearson similarity
    predicted_rating_pearson = predict_rating(1, item_id, user_item_matrix, similarities_pearson_correlation,1)
    # Prediction with ITR similarity
    predicted_rating_itr = predict_rating(1, item_id, user_item_matrix, similarities_itr,1)
    
    results.append([item_id,true_value, predicted_rating_pearson, predicted_rating_itr])
    
    diff_pearson = abs(predicted_rating_pearson - true_value)
    diff_itr = abs(predicted_rating_itr - true_value)

    if diff_pearson < diff_itr:
        score_ps += 1
    if diff_itr < diff_pearson:
        score_itr += 1

print("Score person similarity: ",score_ps)
print("Score itr similarity: ",score_itr)

print(tabulate(results, headers=["Item ID", "True Value", "Pearson Similarity Prediction", "ITR Similarity Prediction"], tablefmt="grid"))

Score person similarity:  91
Score itr similarity:  140
+-----------+--------------+---------------------------------+-----------------------------+
|   Item ID |   True Value |   Pearson Similarity Prediction |   ITR Similarity Prediction |
|         1 |            4 |                         4.53151 |                     4.56465 |
+-----------+--------------+---------------------------------+-----------------------------+
|         3 |            4 |                         4.11498 |                     4.20467 |
+-----------+--------------+---------------------------------+-----------------------------+
|         6 |            4 |                         4.56297 |                     4.46715 |
+-----------+--------------+---------------------------------+-----------------------------+
|        47 |            5 |                         4.94213 |                     4.57481 |
+-----------+--------------+---------------------------------+-----------------------------+
|        50 | 

In [257]:
file_name = "predictions_most_similar.csv"

# Open the file
with open(file_name, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    #header
    writer.writerow(["Item ID", "Pearson Similarity Prediction", "ITR Similarity Prediction"])
    
    # write results
    for row in results:
        writer.writerow(row)

print(f"I risultati sono stati salvati nel file '{file_name}'.")

I risultati sono stati salvati nel file 'predictions_most_similar.csv'.


## Prediction accuracy of Predict_rating with abs and considering all users in the prediction formula

In [251]:
results_2 = []
i = 0
score_ps = 0
score_itr = 0
for item_id in rated_items.index:
    true_value = ratings[i]
    i += 1
    # Prediction with Pearson similarity
    predicted_rating_pearson = predict_rating_with_abs(1, item_id, user_item_matrix, similarities_pearson_correlation,0)
    # Prediction with ITR similarity
    predicted_rating_itr = predict_rating_with_abs(1, item_id, user_item_matrix, similarities_itr,0)
    
    results_2.append([item_id,true_value, predicted_rating_pearson, predicted_rating_itr])

    diff_pearson = abs(predicted_rating_pearson - true_value)
    diff_itr = abs(predicted_rating_itr - true_value)

    if diff_pearson < diff_itr:
        score_ps += 1
    if diff_itr < diff_pearson:
        score_itr += 1

print("Score person similarity: ",score_ps)
print("Score itr similarity: ",score_itr)
print(tabulate(results_2, headers=["Item ID", "True Value", "Pearson Similarity Prediction", "ITR Similarity Prediction"], tablefmt="grid"))


Score person similarity:  93
Score itr similarity:  138
+-----------+--------------+---------------------------------+-----------------------------+
|   Item ID |   True Value |   Pearson Similarity Prediction |   ITR Similarity Prediction |
|         1 |            4 |                         4.40474 |                     4.56465 |
+-----------+--------------+---------------------------------+-----------------------------+
|         3 |            4 |                         4.07405 |                     4.20467 |
+-----------+--------------+---------------------------------+-----------------------------+
|         6 |            4 |                         4.39643 |                     4.46715 |
+-----------+--------------+---------------------------------+-----------------------------+
|        47 |            5 |                         4.84033 |                     4.57481 |
+-----------+--------------+---------------------------------+-----------------------------+
|        50 | 

In [256]:
file_name = "predictions2.csv"

# Open the file
with open(file_name, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    #header
    writer.writerow(["Item ID", "Pearson Similarity Prediction", "ITR Similarity Prediction"])
    
    # write results
    for row in results_2:
        writer.writerow(row)

print(f"I risultati sono stati salvati nel file '{file_name}'.")


I risultati sono stati salvati nel file 'predictions2.csv'.


## Prediction accuracy of Predict_rating with abs and considering most similar users in the prediction formula (neighborhood)

In [253]:
results_2 = []
i = 0
score_ps = 0
score_itr = 0
for item_id in rated_items.index:
    true_value = ratings[i]
    i += 1
    # Prediction with Pearson similarity
    predicted_rating_pearson = predict_rating_with_abs(1, item_id, user_item_matrix, similarities_pearson_correlation,1)
    # Prediction with ITR similarity
    predicted_rating_itr = predict_rating_with_abs(1, item_id, user_item_matrix, similarities_itr,1)
    
    results_2.append([item_id,true_value, predicted_rating_pearson, predicted_rating_itr])

    diff_pearson = abs(predicted_rating_pearson - true_value)
    diff_itr = abs(predicted_rating_itr - true_value)

    if diff_pearson < diff_itr:
        score_ps += 1
    if diff_itr < diff_pearson:
        score_itr += 1
        
print("Score person similarity: ",score_ps)
print("Score itr similarity: ",score_itr)
print(tabulate(results_2, headers=["Item ID", "True Value", "Pearson Similarity Prediction", "ITR Similarity Prediction"], tablefmt="grid"))


Score person similarity:  89
Score itr similarity:  142
+-----------+--------------+---------------------------------+-----------------------------+
|   Item ID |   True Value |   Pearson Similarity Prediction |   ITR Similarity Prediction |
|         1 |            4 |                         4.51632 |                     4.56465 |
+-----------+--------------+---------------------------------+-----------------------------+
|         3 |            4 |                         4.15499 |                     4.20467 |
+-----------+--------------+---------------------------------+-----------------------------+
|         6 |            4 |                         4.51547 |                     4.46715 |
+-----------+--------------+---------------------------------+-----------------------------+
|        47 |            5 |                         4.87364 |                     4.57481 |
+-----------+--------------+---------------------------------+-----------------------------+
|        50 | 

In [255]:
file_name = "predictions2_most_similar.csv"

# Open the file
with open(file_name, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    #header
    writer.writerow(["Item ID", "Pearson Similarity Prediction", "ITR Similarity Prediction"])
    
    # write results
    for row in results_2:
        writer.writerow(row)

print(f"I risultati sono stati salvati nel file '{file_name}'.")

I risultati sono stati salvati nel file 'predictions2_most_similar.csv'.
