In [1]:
import pandas as pd
import numpy as np
import concurrent.futures
import itertools
from collections import defaultdict
from functools import partial



# Assignment 1

In [2]:
folder = "./ml-latest-small/"

In [3]:
rating_source = pd.read_csv(folder + "ratings.csv", dtype='object')
rating_source

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [4]:
rating_source[:100]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
95,1,1445,3.0,964984112
96,1,1473,4.0,964980875
97,1,1500,4.0,964980985
98,1,1517,5.0,964981107


In [5]:
user_ratings_dict = {}
for userId, group in rating_source.groupby('userId'):
    user_ratings_dict[userId] = list(zip(group['movieId'], group['rating']))

### Definition of User Similarity


In [6]:
def get_common_movies_ratings(user1, user2, user_ratings_dict):
    user1_ratings = dict(user_ratings_dict[user1])
    user2_ratings = dict(user_ratings_dict[user2])

    common_movies = set(user1_ratings.keys()) & set(user2_ratings.keys())

    common_movies_ratings = {}
    for movie_id in common_movies:
        rating_user1 = user1_ratings.get(movie_id)
        rating_user2 = user2_ratings.get(movie_id)
        if rating_user1 is not None and rating_user2 is not None:
            common_movies_ratings[movie_id] = (rating_user1, rating_user2)    

    return common_movies_ratings

In [7]:
def get_ratings_for_user(user, user_ratings_dict):
    ratings_for_user = []

    movie_ratings_for_user = user_ratings_dict[user]

    for _, rating in movie_ratings_for_user:
        ratings_for_user.append(float(rating))
    
    ratings_for_user = np.array(ratings_for_user)
    return ratings_for_user

In [8]:
def user_similarity(user_a, user_b, user_ratings_dict):
    movie_intersection_a_b = get_common_movies_ratings(user_a, user_b, user_ratings_dict)
    if(len(movie_intersection_a_b)==0):
        return 0

    ratings_user_a, ratings_user_b = zip(*[(float(ra), float(rb)) for ra, rb in movie_intersection_a_b.values()])
    mean_rating_a = np.mean(get_ratings_for_user(user_a, user_ratings_dict))
    mean_rating_b = np.mean(get_ratings_for_user(user_b, user_ratings_dict))

    numerator = np.sum((np.array(ratings_user_a) - mean_rating_a) * (np.array(ratings_user_b) - mean_rating_b))
    denominator_a = np.sqrt(np.sum((np.array(ratings_user_a) - mean_rating_a)**2))
    denominator_b = np.sqrt(np.sum((np.array(ratings_user_b) - mean_rating_b)**2))

    similarity = numerator / (denominator_a * denominator_b) if denominator_a * denominator_b != 0 else 0

    return similarity

In [9]:
value = user_similarity("1","19", user_ratings_dict)
value

0.2257484502689187

In [10]:
user_a = "1"
for user_b in set(rating_source['userId'].values):
    similarity = user_similarity(user_a, user_b, user_ratings_dict)
    print("User_a: {}, User_b: {} -------> Similarity: {}".format(user_a, user_b, similarity))

User_a: 1, User_b: 593 -------> Similarity: 0.49457306049949123
User_a: 1, User_b: 539 -------> Similarity: -0.14747999474756682
User_a: 1, User_b: 563 -------> Similarity: 0.3698242309926496
User_a: 1, User_b: 148 -------> Similarity: -0.982806583276678
User_a: 1, User_b: 13 -------> Similarity: 0.947878845842068
User_a: 1, User_b: 237 -------> Similarity: 0.06001820162527469
User_a: 1, User_b: 429 -------> Similarity: 0.0884926827269392
User_a: 1, User_b: 53 -------> Similarity: 0
User_a: 1, User_b: 318 -------> Similarity: -0.014782821032474662
User_a: 1, User_b: 184 -------> Similarity: -1.0
User_a: 1, User_b: 200 -------> Similarity: 0.2505594651176311
User_a: 1, User_b: 457 -------> Similarity: 0.18739773848609756
User_a: 1, User_b: 299 -------> Similarity: 0.4170498226038546
User_a: 1, User_b: 398 -------> Similarity: 0.6657357370608938
User_a: 1, User_b: 229 -------> Similarity: -0.211941901536468
User_a: 1, User_b: 243 -------> Similarity: -0.6137110517341317
User_a: 1, User_b

### Method's definition for making prediction

In [11]:
def get_ratings_for_users(rating_source):
    ratings_for_user = {}

    users = rating_source[['userId', 'rating'] ]
    values = users.values

    for user, rating in values:
        if user in ratings_for_user:
            ratings_for_user[user].append(float(rating))
        else:
            ratings_for_user[user] = [float(rating)]
    
    for user in ratings_for_user:
        ratings_for_user[user] = np.array(ratings_for_user[user])
    
    return ratings_for_user

In [12]:
def get_users_ratings_for_movie(user_ratings_dict, movie_id):
    users = user_ratings_dict.keys()

    users_ratings_for_movie = [] 
    for user in users:
        movies_ratings = user_ratings_dict[user]
        for movie, rating  in movies_ratings:
            if movie == movie_id:
                users_ratings_for_movie.append((user, rating))
                break
    
    return users_ratings_for_movie

In [13]:
def making_prediction(user_a, movieId, user_ratings_dict):
    mean_rating_a = np.mean(get_ratings_for_user(user_a, user_ratings_dict))
    
    users_ratings_for_movie = get_users_ratings_for_movie(user_ratings_dict, movieId)
    numerator = 0
    denominator = 0
    
    for user_b, rating_user_b in users_ratings_for_movie:
        if(user_b != user_a):
            rating_for_movie_b = float(rating_user_b)
            mean_rating_b = np.mean(get_ratings_for_user(user_b, user_ratings_dict))
            similarity = user_similarity(user_a, user_b, user_ratings_dict)
            numerator += similarity * (rating_for_movie_b - mean_rating_b)
            denominator += similarity

    prediction = mean_rating_a + (numerator / abs(denominator)) if abs(denominator) > 0 else 0

    return prediction   

In [14]:
result = making_prediction("1", "2", user_ratings_dict)
result

3.5973949736369373

### Top-10 most similar user and most relevant movies

#### Top k user

In [15]:
def get_top_k_user(user_a, rating_source, user_ratings_dict, k):
    results_for_similarity = {}
    users_except = rating_source[rating_source['userId'] != user_a]['userId'].unique().tolist()
    # ratings_for_users = get_ratings_for_users(rating_source)

    for user_b in users_except:
        similarity = user_similarity(user_a, user_b, user_ratings_dict)
        print("Similarity between user_a = {} and user_b = {}: {}".format(user_a, user_b, similarity))

        if similarity in results_for_similarity:
            results_for_similarity[similarity].append(user_b)
        else:
            results_for_similarity[similarity] = [user_b]
    
    results_for_similarity_ordered = sorted(results_for_similarity.items(), reverse=True)

    top_k_users = []
    for similarity, users in results_for_similarity_ordered:
        for user in users:
            if len(top_k_users) < k:
                top_k_users.append((user, similarity))
            else:
                break
    
    return top_k_users


In [16]:
result = get_top_k_user("560",rating_source, user_ratings_dict, 10)

Similarity between user_a = 560 and user_b = 1: 0.12426374592847782
Similarity between user_a = 560 and user_b = 2: 0.4005306333243776
Similarity between user_a = 560 and user_b = 3: -0.5892201464042693
Similarity between user_a = 560 and user_b = 4: -0.1473803279031062
Similarity between user_a = 560 and user_b = 5: -0.19595009252525492
Similarity between user_a = 560 and user_b = 6: 0.025690759141180613
Similarity between user_a = 560 and user_b = 7: 0.0869094100944757
Similarity between user_a = 560 and user_b = 8: 0.06346906298143465
Similarity between user_a = 560 and user_b = 9: 0.3510537931338937
Similarity between user_a = 560 and user_b = 10: -0.16504312205202487
Similarity between user_a = 560 and user_b = 11: -0.005957440852037112
Similarity between user_a = 560 and user_b = 12: -0.8401384261652785
Similarity between user_a = 560 and user_b = 13: 0.22945097284698762
Similarity between user_a = 560 and user_b = 14: 0.11290044302438676
Similarity between user_a = 560 and user_

In [17]:
result

[('138', 1.0),
 ('245', 1.0),
 ('278', 1.0),
 ('547', 1.0),
 ('120', 0.9277604027692536),
 ('194', 0.9149401161539074),
 ('403', 0.9098529367845516),
 ('521', 0.8873925327953391),
 ('90', 0.8829482926050654),
 ('207', 0.8778955729143844)]

#### Top k movie

In [18]:
def get_unowned_movie_ids(user_id, dataframe):
    all_movie_ids = dataframe['movieId'].unique()
    
    user_movie_ids = dataframe[dataframe['userId'] == user_id]['movieId'].unique()
    
    unowned_movie_ids = [movie_id for movie_id in all_movie_ids if movie_id not in user_movie_ids]
    
    return unowned_movie_ids

In [77]:
def get_top_k_movie(user_a, rating_source, user_ratings_dict, k):
    results_for_prediction = {}

    unowned_movies = get_unowned_movie_ids(user_a, rating_source)

    # ratings_for_users = get_ratings_for_users(rating_source)


    print(len(unowned_movies))
    iteration = 1

    for movieId in unowned_movies:
        prediction = making_prediction(user_a, movieId, user_ratings_dict)
        # print("{}) MovieId: {} ----> Prediction: {}".format(iteration, movieId, prediction))
        if prediction in results_for_prediction:
            results_for_prediction[prediction].append(movieId)
        else:
            results_for_prediction[prediction] = [movieId]
        iteration += 1
    
    results_for_prediction_ordered = sorted(results_for_prediction.items(), reverse=True)

    top_k_movies = []
    for prediction, movie_ids in results_for_prediction_ordered:
        for movie_id in movie_ids:
            if len(top_k_movies) < k:
                top_k_movies.append((movie_id, prediction))
            else:
                break
    
    return top_k_movies


In [81]:
def get_unowned_movie_ids(movies_ratings, movie_list):
    unowned_movie_ids = movie_list.copy()
    
    for movie, _ in movies_ratings:
        unowned_movie_ids.remove(movie)
    
    return unowned_movie_ids

In [82]:
def get_top_k_movie(user_a, movie_list, user_ratings_dict, k):
    results_for_prediction = {}

    unowned_movies = get_unowned_movie_ids(user_ratings_dict[user_a], movie_list)

    # ratings_for_users = get_ratings_for_users(rating_source)


    print(len(unowned_movies))
    iteration = 1

    for movieId in unowned_movies:
        prediction = making_prediction(user_a, movieId, user_ratings_dict)
        # print("{}) MovieId: {} ----> Prediction: {}".format(iteration, movieId, prediction))
        if prediction in results_for_prediction:
            results_for_prediction[prediction].append(movieId)
        else:
            results_for_prediction[prediction] = [movieId]
        iteration += 1
    
    results_for_prediction_ordered = sorted(results_for_prediction.items(), reverse=True)

    top_k_movies = []
    for prediction, movie_ids in results_for_prediction_ordered:
        for movie_id in movie_ids:
            if len(top_k_movies) < k:
                top_k_movies.append((movie_id, prediction))
            else:
                break
    
    return top_k_movies

In [23]:
movie_list = set(rating_source['movieId'])

In [24]:
result = get_unowned_movie_ids(user_ratings_dict["1"], movie_list)

In [25]:
len(result)

9492

In [44]:
movies = get_top_k_movie("3", movie_list, user_ratings_dict, 10)

9685


In [62]:
making_prediction("3", "113741", user_ratings_dict)

2001804916337134.5

In [45]:
movies

[('113741', 2001804916337134.5),
 ('7756', 1435.496523667771),
 ('27728', 816.0548736507894),
 ('3840', 740.7896739727911),
 ('44665', 563.8052675491517),
 ('1439', 522.734060810411),
 ('48322', 449.76836836709293),
 ('107702', 293.7862112413712),
 ('127136', 192.14510790687234),
 ('1596', 183.38159051335373)]

In [55]:
movies = get_top_k_movie("1", movie_list, user_ratings_dict, 10)

9492
1) MovieId: 2304 ----> Prediction: 3.874332650747797
2) MovieId: 84799 ----> Prediction: 3.7785930508028427
3) MovieId: 102338 ----> Prediction: 5.177823242756501
4) MovieId: 59037 ----> Prediction: 4.025671601031977
5) MovieId: 175485 ----> Prediction: 6.707082128994161
6) MovieId: 7457 ----> Prediction: 5.203728707935189
7) MovieId: 6579 ----> Prediction: 4.643136916121871
8) MovieId: 56012 ----> Prediction: 4.140995384487371
9) MovieId: 101423 ----> Prediction: 4.724329269989703
10) MovieId: 139385 ----> Prediction: 4.596870267251765
11) MovieId: 127114 ----> Prediction: 4.122352006590561
12) MovieId: 4299 ----> Prediction: 4.1921263451642
13) MovieId: 49649 ----> Prediction: 2.4699536226201
14) MovieId: 4219 ----> Prediction: 3.84086462906635
15) MovieId: 88069 ----> Prediction: 3.8393522833178
16) MovieId: 39449 ----> Prediction: 3.305714319392913
17) MovieId: 66798 ----> Prediction: 4.3966611915985
18) MovieId: 2448 ----> Prediction: 3.787447583809173
19) MovieId: 26523 ----

In [56]:
movies

[('8341', 927.438274523593),
 ('112175', 594.2060362311274),
 ('5304', 243.42959276163546),
 ('7937', 110.9038139020465),
 ('1572', 110.4038139020465),
 ('3324', 98.47815779701784),
 ('283', 78.10912077777971),
 ('7820', 73.76696876551378),
 ('2506', 49.32858551763906),
 ('93721', 40.870499686939155)]

In [58]:
somma = sum(rating for _, rating in movies)
media = somma/10
somma

2326.9348638653387

In [51]:
movies = get_top_k_movie("2", movie_list, user_ratings_dict, 10)

9695
1) MovieId: 2304 ----> Prediction: 3.4562292024719348
2) MovieId: 84799 ----> Prediction: 0
3) MovieId: 102338 ----> Prediction: 4.75971979448064
4) MovieId: 59037 ----> Prediction: 3.721263852356449
5) MovieId: 175485 ----> Prediction: 2.7709797029177317
6) MovieId: 7457 ----> Prediction: 4.7856252596593265
7) MovieId: 6579 ----> Prediction: 3.847232218805208
8) MovieId: 56012 ----> Prediction: 3.7321173090278497
9) MovieId: 101423 ----> Prediction: 3.5903259024240906
10) MovieId: 139385 ----> Prediction: 2.931991992469942
11) MovieId: 127114 ----> Prediction: 3.704248558314699
12) MovieId: 4299 ----> Prediction: 5.015927645194597
13) MovieId: 49649 ----> Prediction: 3.13379791198352
14) MovieId: 4219 ----> Prediction: 4.269286176401151
15) MovieId: 88069 ----> Prediction: 3.421248835041938
16) MovieId: 39449 ----> Prediction: 5.449544544363727
17) MovieId: 66798 ----> Prediction: 4.124524415190544
18) MovieId: 2448 ----> Prediction: 4.471936171751352
19) MovieId: 26523 ----> Pre

In [49]:
movies

[('2899', 1.0108800761479348e+16),
 ('3404', 2073055.385863287),
 ('106766', 660.8407708954104),
 ('6537', 553.0288768349349),
 ('86377', 434.8289124682445),
 ('110501', 412.9056339434671),
 ('6754', 304.7486168772332),
 ('5010', 242.20295606229226),
 ('140715', 168.27914195649956),
 ('2263', 156.44776862687974)]

### New similarity

![image.png](attachment:image.png)
### Spearman's Correlation

In [134]:
from scipy.stats import spearmanr

In [135]:
def spearman_similarity(user_a, user_b, rating_source):
    movies_user_a = set(rating_source[rating_source['userId'] == user_a]['movieId'])
    movies_user_b = set(rating_source[rating_source['userId'] == user_b]['movieId'])

    common_movies = movies_user_a.intersection(movies_user_b)
    if not common_movies:
        return 0.0

    ratings_user_a = [float(rating_source[(rating_source['userId'] == user_a) & (rating_source['movieId'] == movie)]['rating'].values[0]) for movie in common_movies]
    ratings_user_b = [float(rating_source[(rating_source['userId'] == user_b) & (rating_source['movieId'] == movie)]['rating'].values[0]) for movie in common_movies]

    rank_user_a = [sorted(ratings_user_a).index(r) + 1 for r in ratings_user_a]
    rank_user_b = [sorted(ratings_user_b).index(r) + 1 for r in ratings_user_b]

    similarity, _ = spearmanr(rank_user_a, rank_user_b)
    
    return similarity

In [136]:
result = spearman_similarity("1", "19", rating_source)

In [137]:
result

0.3015994824819675

# Assignment 2

In [27]:
users = ["474", "599", "414"]

In [28]:
movie_list = set(rating_source['movieId'])

In [29]:
def get_movies_for_users(user_ids, rating_source):
    movies_sets = [set(rating_source[rating_source['userId'] == user_id]['movieId']) for user_id in user_ids]
    return movies_sets

In [30]:
def get_unwatched_movies(user_id, rating_source, movie_list):
    watched_movies = set(rating_source[rating_source['userId'] == user_id]['movieId'])
    unwatched_movies = list(set(movie_list) - watched_movies)
    return set(unwatched_movies)

In [31]:
def get_intersection_movies_for_users(user_ids, movie_list, rating_source):
    movies_sets = [get_unwatched_movies(user, rating_source, movie_list) for user in user_ids]
    union_movies = set.intersection(*movies_sets)
    return union_movies

In [32]:
def get_rating_if_user_contains_movie(user, movieId, user_ratings_dict):
    movies_ratings = user_ratings_dict[user]

    for movie, rating in movies_ratings:
        if movie == movieId:
            return float(rating)
    
    return None


In [26]:
def average_aggregation(users, movie_list, user_ratings_dict, k, rating_source):
    movies = get_intersection_movies_for_users(users, movie_list, rating_source)
    print(len(movies))
    
    iteration = 1
    movies_to_average_rating = {}
    for movie in movies: 
        sum_for_movie = 0
        for user in users:
            rating_user = get_rating_if_user_contains_movie(user, movie, user_ratings_dict)
            if rating_user == None:
                prediction_for_movie = making_prediction(user, movie, user_ratings_dict)
                sum_for_movie += prediction_for_movie
            else:
                sum_for_movie += rating_user
        
        movies_to_average_rating[movie] = (sum_for_movie/len(users))
        print("{}) Movie: {} ----> Average Rating: {}".format(iteration, movie, movies_to_average_rating.get(movie)))
        iteration += 1
    
    movies_to_average_rating_ordered = dict(sorted(movies_to_average_rating.items(), key=lambda item: item[1], reverse=True))

    top_k_movies_recommend = dict(list(movies_to_average_rating_ordered.items())[:k])

    return top_k_movies_recommend

In [27]:
def least_misery_aggregation(users, movie_list, user_ratings_dict, k, rating_source):
    movies = get_intersection_movies_for_users(users, movie_list, rating_source)
    movies_to_min_rating = {}

    print(len(movies))
    iteration = 1
    for movie in movies: 
        ratings_for_movie = []
        for user in users:
            rating_user = get_rating_if_user_contains_movie(user, movie, user_ratings_dict)
            if rating_user == None:
                prediction_for_movie = making_prediction(user, movie, user_ratings_dict)
                ratings_for_movie.append(prediction_for_movie)
            else:
                ratings_for_movie.append(rating_user)
        
        movies_to_min_rating[movie] = min(ratings_for_movie)
        print("{}) Movie: {} ----> Min Rating: {}".format(iteration, movie, movies_to_min_rating.get(movie)))
        iteration += 1
    
    movies_to_min_rating_ordered = dict(sorted(movies_to_min_rating.items(), key=lambda item: item[1], reverse=True))

    top_k_movies_recommend = dict(list(movies_to_min_rating_ordered.items())[:k])

    return top_k_movies_recommend 

In [74]:
result = average_aggregation(users, movie_list, user_ratings_dict, 10, rating_source)

5017
1) Movie: 111375 ----> Average Rating: 2.304537852359719
2) Movie: 6886 ----> Average Rating: 3.263263865370998
3) Movie: 65359 ----> Average Rating: 2.933913894274461
4) Movie: 74754 ----> Average Rating: 2.879367209280447
5) Movie: 1453 ----> Average Rating: 1.492039056484117
6) Movie: 4756 ----> Average Rating: 1.6215687296164942
7) Movie: 3048 ----> Average Rating: 2.0913142097982256
8) Movie: 4325 ----> Average Rating: 3.027977643788756
9) Movie: 92475 ----> Average Rating: 4.3410443349304515
10) Movie: 189713 ----> Average Rating: 3.446518936291268
11) Movie: 83601 ----> Average Rating: -0.31351019120773893
12) Movie: 2661 ----> Average Rating: 2.368555117380374
13) Movie: 5882 ----> Average Rating: 2.478351570370498
14) Movie: 101884 ----> Average Rating: 1.7969498894582723
15) Movie: 1337 ----> Average Rating: 4.248668960180422
16) Movie: 138036 ----> Average Rating: 3.281729786862778
17) Movie: 121715 ----> Average Rating: 3.304537852359719
18) Movie: 2905 ----> Average R

In [75]:
result

{'167018': 55.56410002057211,
 '7067': 21.248288690679583,
 '160567': 18.300643768557187,
 '159441': 14.714774121282977,
 '165947': 10.64466493334079,
 '7564': 10.217913283408992,
 '2102': 9.866777798299685,
 '55620': 9.810002343167646,
 '114060': 9.224534680422598,
 '7209': 8.62394561527636}

In [56]:
result = least_misery_aggregation(users, movie_list, user_ratings_dict, 10, rating_source)

5017
1) Movie: 111375 ----> Min Rating: 1.802266758621379
2) Movie: 6886 ----> Min Rating: 2.760992771632658
3) Movie: 65359 ----> Min Rating: 2.4316428005361206
4) Movie: 74754 ----> Min Rating: 2.6029653085705333
5) Movie: 1453 ----> Min Rating: 0.854013336480727
6) Movie: 4756 ----> Min Rating: 1.078682114208189
7) Movie: 3048 ----> Min Rating: 1.6258523423028917
8) Movie: 4325 ----> Min Rating: 2.525706550050416
9) Movie: 92475 ----> Min Rating: 3.8254066477009787
10) Movie: 189713 ----> Min Rating: 2.4923629501428364
11) Movie: 83601 ----> Min Rating: -0.8157812849460795
12) Movie: 2661 ----> Min Rating: 1.8662840236420335
13) Movie: 5882 ----> Min Rating: 1.9428357532573999
14) Movie: 101884 ----> Min Rating: 1.2946787957199322
15) Movie: 1337 ----> Min Rating: 3.7463978664420816
16) Movie: 138036 ----> Min Rating: 2.736682184759381
17) Movie: 121715 ----> Min Rating: 2.802266758621379
18) Movie: 2905 ----> Min Rating: 3.555277841061126
19) Movie: 25771 ----> Min Rating: 4.034547

In [61]:
prediction = making_prediction("414", "159441", user_ratings_dict)
prediction

20.649985032137753

In [58]:
result

{'159441': 11.447748742400682,
 '165947': 8.416539224421216,
 '160567': 6.396320265488315,
 '7067': 6.154319498668819,
 '6967': 6.066619005872367,
 '7114': 6.066619005872367,
 '5105': 6.066619005872367,
 '7742': 6.066619005872367,
 '946': 5.825524982000262,
 '3604': 5.6150230133280985}

### Method that considered the disagreements between the users in a group


In [30]:
def get_recommendation_for_user(user, movies, user_ratings_dict):
    movie_for_rating = []
    
    for movie in movies:
        rating_user = get_rating_if_user_contains_movie(user, movie, user_ratings_dict)
        if rating_user == None:
            prediction = making_prediction(user, movie, user_ratings_dict)
            movie_for_rating.append((movie, prediction))
        else:
            movie_for_rating.append((movie, rating_user))
    
    movie_for_rating_ordered = dict(sorted(movie_for_rating, key=lambda item: item[1], reverse=True))
    
    return list(movie_for_rating_ordered.keys())

In [31]:
movies = get_intersection_movies_for_users(users, movie_list, rating_source)

In [32]:
result = get_recommendation_for_user("474", movies, user_ratings_dict)

In [99]:
def kendall_tau_distance(recommend_movies_a, recommend_movies_b):
    distance = 0
    for i in range(len(recommend_movies_a)):
        movie = recommend_movies_a[i]
        for j in range(len(recommend_movies_b)):
            if i == j and recommend_movies_b[j] != movie:
                index = recommend_movies_b.index(movie) 
                different_movie = recommend_movies_b[j]
                recommend_movies_b[j] = movie
                recommend_movies_b[index] = different_movie
                distance += 1
    
    return distance, (recommend_movies_a, recommend_movies_b)



In [100]:
recommend_movies_a = get_recommendation_for_user("474", movies, user_ratings_dict)
recommend_movies_b = get_recommendation_for_user("414", movies, user_ratings_dict)

In [101]:
distance, coppia = kendall_tau_distance(recommend_movies_a, recommend_movies_b)

In [33]:
def divide_dict_values(d, divisor):
    for key in d:
        d[key] = 1-(d[key]/divisor)
    return d

In [41]:
def average_weighted_aggregation(users, movie_list, user_ratings_dict, k):
    movies = get_intersection_movies_for_users(users, movie_list, rating_source)

    recommend_for_user = {user: list(get_recommendation_for_user(user, movies, user_ratings_dict)) for user in users}
    total_distance_sum = 0

    user_distances = {}
    for i in range(len(users)):
        for j in range(i+1, len(users)):
            user_a = users[i]
            user_b = users[j]
            tau, _ = kendall_tau_distance(recommend_for_user[user_a], recommend_for_user[user_b])
            user_distances[(user_a, user_b)] = tau
            total_distance_sum += tau
    
    user_to_weight = {}
    for (user_a, user_b), distance in user_distances.items():
        user_to_weight.setdefault(user_a, 0)
        user_to_weight.setdefault(user_b, 0)
        user_to_weight[user_a] += distance
        user_to_weight[user_b] += distance
    
    user_to_weight = divide_dict_values(user_to_weight, total_distance_sum)
    
    weighted_movie_averages = {}
    for movie in movies:
        weighted_sum = 0
        total_weight = 0
        for user in users:
            weight = user_to_weight.get(user, 0)
            rating_user = get_rating_if_user_contains_movie(user, movie, user_ratings_dict)
            if rating_user != None:
                rating = rating_user
            else:
                rating = making_prediction(user, movie, user_ratings_dict) 
            weighted_sum += rating * weight
            total_weight += weight
        weighted_average = weighted_sum / total_weight if total_weight != 0 else 0
        weighted_movie_averages[movie] = weighted_average
        print("MovieId: {}  ---->  Weighted Average Rating: {}".format(movie, weighted_average))

    top_k_recommendations = sorted(weighted_movie_averages.items(), key=lambda x: x[1], reverse=True)[:k]

    return top_k_recommendations

In [120]:
result = average_weighted_aggregation(users, movie_list, user_ratings_dict, 10)

MovieId: 111375  ---->  Weighted Average Rating: 2.1743177695916325
MovieId: 6886  ---->  Weighted Average Rating: 3.1330437826029116
MovieId: 65359  ---->  Weighted Average Rating: 2.803693811506374
MovieId: 74754  ---->  Weighted Average Rating: 3.0149248025408415
MovieId: 1453  ---->  Weighted Average Rating: 1.251431492163663
MovieId: 4756  ---->  Weighted Average Rating: 1.4533572596437316
MovieId: 3048  ---->  Weighted Average Rating: 1.8746602660981144
MovieId: 4325  ---->  Weighted Average Rating: 2.8977575610206694
MovieId: 92475  ---->  Weighted Average Rating: 4.164182195571053
MovieId: 189713  ---->  Weighted Average Rating: 3.920694457918785
MovieId: 83601  ---->  Weighted Average Rating: -0.44373027397582626
MovieId: 2661  ---->  Weighted Average Rating: 2.2383350346122866
MovieId: 5882  ---->  Weighted Average Rating: 2.7431114561737697
MovieId: 101884  ---->  Weighted Average Rating: 1.6667298066901852
MovieId: 1337  ---->  Weighted Average Rating: 4.118448877412336
Mov

In [121]:
result

[('7067', 28.66409244071412),
 ('160567', 22.849669324906962),
 ('159441', 16.013250165023344),
 ('7564', 13.052425856194084),
 ('55620', 12.994826860293323),
 ('165947', 11.496180956059405),
 ('946', 9.711083556424773),
 ('105211', 6.629580559508494),
 ('6967', 6.438670016842621),
 ('7114', 6.438670016842621)]

# Assignment 3

### GR: A sequence of μ group recommendations (Gr1,…,Grμ)
### • pj(ui,dz): preference score of user ui for item dz at iteration j 
### • gpj(G,dz): preference score of item dz for the group G as a whole, as 
### estimated by the group recommender at iteration j 

In [94]:
users = ["474", "599", "414"]

In [96]:
def hybrid_aggregation(users, users_weight, user_ratings_dict, k, alfa, rating_source):
    movies = get_intersection_movies_for_users(users, movie_list, rating_source)
    
    movies_to_rating = {}
    iteration = 1
    for movie in movies: 
        ratings_for_movie = []
        sum_for_movie = 0
        total_weight = 0

        for user in users:
            rating_user = get_rating_if_user_contains_movie(user, movie, user_ratings_dict)
            if rating_user == None:
                prediction_for_movie = making_prediction(user, movie, user_ratings_dict)
                rating_user = prediction_for_movie

            weight = (1-users_weight[user])
            sum_for_movie += weight * rating_user
            ratings_for_movie.append(weight * rating_user)
            total_weight += weight

        
        movies_to_rating[movie] = (1-alfa) * (sum_for_movie/total_weight) + alfa * (min(ratings_for_movie))
        # print("{}) Movie: {} ----> Average Rating: {}".format(iteration, movie, movies_to_rating.get(movie)))
        iteration += 1
    
    movies_to_rating_ordered = dict(sorted(movies_to_rating.items(), key=lambda item: item[1], reverse=True))

    top_k_movies_recommend = dict(list(movies_to_rating_ordered.items())[:k])


    return top_k_movies_recommend

In [90]:
def get_group_list_sat(user, recommend_j, user_ratings_dict):
    total_preference_score = 0

    for movie in recommend_j.keys():
        rating_user = making_prediction(user, movie, user_ratings_dict)
        total_preference_score += rating_user

    return total_preference_score 


In [91]:
def get_user_list_sat(top_k_movie):
    total_preference_score = 0

    for _, rating in top_k_movie:
        total_preference_score += rating
    
    return total_preference_score


In [69]:
def sequential_hybrid_aggregation(users, iterations, k, alfa, user_ratings_dict, rating_source, movie_list):
    user_for_satisfaction = {}
    for user in users:
        user_for_satisfaction[user] = 0.5
    
    top_k_movie_for_user = {}
    for user in users:
        top_k_movie_for_user[user] = get_top_k_movie(user, movie_list, user_ratings_dict, k)
        print("User: {}  |  Top {} movie: {}".format(user, k, top_k_movie_for_user[user]))

    satisfaction_for_iteration = {}
    for i in range(iterations):
        print("Inizio iterazione {}".format(i))
        recommend = hybrid_aggregation(users, user_for_satisfaction, user_ratings_dict, k, alfa, rating_source)

        satisfaction_for_iteration[i] = []
        for user in users:
            group_list_sat = get_group_list_sat(user, recommend, user_ratings_dict)
            user_list_sat = get_user_list_sat(top_k_movie_for_user[user])
            satisfaction = group_list_sat/user_list_sat
            user_for_satisfaction[user] = satisfaction
            satisfaction_for_iteration.get(i).append((user, satisfaction))
            print("User: {}  |  Satisfaction: {}".format(user, satisfaction))
        
        min_user = min(user_for_satisfaction, key=user_for_satisfaction.get)
        user_for_satisfaction[min_user] = 1
        print(user_for_satisfaction.values())

    return satisfaction_for_iteration
      

In [97]:
def update_alfa(user_for_satisfaction):
    alfa = max(user_for_satisfaction.values()) - min(user_for_satisfaction.values())
    return alfa

In [98]:
def sequential_hybrid_aggregation(users, iterations, k, user_ratings_dict, rating_source, movie_list):
    user_for_satisfaction = {}
    for user in users:
        user_for_satisfaction[user] = 0.5
    
    top_k_movie_for_user = {}
    for user in users:
        top_k_movie_for_user[user] = get_top_k_movie(user, movie_list, user_ratings_dict, k)
        print("User: {}  |  Top {} movie: {}".format(user, k, top_k_movie_for_user[user]))

    satisfaction_for_iteration = {}
    for i in range(iterations):
        alfa = update_alfa(user_for_satisfaction)

        print("Inizio iterazione {} ----> Alfa: {}".format(i, alfa))
        recommend = hybrid_aggregation(users, user_for_satisfaction, user_ratings_dict, k, alfa, rating_source)

        satisfaction_for_iteration[i] = []
        for user in users:
            group_list_sat = get_group_list_sat(user, recommend, user_ratings_dict)
            user_list_sat = get_user_list_sat(top_k_movie_for_user[user])
            satisfaction = group_list_sat/user_list_sat
            user_for_satisfaction[user] = satisfaction
            satisfaction_for_iteration.get(i).append((user, satisfaction))
            print("User: {}  |  Satisfaction: {}".format(user, satisfaction))
        
        # min_user = min(user_for_satisfaction, key=user_for_satisfaction.get)
        # user_for_satisfaction[min_user] = 1
        print(user_for_satisfaction.values())

    return satisfaction_for_iteration

In [72]:
users = ["1", "2", "3"]

In [99]:
satisfaction_for_iteration = sequential_hybrid_aggregation(["345", "189", "538"], 3, 10, user_ratings_dict, rating_source, movie_list)

9662
User: 345  |  Top 10 movie: [('49284', 538.4394568990728), ('61350', 504.32135020079625), ('282', 418.26517625188507), ('8967', 245.5069817587504), ('190', 240.4369192228162), ('155509', 147.75010605770086), ('5034', 135.94502704739588), ('80693', 129.88080991767376), ('5909', 107.78353110048737), ('97860', 93.21417398277005)]
9704
User: 189  |  Top 10 movie: [('54771', 449.9877073666121), ('82169', 231.91038218471078), ('4248', 207.27086001725073), ('39449', 150.85176019281406), ('55247', 141.31959778345194), ('2316', 130.64100920190728), ('3955', 123.97112557888502), ('122902', 122.44493052046028), ('90888', 111.47387374741614), ('58554', 110.21722499079327)]
9687
User: 538  |  Top 10 movie: [('41997', 687.0228710246006), ('4316', 410.5874969149787), ('2664', 361.6897544196459), ('2736', 284.81568396742455), ('6963', 205.30804063459303), ('1016', 153.17610672752804), ('81456', 145.78652592667777), ('4617', 143.4855519200841), ('3879', 125.22545437897942), ('2502', 106.0135794869

In [101]:
import json

In [118]:
def write_to_file(satisfaction_for_iteration, group_satisfaction, disagreement):  
    with open("Experiments.txt", 'w') as f:
            for key, values in satisfaction_for_iteration.items():
                f.write(f"Iteration {key}\n")
                for value in values:
                    f.write(f"User: {value[0]}  ----> Satisfaction: {value[1]}\n")
            f.write("\nGroup Satisfaction: {}\nDisagreement: {}\n".format(group_satisfaction, disagreement))

In [114]:
def get_overall_user_satisfaction(user, satisfaction_for_iteration):
    overall_satisfaction = 0

    for _, satisfactions_users in satisfaction_for_iteration.items():
        for u, satisfaction in satisfactions_users:
            if u == user:
                overall_satisfaction += satisfaction
                break
    
    return overall_satisfaction/len(satisfaction_for_iteration)

In [115]:
def get_group_satisfaction_and_disagreement(users, satisfaction_for_iteration):
    overall_group_satisfacion = 0
    overall_satisfactions = []

    for user in users:
        overall_user_satisfaction = get_overall_user_satisfaction(user, satisfaction_for_iteration)
        overall_group_satisfacion += overall_user_satisfaction
        overall_satisfactions.append(overall_user_satisfaction)

    return (overall_user_satisfaction/(len(users))), (max(overall_satisfactions) - min(overall_satisfactions))

In [123]:
group_satisfaction, disagreement = get_group_satisfaction_and_disagreement(["345", "189", "538"], satisfaction_for_iteration)

In [124]:
group_satisfaction

0.2125180476824751

### Experiments

In [127]:
users_set = [["474", "599", "414"], ["45", "435", "572"], ["127", "256", "12"], ["549", "478", "82"], 
                ["601", "331", "393"], ["589", "200", "348"], ["8", "198", "406"], ["96", "441", "459"],
                ["28", "51", "585"], ["415", "587", "106"]]

In [128]:
def main():
    for users in users_set:
        satisfaction_for_iteration = sequential_hybrid_aggregation(users, 3, 10, user_ratings_dict, rating_source, movie_list)
        group_satisfaction, disagreement = get_group_satisfaction_and_disagreement(users, satisfaction_for_iteration)
        write_to_file(satisfaction_for_iteration, group_satisfaction, disagreement)
        

In [None]:
main

In [97]:
def get_users_ratings_for_movie(user_ratings_dict, movie_id):
    users = user_ratings_dict.keys()

    users_ratings_for_movie = {}
    for user in users:
        movies_ratings = user_ratings_dict[user]
        for movie, rating  in movies_ratings:
            if movie == movie_id:
                users_ratings_for_movie[user] = rating
                break
    
    return users_ratings_for_movie

In [98]:
def making_prediction_modified(user_a, movieId, user_ratings_dict):
    mean_rating_a = np.mean(get_ratings_for_user(user_a, user_ratings_dict))
    
    users_ratings_for_movie = get_users_ratings_for_movie(user_ratings_dict, movieId)
    top_k_similar_users = get_top_k_user(user_a, rating_source, user_ratings_dict, 100)

    top_k_similar_users = [similar_user[0] for similar_user in top_k_similar_users]
    filtered_ratings = {user: rating for user, rating in users_ratings_for_movie.items() if user in top_k_similar_users}

    numerator = 0
    denominator = 0
    
    for user_b, rating_user_b in filtered_ratings.items():
        if(user_b != user_a):
            rating_for_movie_b = float(rating_user_b)
            mean_rating_b = np.mean(get_ratings_for_user(user_b, user_ratings_dict))
            similarity = user_similarity(user_a, user_b, user_ratings_dict)
            numerator += similarity * (rating_for_movie_b - mean_rating_b)
            denominator += similarity

    prediction = mean_rating_a + (numerator / abs(denominator)) if abs(denominator) > 0 else 0

    return prediction

In [None]:
def get_top_k_movie(user_a, movie_list, user_ratings_dict, k):
    results_for_prediction = {}

    unowned_movies = get_unowned_movie_ids(user_ratings_dict[user_a], movie_list)

    # ratings_for_users = get_ratings_for_users(rating_source)


    print(len(unowned_movies))
    iteration = 1

    for movieId in unowned_movies:
        prediction = making_prediction_modified(user_a, movieId, user_ratings_dict)
        # print("{}) MovieId: {} ----> Prediction: {}".format(iteration, movieId, prediction))
        if prediction in results_for_prediction:
            results_for_prediction[prediction].append(movieId)
        else:
            results_for_prediction[prediction] = [movieId]
        iteration += 1
    
    results_for_prediction_ordered = sorted(results_for_prediction.items(), reverse=True)

    top_k_movies = []
    for prediction, movie_ids in results_for_prediction_ordered:
        for movie_id in movie_ids:
            if len(top_k_movies) < k:
                top_k_movies.append((movie_id, prediction))
            else:
                break
    
    return top_k_movies