(a) For producing group recommendations, we will use the user-based collaborative 
filtering approach as implemented in Assignment 1. Specifically, to produce group 
recommendations, we will first compute the movie recommendations for each user in 
the group, and then we will aggregate the lists of the individual users, so as to produce a 
single list of movies for the group.  

In [6]:
import pandas as pd
import numpy as np
import csv

In [7]:
# Load the four CSV tables of the "ml-latest-small" dataset into DataFrames.
dataset_links = pd.read_csv('ml-latest-small/links.csv')
dataset_movies = pd.read_csv('ml-latest-small/movies.csv')
dataset_ratings = pd.read_csv('ml-latest-small/ratings.csv')
dataset_tags = pd.read_csv('ml-latest-small/tags.csv')

# Report how many rows the ratings table contains.
rows_num = len(dataset_ratings)
print("Total number of rows:", rows_num)

# Preview the first five rows of the ratings table.
first5rows = dataset_ratings.head(5)
print("\nFirst five rows:")
print(first5rows)

Total number of rows: 100836

First five rows:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [8]:
def csv_to_dict(filename):
    """Load a ratings CSV into a nested ``{user_id: {movie_id: rating}}`` dict.

    The first line of the file is treated as a header and skipped. Every
    following row must contain exactly four fields: user id, movie id,
    rating, and a fourth field that is ignored.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A dictionary keyed by integer user id. Each value is a dictionary
        mapping integer movie id to the float rating given by that user.
    """
    ratings_by_user = {}
    with open(filename, newline='') as handle:
        rows = csv.reader(handle)
        next(rows)  # discard the header line
        for raw_user, raw_movie, raw_rating, _ in rows:
            user, movie, score = int(raw_user), int(raw_movie), float(raw_rating)
            # Create the per-user dictionary the first time a user is seen.
            ratings_by_user.setdefault(user, {})[movie] = score
    return ratings_by_user

In [9]:
def mean(dict):
    """Return the arithmetic mean of the values stored in a dictionary.

    Args:
        dict: Mapping whose values are numbers. The name shadows the builtin
            but is kept so that existing keyword callers keep working.

    Returns:
        The sum of the values divided by how many there are.

    Raises:
        ZeroDivisionError: If the mapping has no values.
    """
    scores = tuple(dict.values())
    return sum(scores) / len(scores)

In [10]:
# Parsed ratings files keyed by path. Each entry is (mtime, ratings dict) so a
# file that changed on disk is parsed again instead of being served stale.
_RATINGS_CACHE = {}


def _load_ratings_cached(filename):
    """Return csv_to_dict(filename), re-parsing only when the file's mtime changed."""
    import os  # local import keeps this block self-contained

    modified = os.path.getmtime(filename)
    entry = _RATINGS_CACHE.get(filename)
    if entry is None or entry[0] != modified:
        entry = (modified, csv_to_dict(filename))
        _RATINGS_CACHE[filename] = entry
    return entry[1]


def simPearson(user1_id, user2_id, filename='ml-latest-small/ratings.csv', users_ratings=None):
    """Pearson correlation between two users' ratings.

    Deviations are taken from each user's mean over *all* of their ratings,
    while the sums run only over the movies both users rated.

    Args:
        user1_id: Id of the first user.
        user2_id: Id of the second user.
        filename: Ratings CSV to load when ``users_ratings`` is not given.
            The parsed file is cached, so repeated calls (e.g. one per
            candidate neighbour) do not parse it again.
        users_ratings: Optional pre-loaded ``{user_id: {movie_id: rating}}``
            mapping. When provided, ``filename`` is not read.

    Returns:
        The similarity as a float. 0.0 is returned when the users share no
        rated movie or when the denominator is zero.

    Raises:
        KeyError: If either user id is missing from the ratings mapping.
    """
    if users_ratings is None:
        users_ratings = _load_ratings_cached(filename)
    user1_ratings = users_ratings[user1_id]
    user2_ratings = users_ratings[user2_id]

    # movies rated by both users
    common_movies = set(user1_ratings.keys()) & set(user2_ratings.keys())
    if not common_movies:
        return 0.0

    # Mean rating of each user (ra, rb). Computed after the overlap check so
    # that a user with no ratings yields 0.0 rather than a division by zero.
    mean_user1 = sum(user1_ratings.values()) / len(user1_ratings)
    mean_user2 = sum(user2_ratings.values()) / len(user2_ratings)

    # deviations from each user's own mean, restricted to the common movies
    diff_user1 = np.array([user1_ratings[movie] - mean_user1 for movie in common_movies])
    diff_user2 = np.array([user2_ratings[movie] - mean_user2 for movie in common_movies])

    numerator = np.sum(diff_user1 * diff_user2)

    # square roots of the summed squared deviations of each user
    norm_user1 = np.sqrt(np.sum(diff_user1 ** 2))
    norm_user2 = np.sqrt(np.sum(diff_user2 ** 2))
    denominator = norm_user1 * norm_user2

    if denominator == 0:
        return 0.0

    return float(numerator / denominator)

In [11]:
def createMatrix():
    """Build the user x movie rating matrix from the MovieLens CSV files.

    Movie ids are taken from the first column of
    'ml-latest-small/movies.csv' (header skipped); ratings are read from
    'ml-latest-small/ratings.csv'.

    Returns:
        A DataFrame indexed by ``userId`` with one column per movie id listed
        in movies.csv, in file order. Cells hold the rating produced by
        ``pivot_table`` (its default aggregation) and are NaN where the user
        has no rating for that movie.
    """
    # ids of every movie in the catalogue, including ones nobody has rated
    with open('ml-latest-small/movies.csv', newline='') as movies_file:
        movie_rows = csv.reader(movies_file)
        next(movie_rows)
        catalogue_ids = [int(fields[0]) for fields in movie_rows]

    ratings_frame = pd.read_csv('ml-latest-small/ratings.csv')

    # rows = users, columns = movies, values = ratings
    pivoted = ratings_frame.pivot_table(index='userId', columns='movieId', values='rating')

    # add the never-rated movies as all-NaN columns and fix the column order
    return pivoted.reindex(columns=catalogue_ids)

In [12]:
def compute_user_similarities(user_id):
    """Map every other user id to its Pearson similarity with ``user_id``.

    The candidate users are the hard-coded ids 1 to 610.

    Args:
        user_id: Id of the user every other user is compared with.

    Returns:
        A dictionary ``{other_user_id: similarity}`` covering ids 1..610
        except ``user_id``, with similarities computed by ``simPearson``.

    Raises:
        ValueError: If ``user_id`` is not one of the ids 1..610.
    """
    others = list(range(1, 611))
    others.remove(user_id)  # the target user is never compared with itself
    return {other: simPearson(user_id, other) for other in others}

In [13]:
def fastPrediction(user_id, movie, user_movie_ratings_matrix, user_similarity_dict):
    """Predict the rating ``user_id`` would give ``movie``.

    Works from two precomputed structures, the ratings matrix and the
    similarity dictionary, instead of re-reading the data.

    Args:
        user_id: Row label of the target user in the ratings matrix.
        movie: Column label of the movie in the ratings matrix.
        user_movie_ratings_matrix: DataFrame of ratings (rows = users,
            columns = movies, NaN = not rated).
        user_similarity_dict: Similarity of each other user to ``user_id``,
            indexed by user id.

    Returns:
        The stored rating if the user already rated the movie. Otherwise the
        user's mean rating plus the similarity-weighted average of how far
        each rater of the movie deviates from their own mean; just the
        user's mean when the absolute similarities sum to zero.
    """
    # an existing rating is returned as-is, no prediction is made
    existing = user_movie_ratings_matrix.at[user_id, movie]
    if not np.isnan(existing):
        return existing

    # mean rating of the target user
    target_mean = user_movie_ratings_matrix.loc[user_id].mean()

    # only users who rated the movie contribute
    raters = user_movie_ratings_matrix[movie].dropna().index.unique()

    weighted_deviation = 0.0
    weight_total = 0.0
    for rater in raters:
        rater_mean = user_movie_ratings_matrix.loc[rater].mean()
        rater_score = user_movie_ratings_matrix.at[rater, movie]
        weight = user_similarity_dict[rater]
        weighted_deviation += weight * (rater_score - rater_mean)
        weight_total += abs(weight)

    if weight_total == 0.0:
        return target_mean
    return target_mean + (weighted_deviation / weight_total)

In [37]:
def getUnratedMoviePredictions(user_id, unrated_movies, ratings_matrix, filename = 'ml-latest-small/ratings.csv'):
    """Predict one user's score for each movie in a list.

    Args:
        user_id: Id of the user to predict for.
        unrated_movies: Iterable of movie ids to score.
        ratings_matrix: User x movie ratings matrix handed to
            ``fastPrediction``.
        filename: Not used by this function.

    Returns:
        A list of ``(movie_id, prediction)`` tuples, in the order of
        ``unrated_movies``.
    """
    # similarities between the target user and every other user, computed once
    neighbour_weights = compute_user_similarities(user_id)
    return [
        (candidate, fastPrediction(user_id, candidate, ratings_matrix, neighbour_weights))
        for candidate in unrated_movies
    ]

In [28]:
def getUnratedMovies(user_id, filename = 'ml-latest-small/ratings.csv'):
    """List the ids of the movies a user has not rated.

    Args:
        user_id: Id of the user.
        filename: Ratings CSV used to find what the user has rated.

    Returns:
        The movie ids from 'ml-latest-small/movies.csv' (first column, header
        skipped, file order) for which the user has no rating.
    """
    ratings_by_user = csv_to_dict(filename)
    with open('ml-latest-small/movies.csv', newline='') as movies_file:
        movie_rows = csv.reader(movies_file)
        next(movie_rows)
        catalogue_ids = [int(fields[0]) for fields in movie_rows]
    # keep only the movies absent from this user's ratings
    return [movie for movie in catalogue_ids if movie not in ratings_by_user[user_id]]

In [29]:
def getUnionUnratedMovies(users):
    """Return the union of the movies left unrated by the given users.

    Args:
        users: Iterable of user ids.

    Returns:
        A list, without duplicates, of every movie id that at least one of
        the users has not rated (as reported by ``getUnratedMovies``).
    """
    pooled = set()
    for member in users:
        # merge this member's unrated movies into the shared pool
        pooled.update(getUnratedMovies(member))
    return list(pooled)

(a1) Average Aggregation

In [50]:
def meanTopKGroupPrediction(users, k, filename = 'ml-latest-small/ratings.csv'):
    """Top-k group recommendations using average aggregation.

    Each candidate movie is scored with the mean of the group members'
    individual predictions. Candidates are the movies that at least one
    member has not rated.

    Args:
        users: Iterable of user ids forming the group.
        k: Maximum number of movies to return.
        filename: Not used by this function.

    Returns:
        A list of at most ``k`` tuples ``(movie_id, average_prediction)``
        sorted by descending average. An empty group yields ``[]``.
    """
    users = list(users)
    # Without members there is nothing to aggregate; return early instead of
    # failing on an empty predictions list further down.
    if not users:
        return []

    # union of unrated movies
    unrated_movies = getUnionUnratedMovies(users)
    # ratings matrix
    ratings_matrix = createMatrix()

    # one predictions list per user, all aligned on the order of unrated_movies
    predictions_lists = []
    for user_id in users:
        print("Generating predictions for user ", user_id)
        predictions_lists.append(
            getUnratedMoviePredictions(user_id, unrated_movies, ratings_matrix)
        )

    # zip(*...) groups the tuples that refer to the same movie across users
    group_size = len(predictions_lists)
    average_predictions = [
        (per_user[0][0], sum(score for _, score in per_user) / group_size)
        for per_user in zip(*predictions_lists)
    ]

    # highest average first, then keep only the first k
    sorted_predictions = sorted(average_predictions, key=lambda x: x[1], reverse=True)
    return sorted_predictions[:k]

In [51]:
# Example group of three users: print its top-10 movies under average aggregation.
users = [414,474,599]
print(meanTopKGroupPrediction(users,10))

Generating predictions for user  414
Generating predictions for user  474


(a2) Least Misery Aggregation

In [42]:
def LMTopKGroupPrediction(users, k, filename = 'ml-latest-small/ratings.csv'):
    """Top-k group recommendations using least-misery aggregation.

    Each candidate movie is scored with the lowest of the group members'
    individual predictions. Candidates are the movies that at least one
    member has not rated.

    Args:
        users: Iterable of user ids forming the group.
        k: Maximum number of movies to return.
        filename: Not used by this function.

    Returns:
        A list of at most ``k`` tuples ``(movie_id, minimum_prediction)``
        sorted by descending minimum. An empty group yields ``[]``.
    """
    users = list(users)
    # Without members there is nothing to aggregate; return early instead of
    # failing on an empty predictions list further down.
    if not users:
        return []

    # union of unrated movies
    unrated_movies = getUnionUnratedMovies(users)
    # ratings matrix
    ratings_matrix = createMatrix()

    # one predictions list per user, all aligned on the order of unrated_movies
    predictions_lists = []
    for user_id in users:
        print("Generating predictions for user ", user_id)
        predictions_lists.append(
            getUnratedMoviePredictions(user_id, unrated_movies, ratings_matrix)
        )

    # zip(*...) groups the tuples that refer to the same movie across users
    lm_predictions = [
        (per_user[0][0], min(score for _, score in per_user))
        for per_user in zip(*predictions_lists)
    ]

    # highest minimum first, then keep only the first k
    sorted_predictions = sorted(lm_predictions, key=lambda x: x[1], reverse=True)
    return sorted_predictions[:k]

In [43]:
# Example group of three users: print its top-10 movies under least-misery aggregation.
users = [414,474,599]
print(LMTopKGroupPrediction(users,10))

Generating predictions for user  414
Generating predictions for user  474
Generating predictions for user  599
[(5105, 6.066619005872367), (6967, 6.066619005872367), (7114, 6.066619005872367), (7742, 6.066619005872367), (3604, 5.6150230133280985), (97024, 5.420621468926553), (138186, 5.402772387984922), (107013, 5.402772387984922), (173307, 5.402772387984922), (86068, 5.402772387984922)]


(b) The methods employed in part (a) of Assignment 2 do not consider any disagreements between the users in the group. In part (b) of Assignment 2, define a way for counting the disagreements between the users in a group, and propose a method that takes disagreements into account when computing suggestions for the group. Implement your method and explain why it is useful when producing group recommendations. Use again the group of 3 users, and for this group, show the top-10 recommendations, i.e., the 10 movies with the highest
prediction scores that your method suggests. Use the MovieLens 100K rating dataset.

In [47]:
def disagreementsTopKGroupPrediction(users, k, threshold=1):
    """Top-k group recommendations that exclude movies the group disagrees on.

    For every candidate movie the members' predictions are averaged. A member
    "disagrees" on a movie when their own prediction is more than
    ``threshold`` below that average. Movies with at least one disagreeing
    member are discarded; the remaining ones are ranked by average.

    Args:
        users: Iterable of user ids forming the group.
        k: Maximum number of movies to return.
        threshold: How far below the group average a member's prediction may
            fall before it counts as a disagreement. Defaults to 1.

    Returns:
        A list of at most ``k`` tuples
        ``(movie_id, average_prediction, disagreement_count)`` sorted by
        descending average; the count is 0 for every returned movie. An
        empty group yields ``[]``.
    """
    users = list(users)
    # Without members there is nothing to aggregate; return early instead of
    # failing on an empty predictions list further down.
    if not users:
        return []

    # union of the movies left unrated by each user in the group
    unrated_movies = getUnionUnratedMovies(users)

    # user x movie ratings matrix
    user_movie_ratings_matrix = createMatrix()

    # one predictions list per user, all aligned on the order of unrated_movies
    predictions_lists = []
    for user_id in users:
        print("Generating predictions for user ", user_id)
        predictions_lists.append(
            getUnratedMoviePredictions(user_id, unrated_movies, user_movie_ratings_matrix)
        )

    print("Calculating mean predictions and disagreements...")
    group_size = len(predictions_lists)
    average_disagreements = []
    # zip(*...) groups the tuples that refer to the same movie across users
    for per_user in zip(*predictions_lists):
        scores = [score for _, score in per_user]
        average_value = sum(scores) / group_size

        # number of members whose prediction is too far below the average
        disagreement_count = sum(1 for score in scores if score < average_value - threshold)

        average_disagreements.append((per_user[0][0], average_value, disagreement_count))

    # keep only the movies nobody disagrees on
    no_disagreement_predictions = [entry for entry in average_disagreements if entry[2] == 0]

    # highest average first, then keep only the first k
    sorted_predictions = sorted(no_disagreement_predictions, key=lambda x: x[1], reverse=True)
    return sorted_predictions[:k]


In [48]:
# Example group of three users: print its top-10 movies under the disagreement-aware method.
users = [414,474,599]
print(disagreementsTopKGroupPrediction(users,10))

Generating predictions for user  414
Generating predictions for user  474
Generating predictions for user  599
Calculating mean predictions and disagreements...
[(5105, 6.568890099610708, 0), (6967, 6.568890099610708, 0), (7114, 6.568890099610708, 0), (7742, 6.568890099610708, 0), (3604, 6.117294107066439, 0), (97024, 5.922892562664894, 0), (138186, 5.905043481723262, 0), (107013, 5.905043481723262, 0), (173307, 5.905043481723262, 0), (86068, 5.905043481723262, 0)]
