In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity
import random
from sklearn.linear_model import LinearRegression

# Step 1: Load and Explore the Dataset

In [None]:
# Load the ratings table from a path relative to the notebook's working directory.
# Later cells read its userId, movieId and rating columns.
df_ratings = pd.read_csv("ml-latest-small/ratings.csv")

In [None]:
# Show column dtypes, non-null counts and memory usage of the ratings table.
df_ratings.info()

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric ratings columns.
df_ratings.describe()

In [None]:
# Load the movies table; later cells use its movieId and title columns to map ids to titles.
df_movies = pd.read_csv("ml-latest-small/movies.csv")

In [None]:
# Show column dtypes, non-null counts and memory usage of the movies table.
df_movies.info()

# Step 2: Analyze the Sparsity of the Rating Matrix

#### Construct a user-item interaction matrix where rows represent users and columns represent movies, with ratings as values.

In [None]:
# Build the user x movie rating matrix: one row per userId, one column per movieId,
# and NaN wherever a user has not rated a movie.
df_pivot = df_ratings.set_index(["userId", "movieId"])["rating"].unstack("movieId")
df_pivot.info()

In [None]:
# Count how many ratings each user has given (non-NaN cells per row)
ratings_per_user = df_pivot.count(axis=1)

# Histogram with one bin per distinct rating count
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(ratings_per_user, bins=ratings_per_user.nunique())
ax.set_title('Distribuzione del numero di rating per utente')
ax.set_xlabel('Numero di rating')
ax.set_ylabel('Frequenza')
plt.show()

print(ratings_per_user.describe())

####  Compute the sparsity of the matrix:

In [None]:
# Sparsity = share of (user, movie) cells of the rating matrix that are empty.
# The denominator must be the number of cells in the user-item matrix
# (users x movies), not (number of ratings x number of movies).
num_of_ratings = len(df_ratings)
n_users, n_movies = df_pivot.shape
user_movie_pairs = n_users * n_movies
sparsity = 1 - num_of_ratings / user_movie_pairs
sparsity

# Step 3: Handle Missing Values

#### Identify movies and users with the most missing ratings.

In [None]:
# Lookup table movieId -> title (also used by later cells to print movie names)
movie_dict = dict(zip(df_movies["movieId"], df_movies["title"]))

# Number of missing ratings (NaN) per movie, largest first
missing_by_movie = df_pivot.isna().sum().sort_values(ascending=False)

# Replace the movie ids in the index with their titles
missing_by_movie.index = missing_by_movie.index.map(movie_dict)

# Show the movies with the most missing ratings
print(missing_by_movie.head(10))

In [None]:
# Number of missing ratings (NaN) per user, largest first
missing_per_row = df_pivot.isna().sum(axis="columns")
missing_by_user = missing_per_row.sort_values(ascending=False)

# Show the users with the most missing ratings
print(missing_by_user.head(10))

#### Apply simple missing value imputation techniques:
- Fill missing ratings with the movie’s average rating.

In [None]:
# Mean rating of every movie (column-wise mean, NaN ignored)
movie_avg = df_pivot.mean(axis=0)

# Fill each column's NaN cells with that movie's mean
# (fillna with a Series matches the Series labels against the column labels)
df_movie_avg_filled = df_pivot.fillna(movie_avg)

- Fill missing ratings with the user’s average rating.

In [None]:
# Mean rating of every user (row-wise mean, NaN ignored)
user_avg = df_pivot.mean(axis=1)

# Fill each row's NaN cells with that user's mean: transpose so users become
# columns, fill per column from user_avg, then transpose back
df_user_avg_filled = df_pivot.T.fillna(user_avg).T

- Fill missing ratings with the global average rating.

In [None]:
# Global mean over all observed ratings: stack() flattens the matrix into a
# single Series and mean() ignores missing values
global_avg = df_pivot.stack().mean() 

# Replace every missing cell with the global mean
df_global_avg_filled = df_pivot.fillna(global_avg)

In [None]:
# Use the user-mean-imputed matrix as the dense rating matrix for the
# similarity and prediction cells below.
chosen_df_filled = df_user_avg_filled

# Step 4: Week 2

#### Identify which kind of ratings the dataset contains

In [None]:
rating_values = df_ratings['rating']

# Distinct rating values present in the data
print("Uniques: ", rating_values.unique())

# Basic summary statistics
print("\nStats: \t", rating_values.describe())

# Distribution of the rating values
fig, ax = plt.subplots()
ax.hist(rating_values, bins=20)
ax.set_title('Distribuzione dei rating')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequenza')
plt.show()

#### Verify that the number of ratings per movie follows a long-tail distribution

In [None]:
# Number of ratings received by each movie (non-NaN cells per column)
ratings_per_movie = df_pivot.count()
sorted_counts = ratings_per_movie.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))

# Sorted curve exposes the long tail; the unsorted counts are overlaid for comparison
ax.plot(sorted_counts.values, label='Sorted Movie Ratings')
ax.plot(ratings_per_movie.values, alpha=0.5, label="Movie Ratings")

ax.set_xlabel("Movies sorted by number of ratings")
ax.set_ylabel("Number of ratings")
ax.set_title("Long Tail Distribution of Movie Ratings")
ax.legend()
ax.grid(True)
plt.show()

#### Implement neighbourhood-based collaborative filtering, using a user-based approach to identify the neighbourhood.

#### 1. Pearson Coefficient

In [None]:
# User-user Pearson correlation: users must be columns for DataFrame.corr,
# hence the transpose
corr_matrix = df_pivot.transpose().corr(method="pearson")

# Boolean mask selecting the strict upper triangle, so each unordered user
# pair appears once and self-correlations are dropped
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

# Long format: one row per (user_1, user_2) pair with a defined correlation
pairwise_corr = corr_matrix.where(mask).stack()
pairwise_corr = pairwise_corr.rename_axis(["user_1", "user_2"])
df_pearson = pairwise_corr.reset_index(name="pearson_corr")

for view in (df_pearson.head(10), df_pearson.describe()):
    display(view)


#### 2. Cosine Similarity

In [None]:
# Cosine similarity between users, computed on the imputed rating matrix
user_similarity = cosine_similarity(chosen_df_filled)

# Wrap the raw array in a DataFrame labelled by user id on both axes
user_ids = chosen_df_filled.index
user_similarity_df = pd.DataFrame(user_similarity, index=user_ids, columns=user_ids)
print("Similarità tra utenti:\n", user_similarity_df)

In [None]:
def predict_rating(utente_target, film_target, user_similarity_df=user_similarity_df, k=None):
    """
    Predict a user's rating for a movie as a similarity-weighted average of
    the other users' ratings.

    Ratings are read from the module-level ``chosen_df_filled`` matrix and the
    movie title is looked up in the module-level ``movie_dict``.

    Parameters:
    - utente_target: ID of the user the prediction is made for.
    - film_target: ID of the movie to predict.
    - user_similarity_df (pd.DataFrame): user x user similarity matrix
      (defaults to the cosine similarity computed when this cell was run).
    - k (int, optional): if given, only the k most similar users are used;
      None (default) uses every other user, as before.

    Returns:
    - float: the predicted rating, or np.nan when the similarity weights sum to zero.

    Raises:
    - ValueError: if the user or the movie is not in ``chosen_df_filled``.
    """
    if utente_target not in chosen_df_filled.index:
        raise ValueError(f"L'utente {utente_target} non esiste nel dataset.")
    if film_target not in chosen_df_filled.columns:
        raise ValueError(f"Il film {film_target} non esiste nel dataset.")

    # Rank the other users by similarity (the target user is excluded)
    neighborhood = (
        user_similarity_df[utente_target]
        .drop(utente_target)
        .sort_values(ascending=False)
    )
    if k is not None:
        neighborhood = neighborhood.head(k)
    print(f"\nVicinato dei 5 utenti di [{utente_target}]:\n", neighborhood.head(5))

    # Neighbours' ratings for the target movie, aligned with the similarity weights
    ratings_from_neighbors = chosen_df_filled.loc[neighborhood.index, film_target]

    # Ignore pairs where either the rating or the similarity is undefined
    usable = ratings_from_neighbors.notna() & neighborhood.notna()
    weights = neighborhood[usable]

    weighted_sum = (ratings_from_neighbors[usable] * weights).sum()
    # Normalise by the sum of absolute similarities: with signed similarities
    # (mean-centred or discounted) a signed sum can cancel towards zero and
    # blow the prediction up.
    similarity_sum = weights.abs().sum()

    # Avoid division by zero
    predicted_rating = weighted_sum / similarity_sum if similarity_sum != 0 else np.nan

    film_label = movie_dict.get(film_target, film_target)
    print(f"\nRating previsto per utente [{utente_target}] su {film_label}: {predicted_rating:.2f}")
    return predicted_rating

In [None]:
# Pick a demo (user, film) pair. A dedicated, seeded generator makes the
# choice identical on every Restart & Run All, so the predictions printed by
# the following cells are reproducible and comparable between runs.
_demo_rng = random.Random(42)
chosen_user = _demo_rng.choice(chosen_df_filled.index)
chosen_film = _demo_rng.choice(chosen_df_filled.columns)

predict_rating(chosen_user, chosen_film, user_similarity_df)

#### Handle the personal bias of each user by mean-centering the ratings present in the dataset

In [None]:
# Centrare i dati sottraendo la media dei rating per ogni utente
user_means = chosen_df_filled.mean(axis=1)
df_centered = chosen_df_filled.sub(user_means, axis=0)

# Ricalcolare la similarità del coseno sui dati centrati
user_similarity = cosine_similarity(df_centered)
user_similarity_no_bias_df = pd.DataFrame(user_similarity, index=chosen_df_filled.index, columns=chosen_df_filled.index)

In [None]:
# Same user/film pair as before, now using the similarity computed on mean-centred ratings.
predict_rating(chosen_user, chosen_film, user_similarity_no_bias_df)

#### Apply a Discount Factor in the calculation of similarity

In [None]:
# Conteggio degli item in comune tra due utenti
common_items_count = df_pivot.notna().astype(int).dot(df_pivot.notna().astype(int).T)

# Discount factor: più bassa è la sovrapposizione, più forte è lo sconto
min_common_items = 15  # Visto che minimo un utente ha recensito 20 film, scelto 15 come n# film comuni
discount_factor = common_items_count / (common_items_count + min_common_items)

# Applicare il discount factor alla similarità
user_similarity_discounted = user_similarity_no_bias_df * discount_factor
user_similarity_discounted_df = pd.DataFrame(user_similarity_discounted, index=chosen_df_filled.index, columns=chosen_df_filled.index)

In [None]:
# Same user/film pair, now using the overlap-discounted similarity.
predict_rating(chosen_user, chosen_film, user_similarity_discounted_df)

####  Integrate a strategy to handle the Long Tail Problem

In [None]:
# 1. Calcolare il numero di utenti che hanno visto ogni film (non-NaN)
num_users_per_movie = df_pivot.notna().sum()

# 2. Calcolare l'Inverse User Frequency (IUF) per ciascun film
total_users = df_pivot.shape[0]  # Numero totale di utenti (righe)
iuf = np.log(total_users / num_users_per_movie)

# 3. Creare una matrice IUF per ogni film (con le colonne in IUF)
iuf_matrix = df_pivot.copy()  # Copia del dataframe per aggiungere l'IUF
for movie in df_pivot.columns:
    iuf_matrix[movie] = df_pivot[movie] * iuf[movie]  # Ponderare il rating per l'IUF

print(iuf_matrix.info(), "\n")
print(iuf_matrix.stack().describe())

In [None]:
def _fill_row_with_own_mean(row):
    """Replace the NaN entries of one user's row with that row's mean."""
    return row.fillna(row.mean())


# Recompute the similarity on the IUF-weighted ratings
iuf_matrix_filled = iuf_matrix.apply(_fill_row_with_own_mean, axis=1)
cos_iuf_matrix = cosine_similarity(iuf_matrix_filled)

# Wrap the similarity array in a DataFrame labelled by user id on both axes
iuf_user_ids = iuf_matrix.index
iuf_similarity_df = pd.DataFrame(cos_iuf_matrix, index=iuf_user_ids, columns=iuf_user_ids)

# Show the similarity matrix
print(iuf_similarity_df)

In [None]:
# Same user/film pair, now using the similarity computed on IUF-weighted ratings.
predict_rating(chosen_user, chosen_film, iuf_similarity_df)

#  User-Based Nearest Neighbour Regression Collaborative Filtering

In [None]:
# Function to retrieve top-k neighbors for a given user
def get_top_k_neighbors(u, similarity_df, k=10):
    """
    Return the labels of the k users most similar to user ``u``.

    Parameters:
    - u (int): ID of the target user.
    - similarity_df (pd.DataFrame): pairwise user similarity scores; the row
      labelled ``u`` is read.
    - k (int, optional): how many neighbors to return (default 10).

    Returns:
    - pd.Index: labels of the k highest-scoring users, best first. ``u`` itself
      is never included.
    """
    candidates = similarity_df.loc[u]

    # A user is not their own neighbor; the label may be absent if the frame
    # is not square, in which case there is nothing to remove.
    if u in candidates.index:
        candidates = candidates.drop(u)

    best = candidates.nlargest(k)
    return best.index


In [None]:
def train_regression_for_user(u, df_pivot, user_avg, neighbors, min_common=2, verbose=False):
    """
    Trains a linear regression model for a given user to predict their ratings
    based on their neighbors' ratings.

    Regression equation (no intercept):
        (r_{uj} - mu_u) = sum_v [ w_{vu} * (r_{vj} - mu_v) ]
    for all items j that user u has rated. A neighbor that has not rated item j
    contributes a feature value of 0 for that item.

    Parameters:
    - u (int): The target user ID for whom the regression model is being trained.
    - df_pivot (pd.DataFrame): Pivot table of user-item ratings with NaN for missing values.
    - user_avg (pd.Series): Series containing the average rating for each user.
    - neighbors (list or pd.Index): Neighbor user IDs to be used as features in the regression model.
    - min_common (int, optional): Minimum number of items rated by u required to train the model (default = 2).
    - verbose (bool, optional): If True, prints additional information about the training process (default = False).

    Returns:
    - np.ndarray: Array of learned weights w_{vu} (one for each neighbor); all
      zeros when there are too few training points.
    - float: Mean squared error (MSE) on the training set, or np.nan when there
      are too few training points.
    """
    # Ratings actually given by user u (missing entries removed)
    rated = df_pivot.loc[u].dropna()
    items_rated_by_u = rated.index
    n_items = len(items_rated_by_u)

    # Bail out before building any matrices if there is not enough data
    if n_items < min_common:
        if verbose:
            print(f"[User {u}] Too few items for regression: {n_items} < {min_common}")
        return np.zeros(len(neighbors)), np.nan

    # Target vector: user u's mean-centered ratings
    Y = rated.to_numpy(dtype=float) - user_avg[u]

    # Feature matrix, built in one shot instead of one .loc lookup per cell:
    # rows = items rated by u, columns = neighbors, values = the neighbor's
    # mean-centered rating, or 0 where the neighbor has not rated the item.
    neighbor_ratings = df_pivot.loc[neighbors, items_rated_by_u].to_numpy(dtype=float)
    neighbor_means = user_avg.loc[neighbors].to_numpy(dtype=float)
    centered = neighbor_ratings - neighbor_means[:, np.newaxis]
    X = np.where(np.isnan(centered), 0.0, centered).T

    # Train a linear regression model without an intercept
    model = LinearRegression(fit_intercept=False)
    model.fit(X, Y)

    # Calculate MSE on the training set
    Y_pred = model.predict(X)
    mse = np.mean((Y_pred - Y)**2)

    if verbose:
        print(f"[User {u}] MSE: {mse:.4f}")

    return model.coef_, mse

In [None]:
# Per-user regression results: user id -> {'neighbors': ..., 'weights': ...}
user_regression_models = {}
# Training MSE of every user, in iteration order (NaN where no model was fitted)
all_mse = []

# Number of top similar users to retain
k = 40

# Fit one model per user of the pivot table
for u in df_pivot.index:
    # The k most similar users according to the cosine similarity matrix
    neighbors_u = get_top_k_neighbors(u, user_similarity_df, k)

    # Regression of u's centered ratings on the neighbors' centered ratings;
    # verbose=True prints the MSE of each user while training
    w_u, mse_u = train_regression_for_user(
        u=u,
        df_pivot=df_pivot,
        user_avg=user_avg,
        neighbors=neighbors_u,
        min_common=2,
        verbose=True,
    )

    # Keep neighbors and weights together so predictions can pair them up later
    user_regression_models[u] = dict(neighbors=neighbors_u, weights=w_u)
    all_mse.append(mse_u)

# Average the MSE over the users that actually got a model (NaN entries dropped)
all_mse = [m for m in all_mse if not np.isnan(m)]
if not all_mse:
    print("No MSE available (too many users with too few ratings?).")
else:
    print(f"\nAverage MSE across all users: {np.mean(all_mse):.4f}")

![](images/img.png)

![](images/img_2.png)

In [None]:
def predict_rating_regression(u, j, df_pivot, user_avg, user_regression_models):
    """
    Predict user u's rating of item j from the learned neighbor regression weights.

    Formula: r_{uj} = mu_u + sum_v [ w_{vu} * (r_{vj} - mu_v) ], where a neighbor
    that has not rated item j is treated as having given their own mean rating
    (so its deviation is zero).

    Parameters:
    - u (int): ID of the user for whom the rating is predicted.
    - j (int): ID of the item (movie) to predict the rating for.
    - df_pivot (pd.DataFrame): users as rows, items as columns, ratings as values.
    - user_avg (pd.Series): mean rating of each user.
    - user_regression_models (dict): per user, a dict with
        - 'neighbors': the neighbor IDs,
        - 'weights': the learned regression weights, in the same order.

    Returns:
    - float: the predicted rating, or np.nan if u has no regression model.
    """
    model = user_regression_models.get(u)
    if model is None:
        return np.nan

    neighbor_ids = model['neighbors']
    coefficients = model['weights']
    baseline = user_avg[u]

    # Accumulate the weighted deviations one neighbor at a time
    adjustment = 0.0
    for neighbor, coefficient in zip(neighbor_ids, coefficients):
        observed = df_pivot.loc[neighbor, j]
        neighbor_mean = user_avg[neighbor]
        # Fall back to the neighbor's mean when the item is unrated
        effective_rating = neighbor_mean if pd.isna(observed) else observed
        adjustment += coefficient * (effective_rating - neighbor_mean)

    return baseline + adjustment

In [None]:
# Regression-based prediction for the same demo user/film pair used above
pred = predict_rating_regression(
    u=chosen_user,
    j=chosen_film,
    df_pivot=df_pivot,
    user_avg=user_avg,
    user_regression_models=user_regression_models,
)
chosen_title = movie_dict[chosen_film]
print(f"Predicted rating for user [{chosen_user}] on item {chosen_title}: {pred:.2f}")