## Notebook2: User-based Collaborative Filtering Movie Recommendation System

In [21]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity


In [22]:
# Set up the environment path for the dataset
os.environ['BASE_PATH'] = '/l/users/chaimaa.abi/BigData2/BigData/Dataset/ml-25m/ml-25m'

In [23]:
# Function to fetch data correctly handling headers
def fetch_data(file_name, nrows=None):
    base_path = os.getenv('BASE_PATH')  # Get the base path from the environment variables
    full_data_path = os.path.join(base_path, file_name)  # Create the full path by joining the base path with the file name
    return pd.read_csv(full_data_path, nrows=nrows)  # Read the CSV file located at the full path, optionally reading only a certain number of rows specified by 'nrows'


The following function computes similarities between movies based on user rating patterns. The similarities can be used by a recommendation system to suggest new movies to users.

It takes a matrix of user ratings for movies and computes one of three similarity measures between movie pairs:

1. Pearson Correlation: Linear correlation of rating patterns.
2. Cosine Similarity: Angle between rating vectors. 
3. Jaccard Similarity: Overlap of users who rated each movie.


In [24]:
# Compute different similarities
def compute_similarities(matrix, method='pearson'):
    # Check if the matrix is empty or entirely NaN
    if matrix.empty or matrix.isna().all().all():
        print("Warning: The matrix is empty or entirely NaN.")
        return pd.DataFrame()  # Return an empty DataFrame as a safe fallback

    # Pearson correlation
    if method == 'pearson':
        return matrix.T.corr()
    # Cosine similarity
    elif method == 'cosine':
        # Normalize the matrix by subtracting the mean of each row
        normalized_matrix = matrix.sub(matrix.mean(axis=1), axis=0).fillna(0)
        if normalized_matrix.empty:
            print("Warning: Normalized matrix is empty.")
            return pd.DataFrame()  # Return an empty DataFrame if the normalized matrix is empty
        # Compute cosine similarity between rows
        return pd.DataFrame(cosine_similarity(normalized_matrix), index=matrix.index, columns=matrix.index)
    # Jaccard similarity
    elif method == 'jaccard':
        # Convert the matrix to binary (1 for non-NaN values, 0 for NaN values)
        binary_matrix = matrix.notna().astype(int)
        # Compute intersection of binary matrix
        intersection = np.dot(binary_matrix, binary_matrix.T)
        # Compute sum of rows
        row_sums = binary_matrix.sum(axis=1).values
        # Compute union of binary matrix
        union = row_sums.reshape(-1, 1) + row_sums - intersection
        # Compute Jaccard similarity between rows
        jaccard = np.where(union == 0, 0, intersection / union)
        return pd.DataFrame(jaccard, index=matrix.index, columns=matrix.index)

In [26]:
def compute_rmse(data, user_matrix, norm_matrix, sim_matrix):
    estimates = []
    for _, row in data.iterrows():
        pred_rating = predict_rating(row['userId'], row['movieId'], user_matrix, norm_matrix, sim_matrix)
        if not np.isnan(pred_rating):  # Ensure the prediction is valid
            estimates.append((pred_rating, row['rating']))

    # Calculate RMSE only on estimated ratings
    if estimates:
        predicted, actual = zip(*estimates)  # This separates predicted and actual ratings into separate tuples
        return np.sqrt(mean_squared_error(actual, predicted))
    else:
        return None  # or a sensible default/error message

# Revising the prediction function to handle edge cases better:
def predict_rating(user, item, user_matrix, norm_matrix, sim_matrix):
    if item not in user_matrix.columns:
        return np.nan  # Use NaN for items that have no ratings, so they don't affect RMSE

    relevant_users = norm_matrix[item].dropna()
    weights = sim_matrix.loc[user].reindex(relevant_users.index).dropna()

    if weights.empty:
        return np.nan  # Return NaN if no similar users to avoid affecting RMSE

    weighted_sum = (relevant_users * weights).sum()
    scale = abs(weights).sum()
    base = user_matrix.loc[user].mean()

    return base + (weighted_sum / scale if scale else 0)


In [27]:
# Generate recommendations including movie names
def generate_recommendations(user_id, user_item_matrix, similarity_matrix, movies_df, num_recommendations=10):
    # Get the list of unrated items for the user
    unrated_items = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id].isna()].index
    # Create a dictionary to store predicted ratings for unrated items
    predictions = {item: predict_rating(user_id, item, user_item_matrix, user_item_matrix.subtract(user_item_matrix.mean(axis=1), axis=0).fillna(0), similarity_matrix) for item in unrated_items}
    # Sort the predictions in descending order of predicted rating
    recommended_items = sorted(predictions.items(), key=lambda x: x[1], reverse=True)[:num_recommendations]
    # Get the movie details for the recommended items
    recommended_movies = movies_df[movies_df['movieId'].isin([item[0] for item in recommended_items])]
    # Add the predicted ratings to the recommended movies DataFrame
    recommended_movies['predicted_rating'] = [item[1] for item in recommended_items if item[0] in recommended_movies['movieId'].values]
    # Return the recommended movies DataFrame with movie titles and predicted ratings
    return recommended_movies[['title', 'predicted_rating']]

In [28]:
def simulate_user(user_matrix, movies_df, user_id=99999):
    """ Simulate a new user with specific ratings. """

    # Example movies and ratings
    simulated_ratings = {
        1: 5.0,  # Toy Story
        6377: 4.5,  # Finding Nemo
        6991: 5.0  # Tarzan
    }

    # Create a new row for the user
    new_user_row = pd.Series(
        data=[simulated_ratings.get(movie_id, np.nan) for movie_id in user_matrix.columns],
        index=user_matrix.columns
    )

    # Append the row to the user matrix
    user_matrix.loc[user_id] = new_user_row

    return user_matrix

In [29]:
def main():
    # Fetch data from CSV files
    ratings = fetch_data('ratings.csv')
    movies = fetch_data('movies.csv')

    # Create a user-item matrix and normalize it
    user_matrix = ratings.pivot_table(values='rating', index='userId', columns='movieId', aggfunc='mean').fillna(0)
    norm_matrix = user_matrix.subtract(user_matrix.mean(axis=1), axis=0)
    print("User Matrix and Norm Matrix Created.")

    # Compute similarity matrices
    pearson_sim = compute_similarities(user_matrix, 'pearson')
    cosine_sim = compute_similarities(user_matrix, 'cosine')
    jaccard_sim = compute_similarities(user_matrix, 'jaccard')
    print("Similarity Matrices Computed.")

    # Compute RMSE for different similarity measures
    rmse_pearson = compute_rmse(ratings, user_matrix, norm_matrix, pearson_sim)
    rmse_cosine = compute_rmse(ratings, user_matrix, norm_matrix, cosine_sim)
    rmse_jaccard = compute_rmse(ratings, user_matrix, norm_matrix, jaccard_sim)
    print(f"RMSE for Pearson Correlation: {rmse_pearson}")
    print(f"RMSE for Cosine Similarity: {rmse_cosine}")
    print(f"RMSE for Jaccard Similarity: {rmse_jaccard}")

    # Simulate a new user
    user_id = 99999  # A unique identifier for the simulated user
    user_matrix = simulate_user(user_matrix, movies, user_id)

    # Recompute the normalized matrix and similarities
    norm_matrix = user_matrix.subtract(user_matrix.mean(axis=1), axis=0)
    pearson_sim = compute_similarities(user_matrix, 'pearson')

    # Generate recommendations for the simulated user
    recommendations = generate_recommendations(user_id, user_matrix, pearson_sim, movies, num_recommendations=10)
    print(f"Recommendations for simulated user {user_id}:")
    if recommendations.empty:
        print("No recommendations available.")
    else:
        print(recommendations)

In [30]:
if __name__ == "__main__":
    main()

User Matrix and Norm Matrix Created.
Similarity Matrices Computed.
RMSE for Pearson Correlation: 0.7075622590235725
RMSE for Cosine Similarity: 0.7075622590235742
RMSE for Jaccard Similarity: 1.8958430031549052
Recommendations for simulated user 99999:
                                                  title  predicted_rating
108                                   Braveheart (1995)           6.26285
257           Star Wars: Episode IV - A New Hope (1977)           6.26285
314                    Shawshank Redemption, The (1994)           6.26285
328                                    Tommy Boy (1995)           6.26285
452                                Fugitive, The (1993)           6.26285
522                             Schindler's List (1993)           6.26285
1108             Monty Python and the Holy Grail (1975)           6.26285
1166  Star Wars: Episode V - The Empire Strikes Back...           6.26285
1167                         Princess Bride, The (1987)           6.26285
1179  S

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommended_movies['predicted_rating'] = [item[1] for item in recommended_items if item[0] in recommended_movies['movieId'].values]
