In [1]:
!pip install scikit-learn




In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [6]:
# Load the ratings table from 'reduced.csv' (path is relative to the notebook's
# working directory) and parse the 'timestamp' column as datetimes.
# Later cells read the 'userId', 'movieId' and 'rating' columns of this frame.
rating_df = pd.read_csv('reduced.csv', parse_dates=['timestamp'])

In [7]:
# Keep the ratings of a random 8% of users to make the dense matrices below tractable.
# A seeded generator makes the sample (and every downstream number) reproducible
# under Restart & Run All.
# NOTE: rating_df is overwritten here, so re-running this cell without re-running
# the load cell samples 8% of the already-reduced frame.
unique_users = rating_df['userId'].unique()
sample_rng = np.random.default_rng(42)
rand_users = sample_rng.choice(unique_users,
                               size=int(len(unique_users) * 0.08),
                               replace=False)

rating_df = rating_df.loc[rating_df['userId'].isin(rand_users)]

print('Reduced dataframe: {} rows for {} different users'.format(len(rating_df), len(rand_users)))

Reduced dataframe: 645055 rows for 4431 different users


In [8]:
# Sanity check: (rows, columns) of the sampled ratings frame.
rating_df.shape

(645055, 5)

In [11]:
# Pivot the ratings into a dense user x movie matrix; unrated cells are filled with 0.
# (pivot raises if a (userId, movieId) pair appears more than once.)
interaction_matrix = rating_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Sparse (CSR) copy of the full interaction matrix
sparse_matrix = csr_matrix(interaction_matrix.values)

# Row / column position -> actual user / movie id
user_ids = interaction_matrix.index.tolist()
movie_ids = interaction_matrix.columns.tolist()

# Hold out 20% of the individual ratings for evaluation (fixed seed for reproducibility)
train_data, test_data = train_test_split(rating_df, test_size=0.2, random_state=42)

# Training interaction matrix. Its rows are reindexed to the full user index so that
# row i always corresponds to user_ids[i], even if the random split left some user
# with no training ratings (that user then gets an all-zero row). Without this,
# labelling train-derived arrays with user_ids can fail on a length mismatch.
train_matrix = (
    train_data
    .pivot(index='userId', columns='movieId', values='rating')
    .reindex(index=interaction_matrix.index)
    .fillna(0)
)
sparse_train_matrix = csr_matrix(train_matrix.values)

In [12]:
# Pairwise cosine similarity between users' training rating vectors (dense ndarray)
similarity_values = cosine_similarity(sparse_train_matrix, dense_output=True)

# Wrap it in a DataFrame labelled with the index of the matrix it was computed from,
# so the labels cannot drift out of sync with the rows of train_matrix.
user_similarity = pd.DataFrame(similarity_values,
                               index=train_matrix.index,
                               columns=train_matrix.index)

In [15]:
def predict_ratings(user_similarity, train_matrix, all_movie_ids=None):
    """Predict a rating for every (user, movie) pair with user-based CF.

    Each prediction is the user's mean rating plus the similarity-weighted
    average of all users' deviations from their own means::

        pred[u, i] = mean[u] + sum_v sim[u, v] * (r[v, i] - mean[v]) / sum_v |sim[u, v]|

    Parameters
    ----------
    user_similarity : pandas.DataFrame
        Square user-by-user similarity matrix; row/column order must match the
        row order of ``train_matrix``.
    train_matrix : pandas.DataFrame
        User-by-movie rating matrix (rows are users, columns are movies).
    all_movie_ids : sequence, optional
        Movie ids the result must expose as columns. When omitted, the
        notebook-level ``movie_ids`` list is used (the previous behaviour).

    Returns
    -------
    pandas.DataFrame
        Predictions indexed by ``train_matrix.index`` with one column per id in
        ``all_movie_ids``; movies that are not columns of ``train_matrix`` are
        filled with 0.

    Raises
    ------
    ValueError
        If ``user_similarity`` is not (n_users, n_users) for the n_users rows
        of ``train_matrix``.
    """
    ratings = train_matrix.to_numpy()
    similarity = user_similarity.to_numpy()

    n_users = ratings.shape[0]
    if similarity.shape != (n_users, n_users):
        raise ValueError(
            'user_similarity has shape {} but train_matrix has {} users'.format(
                similarity.shape, n_users))

    # Per-user mean over *all* columns of train_matrix, as a column vector.
    # NOTE(review): if unrated cells are stored as 0 (the pivot cell uses
    # fillna(0)), those zeros are included in this mean -- confirm that is intended.
    user_mean = train_matrix.mean(axis=1).to_numpy().reshape(-1, 1)
    deviations = ratings - user_mean

    weighted_deviation = similarity.dot(deviations)
    similarity_mass = np.abs(similarity).sum(axis=1, keepdims=True)

    # A user whose similarities are all zero has nothing to borrow from; leave the
    # adjustment at 0 (prediction = own mean) instead of producing NaN/inf.
    adjustment = np.divide(
        weighted_deviation,
        similarity_mass,
        out=np.zeros_like(weighted_deviation, dtype=float),
        where=similarity_mass != 0,
    )
    pred = user_mean + adjustment

    # Label rows from the input itself rather than from a notebook global.
    pred_df = pd.DataFrame(pred, index=train_matrix.index, columns=train_matrix.columns)

    if all_movie_ids is None:
        all_movie_ids = movie_ids  # notebook-level list of every movie id

    # Expose every requested movie id; movies unseen in training get 0.
    return pred_df.reindex(columns=all_movie_ids, fill_value=0)


In [16]:
# Score every (user, movie) pair with the user-based CF model; the result is a
# DataFrame with one row per user and one column per movie id.
predicted_ratings = predict_ratings(user_similarity, train_matrix)

In [17]:
# Hit Ratio@k: share of held-out ratings whose movie is in the user's top-k predictions
def hit_ratio_at_10(test_data, predicted_ratings, top_k=10, train_data=None):
    """Compute the hit ratio at ``top_k`` over the rows of ``test_data``.

    A test row counts as a hit when its movie is among the ``top_k``
    highest-scored movies of its user. Rows whose user is not in
    ``predicted_ratings.index`` or whose movie is not in
    ``predicted_ratings.columns`` are skipped and do not count towards the total.

    Each user's top-k list is computed once and reused for all of that user's
    test rows, instead of re-sorting the full prediction row per test row.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Held-out interactions with 'userId' and 'movieId' columns.
    predicted_ratings : pandas.DataFrame
        Scores indexed by user id with one column per movie id.
    top_k : int, default 10
        Length of the recommendation list.
    train_data : pandas.DataFrame, optional
        Interactions with 'userId' and 'movieId' columns. When given, those
        movies are removed from each user's candidates before the top-k is
        taken. When omitted, all columns are candidates (the previous behaviour).

    Returns
    -------
    float or int
        hits / evaluated rows, or 0 when no row could be evaluated.
    """
    seen_by_user = {}
    if train_data is not None:
        seen_by_user = {
            user: movies.tolist()
            for user, movies in train_data.groupby('userId')['movieId']
        }

    known_users = predicted_ratings.index
    known_items = predicted_ratings.columns

    hits = 0
    total = 0

    for user, true_items in test_data.groupby('userId')['movieId']:
        if user not in known_users:
            continue

        scores = predicted_ratings.loc[user]
        seen = seen_by_user.get(user)
        if seen:
            # errors='ignore': training movies without a prediction column are fine
            scores = scores.drop(labels=seen, errors='ignore')

        top_items = set(scores.sort_values(ascending=False).head(top_k).index)

        for true_item in true_items:
            if true_item in known_items:
                total += 1
                if true_item in top_items:
                    hits += 1

    return hits / total if total > 0 else 0

In [18]:
# Evaluate the model: fraction of held-out (user, movie) ratings whose movie appears
# in that user's 10 highest-scored movies, printed with two decimals.
hit_ratio = hit_ratio_at_10(test_data, predicted_ratings, top_k=10)
print(f'Hit Ratio@10 for User-based CF Model: {hit_ratio:.2f}')

Hit Ratio@10 for User-based CF Model: 0.03
