Import Necessary Libraries

In [1]:
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from tqdm import tqdm

warnings.filterwarnings('ignore')


 Load and Preview the Dataset

In [2]:
# Read the ratings CSV into a DataFrame
ratings_csv = "C:/Users/anujp/OneDrive/Desktop/MovieRecommendations/data/Final_data/Final_data.csv"
df = pd.read_csv(ratings_csv)

# Print the head of the frame, then its (rows, columns) shape
print("Data preview:")
print(df.head())
print("\nDataset shape:", df.shape)


Data preview:
   UserID  MovieID  Rating                 Title
0       1      122     5.0      Boomerang (1992)
1       1      185     5.0       Net, The (1995)
2       1      231     5.0  Dumb & Dumber (1994)
3       1      292     5.0       Outbreak (1995)
4       1      316     5.0       Stargate (1994)

Dataset shape: (10000054, 4)


Prepare the Item-User Matrix and Compute Item Similarity

In [3]:
# Rows are movies, columns are users; cells without a rating become 0 so each
# movie is a complete rating vector that cosine similarity can compare.
item_user_matrix = df.pivot_table(index='MovieID', columns='UserID', values='Rating').fillna(0)

# Pairwise cosine similarity between the movie rating vectors
item_similarity = cosine_similarity(item_user_matrix)

# Label both axes with MovieID so similarities can be looked up by movie
item_similarity_df = pd.DataFrame(item_similarity, index=item_user_matrix.index, columns=item_user_matrix.index)

# NOTE(review): this matrix is fit on every row of df, including rows that are
# later held out by train_test_split. The evaluation cells pass this same
# matrix to predict_rating, so the reported metrics have seen test ratings.

# Define the path for saving the models
save_path = r'C:\Users\anujp\OneDrive\Desktop\MovieRecommendations\models\collaborative_filtering\item_based'

# Make sure the target directory exists before writing into it
os.makedirs(save_path, exist_ok=True)

# Save the item similarity matrix
with open(os.path.join(save_path, 'item_similarity.pkl'), 'wb') as f:
    pickle.dump(item_similarity_df, f)

# Create and save the movie ID to title mapping.
# One row per MovieID is enough; keep='last' reproduces the "last title wins"
# result of building a dict from the full, duplicated MovieID column.
movie_id_to_title = (
    df.drop_duplicates(subset='MovieID', keep='last')
    .set_index('MovieID')['Title']
    .to_dict()
)
with open(os.path.join(save_path, 'movie_id_to_title.pkl'), 'wb') as f:
    pickle.dump(movie_id_to_title, f)

print("Item similarity matrix and movie mapping saved successfully.")

Item similarity matrix and movie mapping saved successfully.


Split the Data into Training and Testing Sets

In [4]:
# Shared pivot layout: one row per user, one column per movie, ratings as the
# cell values (cells with no rating stay NaN because nothing fills them here).
pivot_layout = dict(index='UserID', columns='MovieID', values='Rating')

# User-item matrix over every rating in df
user_item_matrix = df.pivot_table(**pivot_layout)

# Hold out 20% of the rating rows; the fixed random_state keeps the split repeatable
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# NOTE(review): item_similarity_df was computed from the full df before this
# split, so the similarities used during evaluation include the test rows.

# User-item matrices restricted to each side of the split
train_user_item_matrix = train_data.pivot_table(**pivot_layout)
test_user_item_matrix = test_data.pivot_table(**pivot_layout)

print("Data has been split into training and testing sets.")


Data has been split into training and testing sets.


Define Functions for Making Predictions

In [5]:
def predict_rating(user_id, item_id, train_user_item_matrix, item_similarity_df):
    """
    Predict the rating of a user for a given item using item-based collaborative filtering.

    The prediction is the similarity-weighted average of the ratings the user
    has in ``train_user_item_matrix``, weighted by each rated item's similarity
    to ``item_id``.

    Parameters
    ----------
    user_id : label
        Row label of the user in ``train_user_item_matrix``.
    item_id : label
        Item to predict; must label both a row and a column of
        ``item_similarity_df``.
    train_user_item_matrix : pandas.DataFrame
        Users as rows, items as columns, NaN where the user has no rating.
    item_similarity_df : pandas.DataFrame
        Item-by-item similarity scores, labelled by item on both axes.

    Returns
    -------
    float
        The predicted rating, or ``np.nan`` when the user or the item is
        unknown, or when the similarities to the user's rated items sum to zero
        (including the case where the user has no usable ratings).
    """
    if (
        item_id not in item_similarity_df.index
        or item_id not in item_similarity_df.columns
        or user_id not in train_user_item_matrix.index
    ):
        return np.nan

    # Ratings this user actually gave (NaN marks "not rated")
    user_ratings = train_user_item_matrix.loc[user_id].dropna()

    # Align similarities to the rated items. reindex yields NaN for a rated
    # item that the similarity matrix does not know, instead of raising
    # KeyError the way direct label indexing would.
    sim_scores = item_similarity_df[item_id].reindex(user_ratings.index)
    usable = sim_scores.notna()
    sim_scores = sim_scores[usable]
    user_ratings = user_ratings[usable]

    # Weighted sum of ratings, normalised by the total absolute similarity so
    # that any negative weights cannot cancel the normaliser out.
    numerator = np.dot(sim_scores, user_ratings)
    denominator = sim_scores.abs().sum()

    if denominator == 0:
        return np.nan
    return numerator / denominator

def predict_ratings(test_data, train_user_item_matrix, item_similarity_df):
    """
    Predict ratings for all entries in the test set.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Must contain the columns ``UserID``, ``MovieID`` and ``Rating``.
    train_user_item_matrix : pandas.DataFrame
        Passed through to ``predict_rating``.
    item_similarity_df : pandas.DataFrame
        Passed through to ``predict_rating``.

    Returns
    -------
    tuple[list, list]
        ``(actuals, predictions)`` of equal length. Rows for which
        ``predict_rating`` returns NaN are left out of both lists.
    """
    predictions = []
    actuals = []

    # Walk the three needed columns in lockstep rather than iterrows(), which
    # builds a Series for every row.
    rows = zip(test_data['UserID'], test_data['MovieID'], test_data['Rating'])

    for user_id, item_id, actual_rating in tqdm(rows, total=test_data.shape[0]):
        predicted_rating = predict_rating(user_id, item_id, train_user_item_matrix, item_similarity_df)

        if not np.isnan(predicted_rating):
            predictions.append(predicted_rating)
            actuals.append(actual_rating)

    return actuals, predictions


 Define Evaluation Functions for MAE and RMSE

In [6]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def compute_mae_rmse(actuals, predictions):
    """
    Return ``(mae, rmse)`` for paired actual and predicted ratings.

    MAE comes from ``mean_absolute_error``; RMSE is the square root of
    ``mean_squared_error`` over the same pairs.
    """
    abs_err = mean_absolute_error(actuals, predictions)
    sq_err = mean_squared_error(actuals, predictions)
    return abs_err, np.sqrt(sq_err)


 Evaluate the Model Using MAE and RMSE

In [7]:
# Score every held-out rating; rows without a prediction are dropped by predict_ratings
actuals, predictions = predict_ratings(
    test_data=test_data,
    train_user_item_matrix=train_user_item_matrix,
    item_similarity_df=item_similarity_df,
)

# Error metrics over the pairs that received a prediction
mae, rmse = compute_mae_rmse(actuals=actuals, predictions=predictions)

print(
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    sep="\n",
)


100%|██████████| 2000011/2000011 [12:48<00:00, 2602.62it/s]


Mean Absolute Error (MAE): 0.7376
Root Mean Squared Error (RMSE): 0.9460


Compute Precision@K and Recall@K

In [8]:
# Cell 8: Compute Precision@K and Recall@K

def precision_recall_at_k(test_data, train_user_item_matrix, item_similarity_df, k=5, threshold=3.5):
    """
    Compute Precision@K and Recall@K for the test set.

    For each user, only the movies that appear in that user's test rows are
    scored and ranked. The top ``k`` by predicted rating are the
    recommendations; movies whose actual test rating is ``>= threshold`` are
    the relevant set.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Must contain the columns ``UserID``, ``MovieID`` and ``Rating``.
    train_user_item_matrix : pandas.DataFrame
        Passed through to ``predict_rating``.
    item_similarity_df : pandas.DataFrame
        Passed through to ``predict_rating``.
    k : int, default 5
        Number of top-ranked movies treated as recommended.
    threshold : float, default 3.5
        Minimum actual rating for a movie to count as relevant.

    Returns
    -------
    tuple[float, float]
        ``(avg_precision, avg_recall)`` averaged over users that received at
        least one prediction; ``(nan, nan)`` if no user did. A user with no
        relevant movies contributes a recall of 0.
    """
    user_precision = []
    user_recall = []

    # One grouping pass hands over each user's rows, instead of filtering the
    # whole frame once per user. sort=False keeps users in order of first
    # appearance, the same order test_data['UserID'].unique() would give.
    per_user = test_data.groupby('UserID', sort=False)

    for user_id, user_test_data in tqdm(per_user, total=per_user.ngroups):
        # Actual test ratings for this user, keyed by movie
        actual_ratings = user_test_data.set_index('MovieID')['Rating']

        # Predict a rating for each of the user's test movies, skipping NaN results
        user_predicted_ratings = {}
        for item_id in actual_ratings.index:
            predicted_rating = predict_rating(user_id, item_id, train_user_item_matrix, item_similarity_df)
            if not np.isnan(predicted_rating):
                user_predicted_ratings[item_id] = predicted_rating

        if not user_predicted_ratings:
            continue

        # Get top K recommendations
        top_k_items = sorted(user_predicted_ratings.items(), key=lambda x: x[1], reverse=True)[:k]
        recommended_items = {item for item, _ in top_k_items}

        # Relevant items (actual ratings at or above threshold)
        relevant_items = set(actual_ratings[actual_ratings >= threshold].index)

        # True positives (relevant items recommended)
        tp = recommended_items & relevant_items

        # Precision is over what was actually recommended (may be fewer than k)
        precision = len(tp) / len(recommended_items) if recommended_items else 0
        recall = len(tp) / len(relevant_items) if relevant_items else 0

        user_precision.append(precision)
        user_recall.append(recall)

    # Average over users; an empty list would otherwise make np.mean warn
    avg_precision = np.mean(user_precision) if user_precision else np.nan
    avg_recall = np.mean(user_recall) if user_recall else np.nan

    return avg_precision, avg_recall

# Precision/recall for the top 5 predictions, counting ratings of 3.5+ as relevant
precision, recall = precision_recall_at_k(
    test_data,
    train_user_item_matrix,
    item_similarity_df,
    k=5,
    threshold=3.5,
)

print(
    f"Precision@5: {precision:.4f}",
    f"Recall@5: {recall:.4f}",
    sep="\n",
)


100%|██████████| 69796/69796 [11:39<00:00, 99.76it/s] 

Precision@5: 0.7508
Recall@5: 0.5019





Compute Normalized Discounted Cumulative Gain (NDCG)

In [9]:
# Cell 9: Compute Normalized Discounted Cumulative Gain (NDCG)

def ndcg_at_k(test_data, train_user_item_matrix, item_similarity_df, k=5):
    """
    Compute NDCG@K for the test set.

    For each user, the movies in that user's test rows are ranked by predicted
    rating. DCG is accumulated over the top ``k`` of that ranking using the
    gain ``(2 ** actual_rating - 1) / log2(position + 2)`` with ``position``
    starting at 0. IDCG applies the same formula to the user's ``k`` highest
    actual test ratings.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Must contain the columns ``UserID``, ``MovieID`` and ``Rating``.
    train_user_item_matrix : pandas.DataFrame
        Passed through to ``predict_rating``.
    item_similarity_df : pandas.DataFrame
        Passed through to ``predict_rating``.
    k : int, default 5
        Ranking cut-off.

    Returns
    -------
    float
        Mean NDCG over users that received at least one prediction; ``nan`` if
        no user did.
    """
    ndcg_scores = []

    # One grouping pass hands over each user's rows, instead of filtering the
    # whole frame once per user. sort=False keeps users in order of first
    # appearance, the same order test_data['UserID'].unique() would give.
    per_user = test_data.groupby('UserID', sort=False)

    for user_id, user_test_data in tqdm(per_user, total=per_user.ngroups):
        # Actual test ratings for this user, keyed by movie
        actual_ratings = user_test_data.set_index('MovieID')['Rating']

        # Predict a rating for each of the user's test movies, skipping NaN results
        user_predicted_ratings = {}
        for item_id in actual_ratings.index:
            predicted_rating = predict_rating(user_id, item_id, train_user_item_matrix, item_similarity_df)
            if not np.isnan(predicted_rating):
                user_predicted_ratings[item_id] = predicted_rating

        if not user_predicted_ratings:
            continue

        # Rank by predicted rating (highest first) and keep the top K
        sorted_items = sorted(user_predicted_ratings.items(), key=lambda x: x[1], reverse=True)
        top_k_items = sorted_items[:k]

        # DCG: gain of each recommended movie's actual rating, discounted by rank
        dcg = 0.0
        for i, (item_id, pred_rating) in enumerate(top_k_items):
            actual_rating = actual_ratings.get(item_id, 0)
            dcg += (2 ** actual_rating - 1) / np.log2(i + 2)

        # IDCG: same formula over the user's K best actual ratings. This uses
        # all test ratings, including movies that got no prediction.
        ideal_ratings = sorted(actual_ratings.values, reverse=True)[:k]
        idcg = 0.0
        for i, rating in enumerate(ideal_ratings):
            idcg += (2 ** rating - 1) / np.log2(i + 2)

        ndcg = 0 if idcg == 0 else dcg / idcg

        ndcg_scores.append(ndcg)

    # Average over users; an empty list would otherwise make np.mean warn
    avg_ndcg = np.mean(ndcg_scores) if ndcg_scores else np.nan

    return avg_ndcg

# NDCG over each user's top 5 predicted test movies
ndcg = ndcg_at_k(
    test_data,
    train_user_item_matrix,
    item_similarity_df,
    k=5,
)

print(f"NDCG@5: {ndcg:.4f}")


100%|██████████| 69796/69796 [11:15<00:00, 103.35it/s]

NDCG@5: 0.7813





 Summarize Evaluation Results

In [10]:
# Recap of the metrics computed in the cells above, one per line
print(
    "Evaluation Metrics Summary:",
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"Precision@5: {precision:.4f}",
    f"Recall@5: {recall:.4f}",
    f"NDCG@5: {ndcg:.4f}",
    sep="\n",
)


Evaluation Metrics Summary:
Mean Absolute Error (MAE): 0.7376
Root Mean Squared Error (RMSE): 0.9460
Precision@5: 0.7508
Recall@5: 0.5019
NDCG@5: 0.7813
