In [6]:
import pandas as pd
from surprise import Dataset, SVD, accuracy
from surprise.model_selection import train_test_split, cross_validate
import random
import numpy as np
import os # Needed to locate the MovieLens file

# --- 0. HELPER FUNCTION TO GET MOVIE TITLES ---

def get_movie_titles(data_dir=None):
    """
    Reads the u.item file from the MovieLens 100k dataset directory
    and returns a dictionary mapping Item ID to Movie Title.

    Args:
        data_dir: Directory that contains ``u.item``. When omitted, the file
            is looked up under ``<root>/ml-100k/ml-100k/``, where ``<root>``
            is the ``SURPRISE_DATA_FOLDER`` environment variable if it is
            set and ``~/.surprise_data`` otherwise.

    Returns:
        A dict mapping item ID (str) to movie title (str), or ``None``
        (after printing an error message) when ``u.item`` is not found.
    """
    if data_dir is None:
        # Honour the same override Surprise uses for its download folder
        # instead of assuming the default location.
        surprise_root = os.environ.get(
            'SURPRISE_DATA_FOLDER',
            os.path.join(os.path.expanduser('~'), '.surprise_data'),
        )
        data_dir = os.path.join(surprise_root, 'ml-100k', 'ml-100k')
    file_name = os.path.join(data_dir, 'u.item')

    # Check if the file exists (it will exist after Dataset.load_builtin('ml-100k'))
    if not os.path.isfile(file_name):
        print("Error: u.item file not found. Ensure Dataset.load_builtin('ml-100k') runs first.")
        return None

    # Load the item file. It's pipe-separated and uses ISO-8859-1 encoding.
    rid_to_name = {}

    with open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            # The format is ItemID|Title|Release Date|...
            fields = line.rstrip('\n').split('|')
            if len(fields) < 2:
                # Skip blank or malformed lines that carry no title field.
                continue
            rid_to_name[fields[0]] = fields[1]

    return rid_to_name

# --- 1. SET UP AND DATA LOADING ---

# Set a random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

print("--- Step 1: Data Preparation (Using MovieLens 100k) ---")

# Load the standard MovieLens 100k dataset. This step downloads the files.
data = Dataset.load_builtin('ml-100k')
rid_to_name = get_movie_titles()
if rid_to_name is None:
    # The title lookup is required for the final report, so stop here.
    # SystemExit is raised directly (with a non-zero status) because exit()
    # is a helper injected by the site module for interactive sessions and
    # would report success.
    print("Cannot proceed without movie titles. Exiting.")
    raise SystemExit(1)

print(f"MovieLens 100k dataset and {len(rid_to_name)} movie titles loaded successfully.")

# --- 2. MODEL TRAINING AND EVALUATION ---

print("\n--- Step 2: Model Training and Evaluation ---")

# Hold out 25% of the ratings so the metrics below are computed on data
# the model was not fitted on.
trainset, testset = train_test_split(
    data,
    test_size=0.25,
    random_state=RANDOM_SEED,
)

# SVD (matrix factorization) model, seeded with RANDOM_SEED, fitted on the
# training split only.
algo = SVD(n_factors=100, n_epochs=20, random_state=RANDOM_SEED)
algo.fit(trainset)
print("SVD Model trained successfully on the training set.")

# Predict the held-out ratings and report both error metrics
# (Deliverable: Evaluation Metrics)
predictions = algo.test(testset)
print("\nModel Performance Metrics on Test Set:")
for report_metric in (accuracy.rmse, accuracy.mae):
    report_metric(predictions, verbose=True)

# --- 3. GENERATING RECOMMENDATIONS ---

print("\n--- Step 3: Generating Recommendations ---")

# Build a second model with the same hyperparameters, this time fitted on
# every available rating, for use when producing recommendations.
full_trainset = data.build_full_trainset()
full_algo = SVD(n_factors=100, n_epochs=20, random_state=RANDOM_SEED)
full_algo.fit(full_trainset)


# --- CORRECTED RECOMMENDATION FUNCTION ---
def get_top_n_recommendations(user_id, all_items, trainset, model, n=5):
    """
    Returns the top N item recommendations for a given user.

    Args:
        user_id: Raw user ID; passed to ``trainset.to_inner_uid`` and to
            ``model.predict``.
        all_items: Iterable of raw item IDs to consider as candidates.
        trainset: Object exposing ``to_inner_uid``, ``ur`` and
            ``to_raw_iid``; used to find the items the user already rated.
        model: Object whose ``predict(user_id, raw_iid)`` returns a
            prediction with ``iid`` and ``est`` attributes.
        n: Maximum number of recommendations. Values <= 0 yield ``[]``.

    Returns:
        A list of ``(raw_item_id, estimated_rating)`` tuples ordered from
        highest to lowest estimate. An empty list is returned (after
        printing a message) when the user is unknown to ``trainset``.
    """

    # 1. Get the user's inner ID
    try:
        user_inner_id = trainset.to_inner_uid(user_id)
    except ValueError:
        print(f"User ID {user_id} not found in the dataset.")
        return []

    # A non-positive n must not fall through to the slice below, where a
    # negative value would return almost the entire ranking.
    if n <= 0:
        return []

    # 2. Raw IDs of the items the user has already rated.
    # trainset.ur[user_inner_id] is a list of (item_inner_id, rating) tuples.
    # A set keeps the membership test in step 3 constant-time per candidate.
    rated_raw_items = {
        trainset.to_raw_iid(inner_iid)
        for inner_iid, _ in trainset.ur[user_inner_id]
    }

    # 3 + 4. Predict a rating for every candidate the user has not rated yet
    predictions = [
        model.predict(user_id, raw_iid)
        for raw_iid in all_items
        if raw_iid not in rated_raw_items
    ]

    # 5. Sort predictions by estimated rating (highest first)
    predictions.sort(key=lambda pred: pred.est, reverse=True)

    # 6. Keep the top N as (raw_item_id, predicted_rating)
    return [(pred.iid, pred.est) for pred in predictions[:n]]
# ----------------------------------------


# Collect every item known to the full trainset: inner IDs first, then the
# corresponding raw IDs that the recommendation function works with.
all_raw_items = list(full_trainset.all_items())
all_raw_iids = list(map(full_trainset.to_raw_iid, all_raw_items))

# Target user (raw ID string, e.g. User 196 from ml-100k) and list length
target_user_id = '196'
n_recommendations = 5

# Rank the items this user has not rated yet
recommendations = get_top_n_recommendations(
    target_user_id,
    all_raw_iids,
    full_trainset,
    full_algo,
    n=n_recommendations,
)

# Report the results with human-readable titles
# (Deliverable: Recommendation Results + Movie Titles)
print(f"\nTop {n_recommendations} Recommendations for User ID {target_user_id}:")
print("-------------------------------------------------------")

for rank, (item_id, rating) in enumerate(recommendations, start=1):
    # Fall back to a placeholder when the item ID has no entry in the title map
    movie_title = rid_to_name.get(item_id, 'Title Not Found')
    print(f"  {rank}. Movie: {movie_title}")
    print(f"     Item ID: {item_id}, Predicted Rating: {rating:.4f}")

--- Step 1: Data Preparation (Using MovieLens 100k) ---
MovieLens 100k dataset and 1682 movie titles loaded successfully.

--- Step 2: Model Training and Evaluation ---
SVD Model trained successfully on the training set.

Model Performance Metrics on Test Set:
RMSE: 0.9396
MAE:  0.7408

--- Step 3: Generating Recommendations ---

Top 5 Recommendations for User ID 196:
-------------------------------------------------------
  1. Movie: Schindler's List (1993)
     Item ID: 318, Predicted Rating: 4.6737
  2. Movie: Pather Panchali (1955)
     Item ID: 1449, Predicted Rating: 4.5499
  3. Movie: One Flew Over the Cuckoo's Nest (1975)
     Item ID: 357, Predicted Rating: 4.5348
  4. Movie: Usual Suspects, The (1995)
     Item ID: 12, Predicted Rating: 4.5185
  5. Movie: Silence of the Lambs, The (1991)
     Item ID: 98, Predicted Rating: 4.5166
