In [None]:
# Run this in your first Colab cell to get the data
!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip ml-latest-small.zip


In [None]:
!pip install scikit-surprise


In [None]:
import pandas as pd
import numpy as np
# Load the ratings table from the extracted MovieLens archive.
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# Summary statistics of the rating values.
display(ratings[['rating']].describe())

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
ratings.info()


In [None]:
# Preview the first ten rating rows.
display(ratings.iloc[:10])

In [None]:
import pandas as pd

# Interpret the `timestamp` column as seconds since the Unix epoch and store
# it back as pandas datetimes.
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')

# Show the first dozen rows with the human-readable date and time.
display(ratings.loc[:, ['userId', 'movieId', 'rating', 'timestamp']].iloc[:12])

In [None]:
# Load the movie metadata table and preview its first twenty rows.
movies = pd.read_csv('ml-latest-small/movies.csv')
display(movies.iloc[:20])

In [None]:
# Temporal hold-out split: each user's most recent ratings become the test set.

# Convert the timestamp column to datetimes (same call as the earlier
# conversion cell) so rows can be ordered chronologically.
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')

# Order the rows by user, then by time, so the tail of every user's group
# holds that user's latest ratings.
ratings = ratings.sort_values(['userId', 'timestamp'])

# Number of most-recent ratings withheld from the model per user.
N = 5

# Test set: the last N rows of every user's history.
test_set = ratings.groupby('userId').tail(N)

# Train set: every row whose index label is not part of the test set.
train_set = ratings[~ratings.index.isin(test_set.index)]

print(f"Training set: {len(train_set)} rows")
print(f"Testing set: {len(test_set)} rows")

In [None]:
# Build a display-friendly genre string: the '|' separators become spaces
# (e.g. "Action|Sci-Fi" -> "Action Sci-Fi").
movies['genres_clean'] = movies['genres'].str.split('|').str.join(' ')
display(movies.loc[:, ['title', 'genres_clean']].head())

In [None]:
# One-hot encode the genres: one 0/1 indicator column per distinct label
# found in the '|'-separated `genres` strings.
genres_split = movies['genres'].str.get_dummies(sep='|')

# Attach the indicator columns to the movie metadata, row-aligned on the index.
movies_encoded = movies.join(genres_split)

# Spot-check a few indicator columns next to the titles.
display(movies_encoded.loc[:, ['title', 'Action', 'Sci-Fi', 'Drama']].head())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how often each rating value occurs in the training split.
plt.figure(figsize=(8, 5))
# seaborn 0.13 deprecates passing `palette` without `hue` (FutureWarning,
# slated for removal in 0.14). Mapping the x variable to `hue` and hiding the
# redundant legend keeps the per-bar colouring.
# NOTE(review): the `legend` argument requires seaborn >= 0.13.
sns.countplot(x='rating', hue='rating', data=train_set, palette='viridis', legend=False)
plt.title('Distribution of User Ratings (Train Set)')
plt.show()

In [None]:
# Count how many movies carry each genre label, most frequent first.
genres_df = movies['genres'].str.get_dummies(sep='|')
genre_counts = genres_df.sum().sort_values(ascending=False)

# Bar chart of the per-genre movie counts.
plt.figure(figsize=(10, 6))
genre_counts.plot.bar(color='skyblue')
plt.title('Most Popular Movie Genres')
plt.ylabel('Count')
plt.show()

In [None]:
# Force-install the last stable version of NumPy 1.x together with scikit-surprise.
# --force-reinstall reinstalls the listed packages even when they are already present.
# NOTE(review): a runtime restart is presumably required afterwards (a later
# cell re-imports "after the restart") — confirm before running the rest.
!pip install "numpy<2" scikit-surprise --force-reinstall

In [None]:
!pip install scikit-surprise
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split

# Wrap the training ratings in Surprise's data structures; the Reader
# declares the rating scale as 0.5 to 5.0.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(train_set.loc[:, ['userId', 'movieId', 'rating']], reader)

# Use every training row for fitting (no internal validation split).
trainset_surprise = data.build_full_trainset()

# Fit a seeded, 100-factor SVD matrix-factorization model.
svd_model = SVD(random_state=42, n_factors=100)
svd_model.fit(trainset_surprise)

In [None]:
# Smoke test: confirm that Surprise can be imported in the current kernel.
from surprise import SVD, Dataset, Reader
print("Surprise imported successfully!")

In [None]:
# Build a genre "taste profile" for every user from the training ratings.

# Attach each rated movie's genre indicators to the training rows
# (title and the raw genres string are not needed here).
user_genre_matrix = train_set.merge(
    movies_encoded.drop(columns=['title', 'genres']),
    on='movieId',
)

# Names of the one-hot genre columns.
genre_cols = genres_split.columns

# Weight every genre indicator by the row's rating in one vectorised step:
# a 5.0 rating on an Action movie turns that row's Action value into 5.0.
user_genre_matrix[genre_cols] = user_genre_matrix[genre_cols].mul(
    user_genre_matrix['rating'], axis=0
)

# Sum the weighted indicators per user ...
user_profiles = user_genre_matrix.groupby('userId')[genre_cols].sum()

# ... and scale each user's row so its genre weights add up to 1.
user_profiles = user_profiles.div(user_profiles.sum(axis=1), axis=0)

print("User Profiles Created! Sample for User 1:")
display(user_profiles.iloc[:1])

In [None]:
# STEP 4: per-user genre profiles.
# Sum the genre columns of `user_genre_matrix` for each user, then divide each
# user's row by its own total so the profiles are comparable across users.
user_profiles = (
    user_genre_matrix
    .groupby('userId')[genre_cols]
    .sum()
    .pipe(lambda totals: totals.div(totals.sum(axis=1), axis=0))
)

In [None]:
# Re-importing after the restart
import pandas as pd
from surprise import SVD, Dataset, Reader

# Rebuild the ratings split lost by the kernel restart, then train the model
# used for recommendations.

# Re-load the ratings and redo the chronological hold-out split here, so this
# cell does not rely on `train_set` surviving from before the restart.
ratings = pd.read_csv('ml-latest-small/ratings.csv')
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
ratings = ratings.sort_values(by=['userId', 'timestamp'])

N = 5  # most recent ratings held out per user
test_set = ratings.groupby('userId').tail(N)
train_set = ratings.drop(test_set.index)

# Step 5: model training on the training split only.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(train_set[['userId', 'movieId', 'rating']], reader)
trainset_surprise = data.build_full_trainset()

# Same hyper-parameters and seed as `svd_model`, so the model that serves the
# recommendations is reproducible and matches the one that is evaluated.
model = SVD(n_factors=100, random_state=42)
model.fit(trainset_surprise)
print("Model trained successfully!")

In [None]:
def get_hybrid_recommendations(user_id, top_n=10, svd_weight=0.7, content_weight=0.3):
    """Rank the movies a user has not rated by a blend of SVD and genre scores.

    Reads the notebook-level objects ``ratings``, ``movies``, ``movies_encoded``,
    ``genres_split``, ``user_profiles`` and ``model``. None of them is modified.

    Args:
        user_id: User id as it appears in ``ratings['userId']`` and in the
            index of ``user_profiles``.
        top_n: Maximum number of recommendations to return.
        svd_weight: Multiplier applied to the collaborative (SVD) estimate.
        content_weight: Multiplier applied to the genre-profile score.

    Returns:
        A list of ``(movieId, svd_score, content_score)`` tuples ordered by the
        blended score, best first.

    Raises:
        KeyError: If ``user_id`` has no row in ``user_profiles``.
    """
    # Fail early, with a readable message, before scoring the whole catalogue.
    if user_id not in user_profiles.index:
        raise KeyError(f"No genre profile found for user {user_id}")

    # Compare ids as integers on local copies; the shared DataFrames are left
    # untouched instead of being re-cast on every call.
    movie_ids = movies['movieId'].astype(int)
    seen = set(ratings.loc[ratings['userId'] == user_id, 'movieId'].astype(int))

    unseen_df = movies.loc[~movie_ids.isin(seen)].copy()
    if unseen_df.empty:
        return []
    unseen_df['movieId'] = movie_ids.loc[unseen_df.index]

    # Collaborative score: the model's estimated rating for each unseen movie.
    unseen_df['svd_score'] = unseen_df['movieId'].apply(
        lambda movie_id: model.predict(user_id, movie_id).est
    )

    # Content score: dot product of the movie's genre indicators with the
    # user's genre weights, multiplied by 5 (presumably to put it on the same
    # 5-point scale as the SVD estimate).
    genre_cols = genres_split.columns.tolist()
    u_weights = user_profiles.loc[user_id].reindex(genre_cols).fillna(0)
    genre_flags = movies_encoded.loc[unseen_df.index, genre_cols]
    unseen_df['content_score'] = genre_flags.dot(u_weights) * 5

    # Final hybrid score used only for ranking.
    unseen_df['final_score'] = (
        unseen_df['svd_score'] * svd_weight
        + unseen_df['content_score'] * content_weight
    )

    top_df = unseen_df.sort_values(by='final_score', ascending=False).head(top_n)

    # Three values per recommendation: id, SVD part and content part.
    return list(zip(top_df['movieId'], top_df['svd_score'], top_df['content_score']))

In [None]:
from surprise import accuracy

# Evaluate the trained SVD model on the held-out ratings.

# 1. Surprise's test() takes (user id, item id, true rating) tuples, built
#    here from the `test_set` created by the train/test split.
testset_for_surprise = list(zip(test_set['userId'], test_set['movieId'], test_set['rating']))

# 2. Predict a rating for every held-out (user, movie) pair.
predictions = svd_model.test(testset_for_surprise)

# 3. Compute RMSE. verbose=False stops rmse() from printing the score itself,
#    which would duplicate the formatted line below.
rmse_score = accuracy.rmse(predictions, verbose=False)
print(f"Model Accuracy (RMSE): {rmse_score:.4f}")

In [None]:
def precision_at_k(predictions, k=10, threshold=4.0):
    """Mean per-user precision@k over a list of predictions.

    For every user, the predictions are ordered by estimated rating and the
    top ``k`` are treated as the recommended items. Precision is the share of
    those recommended items whose true rating is at least ``threshold``.

    When a user has fewer than ``k`` predictions, the denominator is the
    number of items actually recommended rather than ``k``. Otherwise a user
    with, say, five held-out ratings could never exceed 50% at ``k=10``.

    Args:
        predictions: Iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        k: Number of top-ranked items considered per user; must be >= 1.
        threshold: Minimum true rating for an item to count as relevant.

    Returns:
        The average of the per-user precisions, or ``0.0`` when
        ``predictions`` is empty.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Group (estimate, truth) pairs by user.
    user_est_true = {}
    for uid, _, true_r, est, _ in predictions:
        user_est_true.setdefault(uid, []).append((est, true_r))

    if not user_est_true:
        return 0.0

    precisions = []
    for user_ratings in user_est_true.values():
        # Highest predicted ratings first; keep at most k of them.
        top_k = sorted(user_ratings, key=lambda pair: pair[0], reverse=True)[:k]
        n_rel_and_rec_k = sum(true_r >= threshold for _, true_r in top_k)
        # Precision = Relevant & Recommended / Recommended
        precisions.append(n_rel_and_rec_k / len(top_k))

    return sum(precisions) / len(precisions)

# Score the held-out predictions at a cut-off of ten and report as a percentage.
p_at_10 = precision_at_k(predictions, 10)
print(f"Precision at 10: {p_at_10:.2%}")

In [None]:
def get_explanation(user_id, movie_id):
    """Return a one-sentence, genre-based reason for recommending a movie.

    Reads the notebook-level ``movies`` table and ``user_profiles`` matrix.

    Args:
        user_id: Index label of the user in ``user_profiles``.
        movie_id: Value to look up in ``movies['movieId']``.

    Returns:
        A sentence naming the user's top genre when the movie has it, else
        the shared genre the user weights most, else a generic message. The
        generic message is also returned for users without a usable profile.

    Raises:
        IndexError: If ``movie_id`` is not present in ``movies``.
    """
    generic_reason = "Recommended based on similar users' high ratings."

    # 1. Genres of the requested movie ('|'-separated in the source table).
    genre_strings = movies.loc[movies['movieId'] == movie_id, 'genres']
    if genre_strings.empty:
        raise IndexError(f"movieId {movie_id} not found in movies")
    movie_genres = set(str(genre_strings.iloc[0]).split('|'))

    # 2. The user's genre weights; users without a profile get the generic text.
    if user_id not in user_profiles.index:
        return generic_reason
    profile = user_profiles.loc[user_id].dropna()
    if profile.empty:
        return generic_reason

    # 3. Prefer the user's single strongest genre.
    user_top_genre = profile.idxmax()
    if user_top_genre in movie_genres:
        return f"Because you are a big fan of {user_top_genre} movies!"

    # Fallback: among the genres the movie shares with the user's positive
    # weights, name the one the user weights most. Taking the first element
    # of a set gave an arbitrary genre that could differ between runs.
    shared = profile[(profile > 0) & profile.index.isin(movie_genres)]
    if not shared.empty:
        return f"Matches your interest in {shared.idxmax()}."
    return generic_reason

In [None]:
# Inspect the 5-star ratings of user 100 next to the genre indicators.

# Rows of `ratings` where user 100 gave exactly 5.0.
f = ratings.query('userId == 100 and rating == 5.0')

# Add the movie metadata and one-hot genre columns for those movies.
merged_data = f.merge(movies_encoded, on='movieId')

# Basic identifying columns followed by every genre column, taken from
# `genres_split` so that no genre is left out.
all_cols = ['userId', 'movieId', 'title', 'rating', *genres_split.columns]

# Show the full table.
display(merged_data.loc[:, all_cols])

In [None]:
# Number of training-set ratings that belong to user 340.
print(int((train_set['userId'] == 340).sum()))

In [None]:
import ipywidgets as widgets
from IPython.display import display, HTML

# 1. The Corrected Logic Function
def get_hybrid_recommendations(user_id, top_n=10, svd_weight=0.9, content_weight=0.1):
    """Rank the movies a user has not rated by a blend of SVD and genre scores.

    This definition replaces any earlier ``get_hybrid_recommendations`` in the
    notebook namespace. It reads the notebook-level objects ``ratings``,
    ``movies``, ``movies_encoded``, ``genres_split``, ``user_profiles`` and
    ``model``. None of them is modified.

    Args:
        user_id: User id as it appears in ``ratings['userId']`` and in the
            index of ``user_profiles``.
        top_n: Maximum number of recommendations to return.
        svd_weight: Multiplier applied to the collaborative (SVD) estimate.
        content_weight: Multiplier applied to the genre-profile score.

    Returns:
        A list of ``(movieId, svd_score, content_score)`` tuples ordered by the
        blended score, best first.

    Raises:
        KeyError: If ``user_id`` has no row in ``user_profiles``.
    """
    # Fail early, with a readable message, before scoring the whole catalogue.
    if user_id not in user_profiles.index:
        raise KeyError(f"No genre profile found for user {user_id}")

    # Compare ids as integers on local copies; the shared DataFrames are left
    # untouched instead of being re-cast on every call.
    movie_ids = movies['movieId'].astype(int)
    seen = set(ratings.loc[ratings['userId'] == user_id, 'movieId'].astype(int))

    # Identify unseen movies.
    unseen_df = movies.loc[~movie_ids.isin(seen)].copy()
    if unseen_df.empty:
        return []
    unseen_df['movieId'] = movie_ids.loc[unseen_df.index]

    # A. SVD part (collaborative): estimated rating for each unseen movie.
    unseen_df['svd_score'] = unseen_df['movieId'].apply(
        lambda movie_id: model.predict(user_id, movie_id).est
    )

    # B. Content part: dot product of the movie's genre indicators with the
    # user's genre weights, multiplied by 5 (presumably to put it on the same
    # 5-point scale as the SVD estimate).
    genre_cols = genres_split.columns.tolist()
    u_weights = user_profiles.loc[user_id].reindex(genre_cols).fillna(0)
    genre_flags = movies_encoded.loc[unseen_df.index, genre_cols]
    unseen_df['content_score'] = genre_flags.dot(u_weights) * 5

    # C. Hybrid formula, used only for ranking.
    unseen_df['final_score'] = (
        unseen_df['svd_score'] * svd_weight
        + unseen_df['content_score'] * content_weight
    )

    top_df = unseen_df.sort_values(by='final_score', ascending=False).head(top_n)

    # Three values per recommendation, matching the unpacking in the UI loop.
    return list(zip(top_df['movieId'], top_df['svd_score'], top_df['content_score']))

# 2. UI controls: an integer field for the user id (initially 1), a button
#    that triggers the lookup, and an output area the results are drawn into.
user_input = widgets.IntText(description='User ID:', value=1)
button = widgets.Button(button_style='info', description="Get Recommendations")
output = widgets.Output()

def on_button_clicked(b):
    """Render the recommendations for the user id currently in ``user_input``.

    Clears ``output`` and draws one card per recommendation returned by
    ``get_hybrid_recommendations``. Any exception is caught and reported as
    text inside ``output`` so a failed lookup does not break the widget.

    Args:
        b: The button instance passed by ipywidgets; not used.
    """
    from html import escape  # local import: escapes text interpolated into HTML

    with output:
        output.clear_output()
        uid = user_input.value

        try:
            # Each recommendation is a (movieId, svd_score, content_score) tuple.
            recommendations = get_hybrid_recommendations(uid)

            display(HTML(f"<h3>--- ReelSense Top 10 for User {uid} ---</h3>"))

            # The genre list and the user's weights do not depend on the
            # movie, so they are computed once outside the loop.
            genre_cols = genres_split.columns.tolist()
            u_prof = user_profiles.loc[uid].reindex(genre_cols).fillna(0)

            for i, (m_id, pred_score, match_score) in enumerate(recommendations):
                title = escape(str(movies[movies['movieId'] == m_id]['title'].values[0]))

                # Same 0.9 / 0.1 blend that get_hybrid_recommendations ranks
                # by, so the displayed strength matches the ordering.
                final_score = (0.9 * pred_score) + (0.1 * match_score)

                # Dynamic explanation: the genre of this movie that the user
                # weights most. With no overlap at all, idxmax() would return
                # an arbitrary first column, so a generic reason is shown.
                m_genres = movies_encoded[movies_encoded['movieId'] == m_id][genre_cols].iloc[0]
                overlap = m_genres * u_prof
                if overlap.max() > 0:
                    top_genre = escape(str(overlap.idxmax()))
                    reason = f"Strong affinity for <b>{top_genre}</b> movies."
                else:
                    reason = "Highly rated by users with similar taste."

                display(HTML(f"""
                    <div style="border-left: 5px solid #2196F3; padding: 10px; margin: 5px; background-color: #f1f1f1; border-radius: 5px;">
                        <b style="color: #333;">{i+1}. {title}</b><br>
                        <span style="color: #555; font-size: 0.9em;">Match Reason: {reason}</span><br>
                        <small style="color: #888;">Strength: {final_score:.2f} | SVD: {pred_score:.1f} | Genre Match: {match_score:.1f}</small>
                    </div>
                """))
        except Exception as e:
            # Deliberately broad: keep the widget usable and show what failed.
            print(f"Error encountered: {e}")
            print("Action: Ensure your SVD model and user_profiles cells have been run.")

# Register the click handler, then show the controls stacked vertically.
button.on_click(on_button_clicked)
display(widgets.VBox(children=[user_input, button, output]))