In [None]:
# --- Import Libraries ---
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestRegressor
import shap

# --- Read Input Tables ---
# All six CSV files are read, in the order listed, relative to the working
# directory. Expected columns:
#   rating.csv         -> userId, movieId, rating, timestamp
#   movie.csv          -> movieId, title, genres
#   tag.csv            -> userId, movieId, tag, timestamp
#   genome_scores.csv  -> movieId, tagId, relevance
#   genome_tags.csv    -> tagId, tag
#   link.csv           -> movieId, imdbId, tmdbId
(
    ratings_df,
    movies_df,
    tags_df,
    genome_scores_df,
    genome_tags_df,
    links_df,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'rating.csv',
        'movie.csv',
        'tag.csv',
        'genome_scores.csv',
        'genome_tags.csv',
        'link.csv',
    )
)

# --- Basic Cleaning ---
# Ratings and movies get the same treatment: exact duplicate rows are removed
# first, then any row containing a missing value. Both steps mutate the frame
# in place.
for _frame in (ratings_df, movies_df):
    _frame.drop_duplicates(inplace=True)
    _frame.dropna(inplace=True)

# Keep only ratings from users who have rated at least 5 movies.
user_rating_counts = ratings_df['userId'].value_counts()
active_users = user_rating_counts.loc[user_rating_counts.ge(5)].index
ratings_df = ratings_df.loc[ratings_df['userId'].isin(active_users)]

# --- Attach Movie Metadata to Each Rating ---
# Inner join on movieId: ratings without a matching movie row are dropped.
ratings_merged = ratings_df.merge(movies_df, on='movieId')

# --- Genre Features for Content-Based Filtering ---
# Turn "Action|Adventure|Comedy" into "Action Adventure Comedy" so the
# vectorizer can tokenize on whitespace.
movies_df['processed_genres'] = movies_df['genres'].str.split('|').str.join(' ')

# One TF-IDF row per movie, in movies_df row order.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df['processed_genres'])

# Pairwise cosine similarity of every movie against every other movie.
# NOTE(review): this is a dense n_movies x n_movies array - confirm it fits
# in memory for the full dataset.
genre_similarity = cosine_similarity(tfidf_matrix)

# --- User-Item Matrix for Collaborative Filtering ---
# Rows are users, columns are movies, cells are ratings (NaN where unrated).
user_item_matrix = pd.pivot(
    ratings_df,
    index='userId',
    columns='movieId',
    values='rating',
)

# --- Train-Test Split (Stratified by User) ---
# Strategy: Keep a fraction of each user's ratings in the test set
def stratified_split(ratings, test_size=0.2, min_ratings=5, random_state=42):
    """Split ratings into train and test sets on a per-user basis.

    Each user with at least ``min_ratings`` rows has a ``test_size`` share of
    their rows moved to the test set; the remainder goes to the train set.
    Users with fewer rows contribute all of their rows to the train set, so
    they never appear in the test set.

    Args:
        ratings: DataFrame with a ``userId`` column; all other columns are
            carried through unchanged.
        test_size: Forwarded to ``train_test_split`` for every eligible user.
        min_ratings: Minimum number of rows a user needs to be split.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames with the same columns as
        ``ratings``. Either one is an empty frame (columns preserved) when no
        rows were assigned to it.
    """
    train_parts = []
    test_parts = []

    # A single groupby pass replaces filtering the full frame once per user.
    # sort=False yields users in order of first appearance, the same order a
    # loop over ratings['userId'].unique() would visit them.
    for _, user_rows in ratings.groupby('userId', sort=False):
        if len(user_rows) >= min_ratings:
            user_train, user_test = train_test_split(
                user_rows, test_size=test_size, random_state=random_state
            )
            train_parts.append(user_train)
            test_parts.append(user_test)
        else:
            train_parts.append(user_rows)

    # pd.concat raises ValueError on an empty list, which happens when the
    # input is empty or no user reaches min_ratings. Fall back to an empty
    # slice so callers always receive frames with the expected columns.
    empty = ratings.iloc[0:0]
    train_df = pd.concat(train_parts) if train_parts else empty
    test_df = pd.concat(test_parts) if test_parts else empty
    return train_df, test_df

# Per-user train/test split of the filtered ratings.
train_df, test_df = stratified_split(ratings_df)

# --- Summary ---
print(f"Train Ratings: {train_df.shape}")
print(f"Test Ratings: {test_df.shape}")
print(f"User-Item Matrix Shape: {user_item_matrix.shape}")
print(f"Genre Similarity Matrix Shape: {genre_similarity.shape}")

# --- Genre-Based Rating Model with SHAP Explanation ---
# Features: the TF-IDF genre vectors, densified; one row per movies_df row.
X = tfidf_matrix.toarray()
# Target: mean rating per movie, reindexed by movies_df['movieId'] so that y
# lines up row-for-row with X. Movies with no rating in ratings_merged get a
# neutral placeholder of 3.0.
y = ratings_merged.groupby('movieId')['rating'].mean().reindex(movies_df['movieId']).fillna(3.0)

# Hold out 20% of the movies for evaluation and explanation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a random forest that predicts a movie's mean rating from its genres.
model = RandomForestRegressor(n_estimators=50, random_state=42)
model.fit(X_train, y_train)

# SHAP explanation: X_test is used both as the explainer's background data
# (second argument) and as the set of samples being explained.
explainer = shap.Explainer(model, X_test)
shap_values = explainer(X_test)

# Summary plot of per-feature contributions, labelled with the TF-IDF genre
# tokens.
shap.summary_plot(shap_values, X_test, feature_names=tfidf.get_feature_names_out())

