# **Matrix Factorization using Gradient Descent**

# Anuj Patel(002874710)

Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import warnings
from tqdm import tqdm
import os
import pickle

warnings.filterwarnings('ignore')


 Load and Preview the Data

In [2]:
# Load the dataset
df = pd.read_csv("C:/Users/anujp/OneDrive/Desktop/MovieRecommendations/data/Final_data/Final_data.csv")

# Preview the first few rows
print("Data preview:")
print(df.head())

# Check the shape of the dataset
print("\nDataset shape:", df.shape)


Data preview:
   UserID  MovieID  Rating                 Title
0       1      122     5.0      Boomerang (1992)
1       1      185     5.0       Net, The (1995)
2       1      231     5.0  Dumb & Dumber (1994)
3       1      292     5.0       Outbreak (1995)
4       1      316     5.0       Stargate (1994)

Dataset shape: (10000054, 4)


In [3]:
# Load and examine the dataset
df = pd.read_csv("C:/Users/anujp/OneDrive/Desktop/MovieRecommendations/data/Final_data/Final_data.csv")

# Basic information
print("Data preview:")
print(df.head())
print("\nDataset shape:", df.shape)

# Additional data examination
print("\nDataset Information:")
print("-------------------")
print("Number of unique users:", df['UserID'].nunique())
print("Number of unique movies:", df['MovieID'].nunique())
print("Rating statistics:")
print(df['Rating'].describe())
print("\nMissing values:")
print(df.isnull().sum())
print("\nRating distribution:")
print(df['Rating'].value_counts().sort_index())

# Check data types
print("\nData types:")
print(df.dtypes)

# Check value ranges
print("\nValue ranges:")
print("UserID range:", df['UserID'].min(), "to", df['UserID'].max())
print("MovieID range:", df['MovieID'].min(), "to", df['MovieID'].max())
print("Rating range:", df['Rating'].min(), "to", df['Rating'].max())

Data preview:
   UserID  MovieID  Rating                 Title
0       1      122     5.0      Boomerang (1992)
1       1      185     5.0       Net, The (1995)
2       1      231     5.0  Dumb & Dumber (1994)
3       1      292     5.0       Outbreak (1995)
4       1      316     5.0       Stargate (1994)

Dataset shape: (10000054, 4)

Dataset Information:
-------------------
Number of unique users: 69878
Number of unique movies: 10677
Rating statistics:
count    1.000005e+07
mean     3.512422e+00
std      1.060418e+00
min      5.000000e-01
25%      3.000000e+00
50%      4.000000e+00
75%      4.000000e+00
max      5.000000e+00
Name: Rating, dtype: float64

Missing values:
UserID     0
MovieID    0
Rating     0
Title      0
dtype: int64

Rating distribution:
Rating
0.5      94988
1.0     384180
1.5     118278
2.0     790306
2.5     370178
3.0    2356676
3.5     879764
4.0    2875850
4.5     585022
5.0    1544812
Name: count, dtype: int64

Data types:
UserID       int64
MovieID      int

Data Preparation - Map User and Movie IDs to Indices

In [4]:
# Cell 2: Map IDs to Indices
print("\nMapping IDs to indices...")
user_ids = df['UserID'].unique()
movie_ids = df['MovieID'].unique()


Mapping IDs to indices...


In [5]:
# Create forward mappings
user2idx = {user_id: idx for idx, user_id in enumerate(user_ids)}
movie2idx = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}

# Create reverse mappings
idx2user = {idx: user_id for user_id, idx in user2idx.items()}
idx2movie = {idx: movie_id for movie_id, idx in movie2idx.items()}

# Map indices to dataframe
df['user_idx'] = df['UserID'].map(user2idx)
df['movie_idx'] = df['MovieID'].map(movie2idx)

Split Data into Training and Testing Sets

In [6]:
from sklearn.model_selection import train_test_split
print("\nSplitting data...")

train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)


Splitting data...


Ensure All Users and Movies in Test Set are in Training Set

In [7]:
print("\nFiltering test set...")
train_users = set(train_data['user_idx'])
train_movies = set(train_data['movie_idx'])
test_data = test_data[test_data['user_idx'].isin(train_users) & test_data['movie_idx'].isin(train_movies)]
test_data = test_data.reset_index(drop=True)

print("Number of users in training set:", len(train_users))
print("Number of movies in training set:", len(train_movies))
print("Number of users in test set:", test_data['user_idx'].nunique())
print("Number of movies in test set:", test_data['movie_idx'].nunique())



Filtering test set...
Number of users in training set: 69878
Number of movies in training set: 10653
Number of users in test set: 69796
Number of movies in test set: 10194


Initialize Latent Factor Matrices

In [8]:
print("\nInitializing matrices...")
num_users = len(user_ids)
num_movies = len(movie_ids)
num_factors = 20

np.random.seed(42)
P = np.random.normal(scale=0.1, size=(num_users, num_factors))
Q = np.random.normal(scale=0.1, size=(num_movies, num_factors))

print("User latent factor matrix shape:", P.shape)
print("Item latent factor matrix shape:", Q.shape)



Initializing matrices...
User latent factor matrix shape: (69878, 20)
Item latent factor matrix shape: (10677, 20)


Define Functions for RMSE and Training

In [9]:
# Cell 6: Define Functions
def validate_input_data(train_data, test_data, P, Q):
    """Validate input data and matrix dimensions"""
    if not isinstance(train_data, pd.DataFrame) or not isinstance(test_data, pd.DataFrame):
        raise TypeError("Train and test data must be pandas DataFrames")
    
    required_cols = ['user_idx', 'movie_idx', 'Rating']
    for col in required_cols:
        if col not in train_data.columns or col not in test_data.columns:
            raise ValueError(f"Missing required column: {col}")
    
    if train_data['Rating'].min() < 0 or test_data['Rating'].min() < 0:
        raise ValueError("Ratings cannot be negative")
    
    max_user_idx = max(train_data['user_idx'].max(), test_data['user_idx'].max())
    max_movie_idx = max(train_data['movie_idx'].max(), test_data['movie_idx'].max())
    
    if max_user_idx >= P.shape[0]:
        raise ValueError(f"User index {max_user_idx} out of bounds for P matrix with shape {P.shape}")
    if max_movie_idx >= Q.shape[0]:
        raise ValueError(f"Movie index {max_movie_idx} out of bounds for Q matrix with shape {Q.shape}")

def compute_rmse(data, P, Q):
    """Compute Root Mean Square Error"""
    errors = []
    for index, row in data.iterrows():
        user = int(row['user_idx'])
        item = int(row['movie_idx'])
        rating = row['Rating']
        
        prediction = np.clip(np.dot(P[user, :], Q[item, :].T), 0.5, 5.0)
        error = rating - prediction
        errors.append(error**2)
    
    rmse = np.sqrt(np.mean(errors))
    return rmse

# Modified error handling in train_matrix_factorization
def train_matrix_factorization(P, Q, train_data, test_data, num_factors, epochs=10, 
                             learning_rate=0.001, reg_param=0.02, early_stopping_rounds=3):
    """Train matrix factorization with early stopping and progress tracking"""
    best_rmse = float('inf')
    patience = early_stopping_rounds
    patience_counter = 0
    best_P, best_Q = P.copy(), Q.copy()  # Initialize with current values
    
    try:
        for epoch in range(epochs):
            print(f"\nEpoch: {epoch+1}")
            total_error = 0
            n_samples = len(train_data)
            
            for index, row in tqdm(train_data.iterrows(), total=len(train_data), desc="Training"):
                user = int(row['user_idx'])
                item = int(row['movie_idx'])
                rating = float(row['Rating'])
                
                prediction = np.clip(np.dot(P[user, :], Q[item, :].T), 0.5, 5.0)
                error = rating - prediction
                
                P[user, :] += learning_rate * (error * Q[item, :] - reg_param * P[user, :])
                Q[item, :] += learning_rate * (error * P[user, :] - reg_param * Q[item, :])
                
                total_error += error ** 2
            
            train_rmse = np.sqrt(total_error / n_samples)
            test_rmse = compute_rmse(test_data, P, Q)
            
            print(f"Training RMSE: {train_rmse:.4f}")
            print(f"Test RMSE: {test_rmse:.4f}")
            
            if test_rmse < best_rmse:
                best_rmse = test_rmse
                patience_counter = 0
                best_P, best_Q = P.copy(), Q.copy()
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"\nEarly stopping triggered after epoch {epoch+1}")
                    return best_P, best_Q
            
            learning_rate *= 0.95
        
        return best_P, best_Q
        
    except Exception as e:
        print(f"An error occurred during training: {str(e)}")
        # Return the best model we had before the error
        return best_P, best_Q

Train the Matrix Factorization Model

In [10]:
# Cell 6: Train Model and Save Results
print("\nTraining model...")
epochs = 1
learning_rate = 0.001
reg_param = 0.02
early_stopping_rounds = 3

# Define the path for saving
save_path = r'C:\Users\anujp\OneDrive\Desktop\MovieRecommendations\models\matrix_factorization\stochastic_gradient_descent'

try:
    P_trained, Q_trained = train_matrix_factorization(
        P=P,
        Q=Q,
        train_data=train_data,
        test_data=test_data,
        num_factors=num_factors,
        epochs=epochs,
        learning_rate=learning_rate,
        reg_param=reg_param,
        early_stopping_rounds=early_stopping_rounds
    )
    
    # Save trained matrices
    with open(os.path.join(save_path, 'P_trained.pkl'), 'wb') as f:
        pickle.dump(P_trained, f)
    
    with open(os.path.join(save_path, 'Q_trained.pkl'), 'wb') as f:
        pickle.dump(Q_trained, f)
    
    # Save all mappings
    with open(os.path.join(save_path, 'user2idx.pkl'), 'wb') as f:
        pickle.dump(user2idx, f)
        
    with open(os.path.join(save_path, 'movie2idx.pkl'), 'wb') as f:
        pickle.dump(movie2idx, f)
        
    with open(os.path.join(save_path, 'idx2user.pkl'), 'wb') as f:
        pickle.dump(idx2user, f)
        
    with open(os.path.join(save_path, 'idx2movie.pkl'), 'wb') as f:
        pickle.dump(idx2movie, f)
    
    print("Training completed and all files saved successfully!")

except Exception as e:
    print(f"An error occurred during training: {str(e)}")


Training model...

Epoch: 1


Training: 100%|██████████| 8000043/8000043 [29:30<00:00, 4518.82it/s]  


Training RMSE: 3.1935
Test RMSE: 3.1940
Training completed and all files saved successfully!


Evaluate the Model on Test Data (MAE and RMSE)

In [11]:
model_load_path = r'C:\Users\anujp\OneDrive\Desktop\MovieRecommendations\models\matrix_factorization\stochastic_gradient_descent'

# Check if the trained matrices are in the local scope, if not, try loading them
if 'P_trained' not in locals() or 'Q_trained' not in locals():
    print("Loading saved model...")
    try:
        P_trained = np.load(f'{model_load_path}\\P_matrix.npy')
        Q_trained = np.load(f'{model_load_path}\\Q_matrix.npy')
    except FileNotFoundError:
        print("Trained model files not found. Please ensure the model was saved correctly.")
        # Use the last state of P and Q if available, assuming P and Q are loaded or defined elsewhere
        P_trained, Q_trained = P, Q

# Modified compute_mae_rmse function with clipping
def compute_mae_rmse(data, P, Q):
    errors = []
    abs_errors = []
    for index, row in tqdm(data.iterrows(), total=len(data), desc="Computing metrics"):
        user = int(row['user_idx'])
        item = int(row['movie_idx'])
        rating = row['Rating']
        
        # Add clipping to match training
        prediction = np.clip(np.dot(P[user, :], Q[item, :].T), 0.5, 5.0)
        error = rating - prediction
        errors.append(error**2)
        abs_errors.append(abs(error))
    
    rmse = np.sqrt(np.mean(errors))
    mae = np.mean(abs_errors)
    return mae, rmse

# Evaluate on test data
try:
    print("\nEvaluating model on test data...")
    mae, rmse = compute_mae_rmse(test_data, P_trained, Q_trained)
    print(f"Test MAE: {mae:.4f}")
    print(f"Test RMSE: {rmse:.4f}")
    
    # Additional evaluation metrics
    print("\nComputing additional metrics...")
    print(f"Number of test samples: {len(test_data)}")
    print(f"Rating range in test data: {test_data['Rating'].min():.1f} to {test_data['Rating'].max():.1f}")
    
    # Compute prediction statistics
    predictions = []
    for index, row in tqdm(test_data.iterrows(), total=len(test_data), desc="Computing predictions"):
        user = int(row['user_idx'])
        item = int(row['movie_idx'])
        pred = np.clip(np.dot(P_trained[user, :], Q_trained[item, :].T), 0.5, 5.0)
        predictions.append(pred)
    
    predictions = np.array(predictions)
    print(f"\nPrediction statistics:")
    print(f"Mean prediction: {predictions.mean():.4f}")
    print(f"Min prediction: {predictions.min():.4f}")
    print(f"Max prediction: {predictions.max():.4f}")
    print(f"Prediction std: {predictions.std():.4f}")

except Exception as e:
    print(f"An error occurred during evaluation: {str(e)}")


Evaluating model on test data...


Computing metrics: 100%|██████████| 1999982/1999982 [04:43<00:00, 7065.20it/s] 


Test MAE: 3.0127
Test RMSE: 3.1940

Computing additional metrics...
Number of test samples: 1999982
Rating range in test data: 0.5 to 5.0


Computing predictions: 100%|██████████| 1999982/1999982 [01:50<00:00, 18045.24it/s]


Prediction statistics:
Mean prediction: 0.5000
Min prediction: 0.5000
Max prediction: 0.6062
Prediction std: 0.0001





Compute Precision@K and Recall@K

In [12]:
def get_top_k_recommendations(P, Q, user_idx, train_data, K=10):
    user_rated_items = train_data[train_data['user_idx'] == user_idx]['movie_idx'].tolist()
    scores = np.dot(Q, P[user_idx, :])
    # Exclude items already rated by user
    scores[user_rated_items] = -np.inf
    top_k_items = np.argsort(-scores)[:K]
    return top_k_items


In [None]:
def compute_precision_recall_at_k(test_data, train_data, P, Q, K=10, threshold=4.0):
    """Compute Precision@K and Recall@K"""
    user_precisions = []
    user_recalls = []
    users_in_test = test_data['user_idx'].unique()
    
    for user in tqdm(users_in_test, desc="Computing Precision/Recall"):
        # Get actual items the user liked in test data
        actual_items = test_data[(test_data['user_idx'] == user) & 
                               (test_data['Rating'] >= threshold)]['movie_idx'].tolist()
        if not actual_items:
            continue
            
        # Get top K recommendations
        recommended_items = get_top_k_recommendations(P, Q, user, train_data, K)
        
        # Compute hits
        hits = set(actual_items) & set(recommended_items)
        precision = len(hits) / K
        recall = len(hits) / len(actual_items)
        
        user_precisions.append(precision)
        user_recalls.append(recall)
    
    # Compute average precision and recall
    avg_precision = np.mean(user_precisions)
    avg_recall = np.mean(user_recalls)
    
    return avg_precision, avg_recall

try:
    # First ensure we have the trained matrices
    if 'P_trained' not in locals() or 'Q_trained' not in locals():
        print("Loading saved model...")
        P_trained = np.load('model/P_matrix.npy')
        Q_trained = np.load('model/Q_matrix.npy')

    # Compute Precision@K and Recall@K
    print("\nComputing Precision and Recall metrics...")
    precision_at_k, recall_at_k = compute_precision_recall_at_k(
        test_data=test_data,
        train_data=train_data,
        P=P_trained,
        Q=Q_trained,
        K=10,
        threshold=4.0
    )
    
    print(f"Precision@10: {precision_at_k:.4f}")
    print(f"Recall@10: {recall_at_k:.4f}")
    
    # Additional metrics
    print("\nDetailed metrics:")
    print(f"Number of users evaluated: {len(test_data['user_idx'].unique())}")
    print(f"Number of items evaluated: {len(test_data['movie_idx'].unique())}")
    print(f"Rating threshold for 'liked' items: {4.0}")

except Exception as e:
    print(f"An error occurred during metric computation: {str(e)}")


Computing Precision and Recall metrics...


Computing Precision/Recall:  59%|█████▉    | 41393/69796 [10:54<07:11, 65.88it/s]

 Compute Normalized Discounted Cumulative Gain (NDCG)

In [None]:
def dcg_at_k(rel_scores, K):
    rel_scores = np.array(rel_scores)[:K]
    dcg = np.sum((2 ** rel_scores - 1) / np.log2(np.arange(2, rel_scores.size + 2)))
    return dcg

def idcg_at_k(rel_scores, K):
    sorted_scores = sorted(rel_scores, reverse=True)
    idcg = dcg_at_k(sorted_scores, K)
    return idcg

def compute_ndcg_at_k(test_data, train_data, P, Q, K=10):
    user_ndcgs = []
    users_in_test = test_data['user_idx'].unique()
    for user in users_in_test:
        # Get actual items and ratings the user has in test data
        user_test_data = test_data[test_data['user_idx'] == user]
        actual_items = user_test_data['movie_idx'].tolist()
        actual_ratings = user_test_data['Rating'].tolist()
        if not actual_items:
            continue
        # Get top K recommendations
        recommended_items = get_top_k_recommendations(P, Q, user, train_data, K)
        # Get relevance scores
        rel_scores = []
        for item in recommended_items:
            if item in actual_items:
                rating = user_test_data[user_test_data['movie_idx'] == item]['Rating'].values[0]
                rel = rating / 5.0  # Normalize rating to [0,1]
            else:
                rel = 0
            rel_scores.append(rel)
        dcg = dcg_at_k(rel_scores, K)
        idcg = idcg_at_k(rel_scores, K)
        ndcg = dcg / idcg if idcg > 0 else 0
        user_ndcgs.append(ndcg)
    avg_ndcg = np.mean(user_ndcgs)
    return avg_ndcg

# Compute NDCG@K
ndcg_at_k = compute_ndcg_at_k(test_data, train_data, P_trained, Q_trained, K=10)
print(f"NDCG@10: {ndcg_at_k:.4f}")


In [None]:
# Create model directory if it doesn't exist
import os
if not os.path.exists('model'):
    os.makedirs('model')
#idx2user.pkl,idx2movie,movie2idx
# Create reverse mappings
idx2user = {idx: user_id for user_id, idx in user2idx.items()}
idx2movie = {idx: movie_id for movie_id, idx in movie2idx.items()}

try:
    # Save user and item latent factors
    with open('model/P_trained.pkl', 'wb') as f:
        pickle.dump(P_trained, f)

    with open('model/Q_trained.pkl', 'wb') as f:
        pickle.dump(Q_trained, f)

    # Save user and movie mappings
    with open('model/user2idx.pkl', 'wb') as f:
        pickle.dump(user2idx, f)

    with open('model/idx2user.pkl', 'wb') as f:
        pickle.dump(idx2user, f)

    with open('model/movie2idx.pkl', 'wb') as f:
        pickle.dump(movie2idx, f)

    with open('model/idx2movie.pkl', 'wb') as f:
        pickle.dump(idx2movie, f)

    # Verify saved files
    saved_files = ['P_trained.pkl', 'Q_trained.pkl', 'user2idx.pkl', 
                   'idx2user.pkl', 'movie2idx.pkl', 'idx2movie.pkl']
    all_files_saved = all(os.path.exists(f'model/{file}') for file in saved_files)
    
    if all_files_saved:
        print("Model and mappings have been saved successfully.")
    else:
        print("Warning: Some files may not have been saved properly.")

except Exception as e:
    print(f"Error saving model: {str(e)}")

In [None]:
def predict_rating(user_id, movie_id, P, Q, user2idx, movie2idx):
   if user_id in user2idx and movie_id in movie2idx:
       user_idx = user2idx[user_id]
       movie_idx = movie2idx[movie_id]
       prediction = np.dot(P[user_idx, :], Q[movie_idx, :].T)
       # Clip the prediction to the rating scale (e.g., 1 to 5)
       prediction = min(max(prediction, 1), 5)
       return prediction
   else:
       return np.nan  # User or movie not in training data

# Example usage
# Choose any specific user ID
sample_user_id = 42  # Replace with any user ID you want
sample_movie_id = df['MovieID'].iloc[0]

predicted_rating = predict_rating(sample_user_id, sample_movie_id, P_trained, Q_trained, user2idx, movie2idx)
print(f"Predicted rating for user {sample_user_id} and movie {sample_movie_id}: {predicted_rating:.4f}")

Get Top-K Recommendations for a User

In [None]:
def recommend_top_k_movies(user_id, P, Q, user2idx, idx2movie, train_data, df, K=10):
    """
    Recommend top K movies for a user with their titles
    
    Parameters:
    -----------
    user_id : int
        The ID of the user to get recommendations for
    P : numpy array
        User latent factors matrix
    Q : numpy array
        Movie latent factors matrix
    user2idx : dict
        Mapping from user ID to matrix index
    idx2movie : dict
        Mapping from matrix index to movie ID
    train_data : pandas DataFrame
        Training data containing user-movie interactions
    df : pandas DataFrame
        Original dataframe containing movie titles
    K : int, default=10
        Number of recommendations to return
    """
    if user_id in user2idx:
        user_idx = user2idx[user_id]
        # Get movies the user has already rated
        user_rated_movies = train_data[train_data['user_idx'] == user_idx]['movie_idx'].tolist()
        # Predict scores for all movies
        scores = np.dot(Q, P[user_idx, :])
        # Exclude movies the user has already rated
        scores[user_rated_movies] = -np.inf
        # Get top K movie indices
        top_k_movie_indices = np.argsort(-scores)[:K]
        # Map indices to movie IDs
        top_k_movie_ids = [idx2movie[idx] for idx in top_k_movie_indices]
        
        # Get predicted ratings
        recommendations = []
        for movie_id in top_k_movie_ids:
            # Get movie title
            movie_title = df[df['MovieID'] == movie_id]['Title'].iloc[0]
            # Get predicted rating
            movie_idx = movie2idx[movie_id]
            predicted_rating = np.clip(np.dot(P[user_idx, :], Q[movie_idx, :].T), 0.5, 5.0)
            recommendations.append((movie_id, movie_title, predicted_rating))
        
        return recommendations
    else:
        return []

# Choose any specific user ID
user_id_to_recommend = 42  # Replace with any user ID you want
recommendations = recommend_top_k_movies(
    user_id=user_id_to_recommend,
    P=P_trained,
    Q=Q_trained,
    user2idx=user2idx,
    idx2movie=idx2movie,
    train_data=train_data,
    df=df,
    K=10
)


# Print recommendations
print(f"\nTop 10 movie recommendations for user {user_id_to_recommend}:")
if recommendations:
    for i, (movie_id, title, pred_rating) in enumerate(recommendations, 1):
        print(f"{i}. {title} (ID: {movie_id}) - Predicted Rating: {pred_rating:.2f}")
else:
    print("No recommendations available for this user.")

# Optional: Print the user's actual ratings for comparison
print(f"\nMovies this user has already rated:")
user_ratings = df[df['UserID'] == user_id_to_recommend].sort_values('Rating', ascending=False)
for _, row in user_ratings.head().iterrows():
    print(f"- {row['Title']} - Actual Rating: {row['Rating']}")

In [None]:
import pickle
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from io import BytesIO

# TMDb API configuration
TMDB_API_KEY = "165b9xxxxxx"
TMDB_BASE_URL = "https://api.themoviedb.org/3"

def load_saved_model():
    """Load the saved model and mappings"""
    try:
        with open('model/P_trained.pkl', 'rb') as f:
            P_trained = pickle.load(f)
        with open('model/Q_trained.pkl', 'rb') as f:
            Q_trained = pickle.load(f)
        with open('model/user2idx.pkl', 'rb') as f:
            user2idx = pickle.load(f)
        with open('model/idx2movie.pkl', 'rb') as f:
            idx2movie = pickle.load(f)
        with open('model/movie2idx.pkl', 'rb') as f:
            movie2idx = pickle.load(f)
            
        return P_trained, Q_trained, user2idx, idx2movie, movie2idx
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        return None

def get_movie_info(movie_title):
    """Get movie information from TMDb API"""
    # Search for the movie
    search_url = f"{TMDB_BASE_URL}/search/movie"
    params = {
        'api_key': TMDB_API_KEY,
        'query': movie_title
    }
    response = requests.get(search_url, params=params)
    data = response.json()
    
    if data['results']:
        movie = data['results'][0]
        poster_path = movie['poster_path']
        if poster_path:
            poster_url = f"https://image.tmdb.org/t/p/w500{poster_path}"
            return movie['title'], poster_url, movie['overview'], movie['release_date']
    return None, None, None, None

def plot_recommendations_with_posters(user_id, df, P_trained, Q_trained, user2idx, idx2movie, movie2idx, K=5):
    """Plot top K movie recommendations with posters"""
    if user_id not in user2idx:
        print("User not found in the dataset")
        return
    
    user_idx = user2idx[user_id]
    scores = np.dot(Q_trained, P_trained[user_idx, :])
    top_k_indices = np.argsort(-scores)[:K]
    
    # Create figure
    fig = plt.figure(figsize=(15, 8))
    plt.suptitle(f'Top {K} Movie Recommendations for User {user_id}', fontsize=16)
    
    for i, idx in enumerate(top_k_indices, 1):
        movie_id = idx2movie[idx]
        movie_title = df[df['MovieID'] == movie_id]['Title'].iloc[0]
        predicted_rating = scores[idx]
        
        # Get movie info from TMDb
        title, poster_url, overview, release_date = get_movie_info(movie_title)
        
        # Create subplot
        ax = plt.subplot(1, K, i)
        
        if poster_url:
            # Display poster
            response = requests.get(poster_url)
            img = Image.open(BytesIO(response.content))
            ax.imshow(img)
            ax.axis('off')
            
            # Add title and rating
            ax.set_title(f"{title}\nPred Rating: {predicted_rating:.2f}", 
                        fontsize=10, pad=5)
        else:
            ax.text(0.5, 0.5, f"No poster available\n{movie_title}\nRating: {predicted_rating:.2f}",
                   ha='center', va='center')
            ax.axis('off')
    
    plt.tight_layout()
    plt.show()

def plot_user_rating_distribution(user_id, df):
    """Plot rating distribution for a specific user"""
    user_ratings = df[df['UserID'] == user_id]['Rating']
    
    plt.figure(figsize=(10, 6))
    sns.histplot(user_ratings, bins=10)
    plt.title(f'Rating Distribution for User {user_id}')
    plt.xlabel('Rating')
    plt.ylabel('Count')
    plt.show()
    
    print(f"\nRating Statistics for User {user_id}:")
    print(f"Average Rating: {user_ratings.mean():.2f}")
    print(f"Number of Ratings: {len(user_ratings)}")

# Load the saved model and original dataset
P_trained, Q_trained, user2idx, idx2movie, movie2idx = load_saved_model()
df = pd.read_csv("C:/Users/anujp/OneDrive/Desktop/MovieRecommendations/data/Final_data/Final_data.csv")

# Example usage
user_id = 42  # You can change this to any user ID
plot_recommendations_with_posters(user_id, df, P_trained, Q_trained, user2idx, idx2movie, movie2idx)
plot_user_rating_distribution(user_id, df)