In [None]:
import pandas as pd
import numpy as np
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, Embedding, Flatten, Concatenate, Dropout
from tensorflow.python.keras.optimizers import adam_v2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from collections import defaultdict
import time


In [None]:
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
class MovieRecommendationSystem:
    """Movie recommender combining genre profiles, item similarity and a neural model.

    Typical use: ``load_data`` -> ``preprocess_data`` -> ``build_content_features``,
    then ``update_user_profile`` as ratings arrive and ``recommend`` to query.
    ``build_deep_learning_model`` is optional and independent of ``recommend``.
    """

    # Genre preferences assigned to each exemplar profile, in creation order.
    _EXEMPLAR_GENRES = {
        "Action Lover": ["Action", "Adventure", "Thriller"],
        "Drama Enthusiast": ["Drama", "Romance", "Biography"],
        "Comedy Fan": ["Comedy", "Animation", "Romance"],
        "Sci-Fi Geek": ["Sci-Fi", "Fantasy", "Adventure"],
        "Horror Buff": ["Horror", "Thriller", "Mystery"],
    }

    def __init__(self):
        """Create an empty recommender; no data is loaded here."""
        # Raw data frames
        self.movies_df = None
        self.ratings_df = None
        self.users_df = None
        # Derived structures
        self.content_features = None
        self.user_item_matrix = None
        # IDs labelling the rows / columns of user_item_matrix (set by preprocess_data)
        self.matrix_user_ids = None
        self.matrix_movie_ids = None
        self.item_similarity = None
        # Models (only collaborative_model is ever assigned, by build_deep_learning_model)
        self.content_model = None
        self.collaborative_model = None
        self.hybrid_model = None
        self.user_encoder = None
        self.movie_encoder = None
        # Maximum number of movie rows kept by load_data
        self.batch_size = 5000
        # User profiles storage
        self.user_profiles = {}
        self.user_history = defaultdict(list)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_genres(raw):
        """Return the list of genre names encoded in one ``genres`` cell.

        Accepts pipe-separated text (``"Action|Drama"``), a list-of-dicts
        literal (``"[{'id': 28, 'name': 'Action'}]"``), an already parsed
        list, or a missing value (NaN / None), which yields ``[]``.
        """
        import ast  # local import: only this helper needs it

        if isinstance(raw, (list, tuple)):
            items = raw
        elif isinstance(raw, str):
            text = raw.strip()
            if not text:
                return []
            if not text.startswith('['):
                return [part.strip() for part in text.split('|') if part.strip()]
            try:
                # literal_eval only evaluates literals, so arbitrary CSV text is safe here
                items = ast.literal_eval(text)
            except (ValueError, SyntaxError):
                return []
            if not isinstance(items, (list, tuple)):
                return []
        else:
            return []

        names = []
        for item in items:
            name = item.get('name') if isinstance(item, dict) else item
            if isinstance(name, str) and name:
                names.append(name)
        return names

    def _genre_lists(self):
        """Return one list of genre names per row of ``movies_df``."""
        column = 'genres_list' if 'genres_list' in self.movies_df.columns else 'genres'
        return [self._parse_genres(cell) for cell in self.movies_df[column]]

    def _genres_for_movie(self, movie_id):
        """Return the genre names of ``movie_id``, or ``[]`` if it is not in ``movies_df``."""
        cells = self.movies_df.loc[self.movies_df['movieId'] == movie_id, 'genres']
        if cells.empty:
            return []
        return self._parse_genres(cells.iloc[0])

    def _movie_positions(self):
        """Map each movie ID to the position of its first row in ``movies_df``.

        Rows of ``item_similarity`` follow row positions, so positions (not
        index labels) are what must be used to address it.
        """
        positions = {}
        for pos, movie_id in enumerate(self.movies_df['movieId'].tolist()):
            positions.setdefault(movie_id, pos)
        return positions

    def _rank_by_score(self, scores, excluded_ids, n):
        """Return up to ``n`` distinct movie IDs by descending score.

        ``scores`` holds one value per row of ``movies_df``. IDs in
        ``excluded_ids`` are skipped; ties keep ``movies_df`` row order. If an
        ID occurs on several rows, its best-scoring row decides its rank.
        """
        if n <= 0:
            return []
        movie_ids = self.movies_df['movieId'].tolist()
        order = np.argsort(-np.asarray(scores, dtype=float), kind='stable')
        seen = set(excluded_ids)
        ranked = []
        for pos in order:
            movie_id = movie_ids[pos]
            if movie_id in seen:
                continue
            seen.add(movie_id)
            ranked.append(movie_id)
            if len(ranked) >= n:
                break
        return ranked

    def _popular_movies(self, n):
        """Return the ``n`` movie IDs with the highest rating count x mean rating."""
        popular_movies = self.ratings_df.groupby('movieId').agg(
            rating_count=('rating', 'count'),
            rating_mean=('rating', 'mean')
        ).reset_index()
        popular_movies['popularity'] = popular_movies['rating_count'] * popular_movies['rating_mean']
        popular_movies = popular_movies.sort_values('popularity', ascending=False)
        return popular_movies.head(n)['movieId'].tolist()

    # ------------------------------------------------------------------
    # Data loading and preparation
    # ------------------------------------------------------------------
    def load_data(self, movies_path, ratings_path, users_path=None):
        """Read the movies, ratings and (optionally) users CSV files.

        Only the first ``self.batch_size`` movie rows are read. A movies file
        that has ``id`` instead of ``movieId`` gets that column renamed.

        Raises:
            KeyError: if the movies file has neither ``movieId`` nor ``id``.
        """
        print("Loading data...")
        # nrows keeps the same first rows as read-everything-then-head(), without
        # parsing the rest of the file, and yields a fresh 0..n-1 index.
        self.movies_df = pd.read_csv(movies_path, nrows=self.batch_size)

        if 'movieId' not in self.movies_df.columns:
            if 'id' not in self.movies_df.columns:
                raise KeyError("Neither 'movieId' nor 'id' column found in movies_metadata.csv")
            # NOTE(review): confirm the renamed 'id' values are in the same ID
            # space as the ratings file's 'movieId' before trusting joins.
            self.movies_df = self.movies_df.rename(columns={'id': 'movieId'})

        self.ratings_df = pd.read_csv(ratings_path)

        if users_path:
            self.users_df = pd.read_csv(users_path)

        print(f"Loaded {len(self.movies_df)} movies and {len(self.ratings_df)} ratings")

    def preprocess_data(self):
        """Add derived movie columns and build the sparse user-item rating matrix.

        Adds ``clean_title`` (title without a ``(YYYY)`` suffix), ``year`` and
        ``genres_list`` to ``movies_df``. ``user_item_matrix`` gets one row per
        distinct user and one column per distinct movie of ``ratings_df``, in
        order of first appearance; the matching IDs are stored in
        ``matrix_user_ids`` and ``matrix_movie_ids``.
        """
        print("Preprocessing data...")

        year_pattern = re.compile(r'\((\d{4})\)')

        def extract_year(title):
            found = year_pattern.search(title)
            return int(found.group(1)) if found else None

        titles = self.movies_df['title'].astype(str)
        self.movies_df['clean_title'] = titles.apply(lambda x: re.sub(r'\(\d{4}\)', '', x).strip())
        self.movies_df['year'] = titles.apply(extract_year)

        # Parsed genre names, whatever encoding the source file used
        self.movies_df['genres_list'] = self.movies_df['genres'].apply(self._parse_genres)

        # factorize numbers IDs in order of first appearance
        user_codes, user_ids = pd.factorize(self.ratings_df['userId'])
        movie_codes, movie_ids = pd.factorize(self.ratings_df['movieId'])
        ratings = self.ratings_df['rating'].to_numpy()

        self.user_item_matrix = csr_matrix(
            (ratings, (user_codes, movie_codes)),
            shape=(len(user_ids), len(movie_ids))
        )
        self.matrix_user_ids = user_ids
        self.matrix_movie_ids = movie_ids
        print("Data preprocessing complete")

    def build_content_features(self):
        """Build TF-IDF genre features and the movie-movie cosine similarity matrix.

        Row ``i`` of ``item_similarity`` corresponds to row position ``i`` of
        ``movies_df``; ``content_features`` is indexed by ``movieId``.
        """
        print("Building content features...")

        # Vectorise genre names only, so that list-of-dict encodings do not
        # contribute their 'id' / 'name' keys and numeric IDs as tokens.
        genre_text = [' '.join(genres) for genres in self._genre_lists()]
        tfidf = TfidfVectorizer()
        tfidf_matrix = tfidf.fit_transform(genre_text)

        self.content_features = pd.DataFrame(
            tfidf_matrix.toarray(),
            index=self.movies_df['movieId']
        )

        self.item_similarity = cosine_similarity(tfidf_matrix)

        print("Content features built successfully")

    # ------------------------------------------------------------------
    # User profiles
    # ------------------------------------------------------------------
    def create_exemplar_profiles(self, num_profiles=5):
        """Create up to five predefined demo profiles (``exemplar_1`` ...).

        Returns:
            The IDs of all profiles currently stored, not only the new ones.
        """
        print("Creating exemplar user profiles...")

        profile_types = list(self._EXEMPLAR_GENRES)[:num_profiles]
        for number, profile_type in enumerate(profile_types, start=1):
            self.user_profiles[f"exemplar_{number}"] = {
                "name": profile_type,
                # copy so profiles never share (and mutate) the class-level lists
                "preferred_genres": list(self._EXEMPLAR_GENRES[profile_type]),
                "disliked_genres": [],
                "favorite_movies": [],
                "rating_history": {}
            }

        print(f"Created {len(self.user_profiles)} exemplar profiles")
        return list(self.user_profiles.keys())

    def update_user_profile(self, user_id, movie_id, rating):
        """Record a rating and update the user's genre preferences.

        A rating >= 4.0 adds the movie to the favourites and its genres to the
        preferred genres; a rating <= 2.0 adds its genres to the disliked
        genres. Unknown users get a new profile. A movie that is missing from
        ``movies_df`` (or has no genres) is still recorded, without touching
        the genre lists.
        """
        profile = self.user_profiles.setdefault(user_id, {
            "name": f"User_{user_id}",
            "preferred_genres": [],
            "disliked_genres": [],
            "favorite_movies": [],
            "rating_history": {}
        })

        profile["rating_history"][movie_id] = rating

        if rating >= 4.0:
            if movie_id not in profile["favorite_movies"]:
                profile["favorite_movies"].append(movie_id)
            target_genres = profile["preferred_genres"]
        elif rating <= 2.0:
            target_genres = profile["disliked_genres"]
        else:
            target_genres = None

        if target_genres is not None:
            for genre in self._genres_for_movie(movie_id):
                if genre not in target_genres:
                    target_genres.append(genre)

        # Timestamped history drives real_time_recommendation
        self.user_history[user_id].append({
            "movie_id": movie_id,
            "rating": rating,
            "timestamp": time.time()
        })

    # ------------------------------------------------------------------
    # Recommenders
    # ------------------------------------------------------------------
    def content_based_recommendation(self, user_id, n=10):
        """Recommend unrated movies whose genres match the user's profile.

        Each preferred genre of a movie scores +1 and each disliked genre -1.
        Users without preferred genres get the most popular movies instead.

        Returns:
            Up to ``n`` movie IDs; ``[]`` for an unknown user.
        """
        if user_id not in self.user_profiles:
            print(f"User {user_id} not found")
            return []

        user_profile = self.user_profiles[user_id]
        preferred_genres = user_profile["preferred_genres"]

        if not preferred_genres:
            print(f"No genre preferences for user {user_id}, returning popular movies")
            return self._popular_movies(n)

        liked = set(preferred_genres)
        disliked = set(user_profile["disliked_genres"])
        rated = user_profile["rating_history"]

        genre_scores = []
        movie_ids = self.movies_df['movieId'].tolist()
        for movie_id, genres in zip(movie_ids, self._genre_lists()):
            if movie_id in rated:
                continue  # never recommend something already rated
            score = sum((genre in liked) - (genre in disliked) for genre in genres)
            genre_scores.append((movie_id, score))

        # Stable sort: ties keep movies_df order
        genre_scores.sort(key=lambda pair: pair[1], reverse=True)
        return [movie_id for movie_id, _ in genre_scores[:n]]

    def collaborative_filtering_recommendation(self, user_id, n=10):
        """Recommend movies similar to the ones the user rated.

        Every rated movie found in ``movies_df`` contributes its row of
        ``item_similarity`` weighted by the user's rating. Users without
        ratings get the most popular movies instead.

        Returns:
            Up to ``n`` unrated movie IDs; ``[]`` for an unknown user or when
            none of the rated movies is in ``movies_df``.
        """
        if user_id not in self.user_profiles:
            print(f"User {user_id} not found")
            return []

        user_ratings = self.user_profiles[user_id]["rating_history"]

        if not user_ratings:
            print(f"No rating history for user {user_id}, returning popular movies")
            return self._popular_movies(n)

        positions = self._movie_positions()
        totals = None
        for movie_id, user_rating in user_ratings.items():
            pos = positions.get(movie_id)
            if pos is None:
                continue
            weighted = np.asarray(self.item_similarity[pos], dtype=float) * user_rating
            totals = weighted if totals is None else totals + weighted

        if totals is None:
            return []

        return self._rank_by_score(totals, user_ratings, n)

    def hybrid_recommendation(self, user_id, content_weight=0.5, n=10):
        """Blend content-based and collaborative recommendations by rank.

        Both recommenders are asked for ``2 * n`` movies. A movie at rank ``i``
        (0-based) of a list earns ``weight * (2 * n - i)``, where the weight is
        ``content_weight`` for the content list and ``1 - content_weight`` for
        the collaborative one; scores from both lists are added.

        Returns:
            Up to ``n`` movie IDs by descending combined score.
        """
        pool = n * 2
        content_recs = self.content_based_recommendation(user_id, n=pool)
        collab_recs = self.collaborative_filtering_recommendation(user_id, n=pool)

        content_dict = {movie_id: content_weight * (pool - rank)
                        for rank, movie_id in enumerate(content_recs)}
        collab_dict = {movie_id: (1 - content_weight) * (pool - rank)
                       for rank, movie_id in enumerate(collab_recs)}

        hybrid_scores = defaultdict(float)
        for partial in (content_dict, collab_dict):
            for movie_id, score in partial.items():
                hybrid_scores[movie_id] += score

        ranked = sorted(hybrid_scores.items(), key=lambda pair: pair[1], reverse=True)
        return [movie_id for movie_id, _ in ranked[:n]]

    def real_time_recommendation(self, user_id, n=10):
        """Recommend movies similar to what the user rated well in the last 24 hours.

        Movies rated >= 3.5 within the last day seed the search; each candidate
        is scored by its highest similarity to any seed. Falls back to
        ``hybrid_recommendation`` when there is no such recent activity, and
        tops the result up with hybrid recommendations when fewer than ``n``
        candidates are found.
        """
        history = self.user_history.get(user_id)
        if not history:
            return self.hybrid_recommendation(user_id, n=n)

        # Most recent first
        recent_history = sorted(history, key=lambda entry: entry["timestamp"], reverse=True)

        current_time = time.time()
        day_in_seconds = 24 * 60 * 60
        recent_movies = [entry["movie_id"] for entry in recent_history
                         if entry["rating"] >= 3.5
                         and current_time - entry["timestamp"] < day_in_seconds]

        if not recent_movies:
            return self.hybrid_recommendation(user_id, n=n)

        positions = self._movie_positions()
        best = None
        for movie_id in recent_movies:
            pos = positions.get(movie_id)
            if pos is None:
                continue
            row = np.asarray(self.item_similarity[pos], dtype=float)
            best = row if best is None else np.maximum(best, row)

        top_movies = []
        if best is not None:
            rated = self.user_profiles.get(user_id, {}).get("rating_history", {})
            excluded = set(rated) | set(recent_movies)
            top_movies = self._rank_by_score(best, excluded, n)

        # Not enough candidates: supplement with hybrid recommendations
        if len(top_movies) < n:
            for movie_id in self.hybrid_recommendation(user_id, n=n):
                if movie_id not in top_movies:
                    top_movies.append(movie_id)
                    if len(top_movies) >= n:
                        break

        return top_movies

    # ------------------------------------------------------------------
    # Neural collaborative model
    # ------------------------------------------------------------------
    def build_deep_learning_model(self):
        """Train an embedding-based rating predictor on ``ratings_df``.

        User and movie IDs are label-encoded, embedded (50 dimensions each),
        concatenated and passed through two dense layers to predict the
        rating. 20% of the ratings are held out for the printed test RMSE.
        The model and both encoders are stored on the instance.
        """
        print("Building deep learning model...")

        user_encoder = LabelEncoder()
        movie_encoder = LabelEncoder()

        # Keep the encoded IDs in local arrays instead of adding helper columns
        # to a DataFrame built from self.ratings_df, which (depending on the
        # pandas version) may share data with the stored frame.
        user_codes = user_encoder.fit_transform(self.ratings_df['userId'])
        movie_codes = movie_encoder.fit_transform(self.ratings_df['movieId'])
        targets = self.ratings_df['rating'].to_numpy()

        # Embedding table sizes
        n_users = len(user_encoder.classes_)
        n_movies = len(movie_encoder.classes_)

        train_idx, test_idx = train_test_split(
            np.arange(len(targets)), test_size=0.2, random_state=42
        )

        # User branch
        user_input = Input(shape=(1,), name='user_input')
        user_embedding = Embedding(n_users, 50, name='user_embedding')(user_input)
        user_vec = Flatten(name='user_flatten')(user_embedding)

        # Movie branch
        movie_input = Input(shape=(1,), name='movie_input')
        movie_embedding = Embedding(n_movies, 50, name='movie_embedding')(movie_input)
        movie_vec = Flatten(name='movie_flatten')(movie_embedding)

        concat = Concatenate()([user_vec, movie_vec])

        dense1 = Dense(128, activation='relu')(concat)
        dropout1 = Dropout(0.2)(dense1)
        dense2 = Dense(64, activation='relu')(dropout1)
        dropout2 = Dropout(0.2)(dense2)

        # Single linear unit: the predicted rating
        output = Dense(1)(dropout2)

        model = Model([user_input, movie_input], output)
        # adam_v2 is a module; the optimizer class inside it is Adam
        model.compile(loss='mean_squared_error', optimizer=adam_v2.Adam(learning_rate=0.001))

        print("Training model...")
        model.fit(
            [user_codes[train_idx], movie_codes[train_idx]],
            targets[train_idx],
            batch_size=64,
            epochs=5,
            validation_split=0.1,
            verbose=1
        )

        # With no extra metrics, evaluate() returns the MSE loss only
        eval_results = model.evaluate(
            [user_codes[test_idx], movie_codes[test_idx]],
            targets[test_idx],
            verbose=0
        )
        print(f"Test RMSE: {np.sqrt(eval_results)}")

        self.collaborative_model = model
        self.user_encoder = user_encoder
        self.movie_encoder = movie_encoder

        print("Deep learning model built successfully")

    # ------------------------------------------------------------------
    # Public entry points
    # ------------------------------------------------------------------
    def get_movie_details(self, movie_ids):
        """Return ``movieId`` / ``title`` / ``genres`` dicts for the given IDs.

        IDs that are not present in ``movies_df`` are silently skipped;
        ``genres`` is the raw value stored in the data frame.
        """
        movie_details = []

        for movie_id in movie_ids:
            matches = self.movies_df[self.movies_df['movieId'] == movie_id]
            if matches.empty:
                continue
            first = matches.iloc[0]
            movie_details.append({
                'movieId': movie_id,
                'title': first['title'],
                'genres': first['genres']
            })

        return movie_details

    def recommend(self, user_id, method='hybrid', n=10):
        """Return movie detail dicts recommended for ``user_id``.

        ``method`` is ``'content'``, ``'collaborative'`` or ``'real-time'``;
        any other value selects the hybrid recommender.
        """
        if method == 'content':
            movie_ids = self.content_based_recommendation(user_id, n)
        elif method == 'collaborative':
            movie_ids = self.collaborative_filtering_recommendation(user_id, n)
        elif method == 'real-time':
            movie_ids = self.real_time_recommendation(user_id, n)
        else:
            movie_ids = self.hybrid_recommendation(user_id, n=n)

        return self.get_movie_details(movie_ids)


In [None]:
if __name__ == "__main__":
    # Demo run. The statements stay at module level (not inside a function) so
    # that `recommender` and the result lists remain available to later cells.

    DATA_DIR = "/content/drive/MyDrive/personalizedrecommendationsystem"

    # Create recommendation system and prepare its data
    recommender = MovieRecommendationSystem()
    recommender.load_data(
        movies_path=f"{DATA_DIR}/movies_metadata.csv",
        ratings_path=f"{DATA_DIR}/ratings.csv"
    )
    recommender.preprocess_data()
    recommender.build_content_features()

    # Create exemplar profiles and simulate ratings for the first one
    user_ids = recommender.create_exemplar_profiles()
    user_id = user_ids[0]

    def _first_movie_ids(genre, count):
        """Return the IDs of the first `count` movies whose genres text mentions `genre`."""
        # na=False: rows with missing genres count as "no match" instead of
        # producing a NaN mask that cannot be used for boolean indexing.
        mask = recommender.movies_df['genres'].str.contains(genre, na=False)
        return recommender.movies_df.loc[mask, 'movieId'].head(count).tolist()

    def _show_recommendations(heading, method):
        """Print a numbered list of 5 recommendations for `user_id` and return it."""
        print(f"\n{heading}:")
        recs = recommender.recommend(user_id, method=method, n=5)
        for rank, movie in enumerate(recs, 1):
            print(f"{rank}. {movie['title']} - {movie['genres']}")
        return recs

    # Rate some action movies highly
    action_movies = _first_movie_ids('Action', 5)
    for movie_id in action_movies:
        recommender.update_user_profile(user_id, movie_id, 5.0)

    # Rate some drama movies poorly
    drama_movies = _first_movie_ids('Drama', 3)
    for movie_id in drama_movies:
        recommender.update_user_profile(user_id, movie_id, 1.5)

    # Generate recommendations using different methods
    content_recs = _show_recommendations("Content-based recommendations", 'content')
    collab_recs = _show_recommendations("Collaborative filtering recommendations", 'collaborative')
    hybrid_recs = _show_recommendations("Hybrid recommendations", 'hybrid')
    realtime_recs = _show_recommendations("Real-time recommendations", 'real-time')



Loading data...


  self.movies_df = pd.read_csv(movies_path)


Loaded 5000 movies and 26024289 ratings
Preprocessing data...
Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')
Data preprocessing complete
Building content features...
Content features built successfully
Creating exemplar user profiles...
Created 5 exemplar profiles

Content-based recommendations:
1. Drop Zone - [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 53, 'name': 'Thriller'}]
2. Street Fighter - [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 53, 'name': 'Thriller'}]
3. Cliffhanger - [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 53, 'name': 'Thriller'}]
4. Mission: Impossible - [{'id': 12, 'name': 'Adventure'}, {'id': 28, 'name': 'Action'}, {'id': 53, 'name': 'Thriller'}]
5. The Quest - [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}]

Collaborative filtering recommendations:
1. Drop Zone - [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 53, 'name': 'T

In [None]:
# MovieRecommendationSystem defines no save() method, so recommender.save(...)
# raises AttributeError. The object that can be written to an .h5 file is the
# Keras model stored in `collaborative_model`, which stays None until
# build_deep_learning_model() has been run.
if recommender.collaborative_model is None:
    print("Nothing to save: run recommender.build_deep_learning_model() first")
else:
    recommender.collaborative_model.save('recommender.h5')

AttributeError: 'MovieRecommendationSystem' object has no attribute 'save'