Imports

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import zipfile
import requests
from io import BytesIO
import os
import pickle

# Set plot style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set(style="whitegrid")

# Download NLTK resources for sentiment analysis
nltk.download('vader_lexicon', quiet=True)

True

Data preparation

In [4]:
def download_movielens_dataset(size='small'):
    """
    Download the MovieLens dataset
    size: 'small' (100K) or 'full' (25M)
    """
    if size == 'small':
        url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
    else:
        url = "https://files.grouplens.org/datasets/movielens/ml-latest.zip"
    
    print(f"Downloading MovieLens {size} dataset...")
    r = requests.get(url)
    z = zipfile.ZipFile(BytesIO(r.content))
    z.extractall('data/')
    print("Dataset downloaded successfully!")
    
    if size == 'small':
        return 'data/ml-latest-small'
    else:
        return 'data/ml-latest'

def load_and_preprocess_data(dataset_path):
    """
    Load and preprocess the MovieLens dataset
    """
    print("Loading and preprocessing data...")
    
    # Load the data
    movies = pd.read_csv(f"{dataset_path}/movies.csv")
    ratings = pd.read_csv(f"{dataset_path}/ratings.csv")
    
    # Optional: Load tags if available for sentiment analysis
    try:
        tags = pd.read_csv(f"{dataset_path}/tags.csv")
    except:
        tags = None
    
    # Handle missing values in movies
    movies = movies.dropna(subset=['title', 'genres'])
    
    # Clean and preprocess movie genres
    movies['genres'] = movies['genres'].str.replace('|', ',')
    movies['year'] = movies['title'].str.extract(r'\((\d{4})\)').fillna(0).astype(int)
    movies['clean_title'] = movies['title'].apply(lambda x: x.split('(')[0].strip())
    
    # Clean ratings data
    ratings = ratings.dropna()
    
    # Normalize ratings
    scaler = MinMaxScaler()
    ratings['normalized_rating'] = scaler.fit_transform(ratings[['rating']])
    
    # Create movie features dataframe with average rating and rating count
    movie_features = ratings.groupby('movieId').agg(
        avg_rating=('rating', 'mean'),
        rating_count=('rating', 'count')
    ).reset_index()
    
    # Merge with movies dataframe
    movies = movies.merge(movie_features, on='movieId', how='left')
    
    # Replace NaN values with 0 for rating metrics
    movies['avg_rating'] = movies['avg_rating'].fillna(0)
    movies['rating_count'] = movies['rating_count'].fillna(0)
    
    # Create genre matrix for content-based filtering
    genre_df = create_genre_matrix(movies)
    
    return movies, ratings, tags, genre_df

def create_genre_matrix(movies):
    """
    Create a matrix of movies and their genres
    """
    # Create a list of all genres
    genres = set()
    for genre_list in movies['genres'].str.split(','):
        if isinstance(genre_list, list):
            genres.update(genre_list)
    
    genres = sorted(list(genres))
    if '(no genres listed)' in genres:
        genres.remove('(no genres listed)')
    
    # Create a dataframe with one-hot encoding for genres
    genre_df = pd.DataFrame(0, index=movies.index, columns=genres)
    
    for i, genre_list in enumerate(movies['genres'].str.split(',')):
        if isinstance(genre_list, list):
            for genre in genre_list:
                if genre in genres:
                    genre_df.loc[i, genre] = 1
    
    # Add movieId column to the dataframe
    genre_df['movieId'] = movies['movieId'].values
    
    return genre_df

def prepare_data():
    """
    Main function to prepare the data
    """
    # Check if data is already downloaded
    if not os.path.exists('data/ml-latest-small'):
        dataset_path = download_movielens_dataset(size='small')
    else:
        dataset_path = 'data/ml-latest-small'
    
    # Load and preprocess the data
    movies, ratings, tags, genre_df = load_and_preprocess_data(dataset_path)
    
    # Split data into training and testing sets
    train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)
    
    print(f"Data preparation complete: {len(movies)} movies, {len(ratings)} ratings")
    return movies, ratings, tags, genre_df, train_data, test_data

if __name__ == "__main__":
    movies, ratings, tags, genre_df, train_data, test_data = prepare_data()

# Create data directory if it doesn't exist
if not os.path.exists('data'):
    os.makedirs('data')

# Check if data is already downloaded
if not os.path.exists('data/ml-latest-small'):
    dataset_path = download_movielens_dataset(size='small')
else:
    dataset_path = 'data/ml-latest-small'

# Load and preprocess the data
movies, ratings, tags, genre_df = load_and_preprocess_data(dataset_path)

# Split data into training and testing sets
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

Loading and preprocessing data...
Data preparation complete: 9742 movies, 100836 ratings
Loading and preprocessing data...


Collaborative Filtering

In [6]:
class CollaborativeFiltering:
    def __init__(self, ratings_df=None):
        self.model = None
        self.ratings_df = ratings_df
        self.is_trained = False
        self.user_factors = None
        self.item_factors = None
        self.user_mapping = None
        self.item_mapping = None
        self.reverse_user_mapping = None
        self.reverse_item_mapping = None
        
    def prepare_data(self, ratings_df=None):
        """
        Prepare data for SVD model
        """
        if ratings_df is not None:
            self.ratings_df = ratings_df
            
        if self.ratings_df is None:
            raise ValueError("Ratings dataframe is required")
            
        # Create user and item mapping dictionaries
        unique_users = self.ratings_df['userId'].unique()
        unique_movies = self.ratings_df['movieId'].unique()
        
        self.user_mapping = {user_id: i for i, user_id in enumerate(unique_users)}
        self.item_mapping = {movie_id: i for i, movie_id in enumerate(unique_movies)}
        
        self.reverse_user_mapping = {i: user_id for user_id, i in self.user_mapping.items()}
        self.reverse_item_mapping = {i: movie_id for movie_id, i in self.item_mapping.items()}
        
        # Create rating matrix (users x items)
        n_users = len(unique_users)
        n_items = len(unique_movies)
        
        # Initialize rating matrix
        rating_matrix = np.zeros((n_users, n_items))
        
        # Fill rating matrix
        for _, row in self.ratings_df.iterrows():
            user_idx = self.user_mapping[row['userId']]
            item_idx = self.item_mapping[row['movieId']]
            rating_matrix[user_idx, item_idx] = row['rating']
        
        return rating_matrix
    
    def train(self, ratings_df=None, n_components=100):
        """
        Train the collaborative filtering model using SVD
        """
        rating_matrix = self.prepare_data(ratings_df)
        
        print("Training collaborative filtering model...")
        # Apply SVD
        self.model = TruncatedSVD(n_components=n_components, random_state=42)
        self.item_factors = self.model.fit_transform(rating_matrix.T)
        
        # Calculate user factors
        self.user_factors = rating_matrix @ self.item_factors @ np.linalg.pinv(
            np.diag(self.model.singular_values_)
        )
        
        self.is_trained = True
        print("Training complete!")
        
    def predict(self, user_id, movie_id):
        """
        Predict the rating for a given user and movie
        """
        if not self.is_trained:
            raise ValueError("Model is not trained yet")
            
        # Convert user_id and movie_id to internal indices
        if user_id not in self.user_mapping or movie_id not in self.item_mapping:
            # Return average rating if user or movie not in training data
            return 3.0
            
        user_idx = self.user_mapping[user_id]
        movie_idx = self.item_mapping[movie_id]
        
        # Get user and item factors
        user_vec = self.user_factors[user_idx]
        item_vec = self.item_factors[movie_idx]
        
        # Make prediction
        prediction = np.dot(user_vec, item_vec) / np.linalg.norm(user_vec) / np.linalg.norm(item_vec)
        
        # Scale prediction to match rating scale
        scaled_prediction = (prediction + 1) * 2.5
        
        # Clip prediction to rating range
        return max(0.5, min(5.0, scaled_prediction))
    
    def get_top_n_recommendations(self, user_id, movie_list, n=5):
        """
        Get the top N movie recommendations for a given user
        """
        if not self.is_trained:
            raise ValueError("Model is not trained yet")
            
        # Predict ratings for all movies in the list
        predictions = []
        for movie_id in movie_list:
            pred_rating = self.predict(user_id, movie_id)
            predictions.append((movie_id, pred_rating))
            
        # Sort predictions by rating (descending)
        predictions.sort(key=lambda x: x[1], reverse=True)
        
        # Return top N movie IDs
        top_n_movie_ids = [movie_id for movie_id, _ in predictions[:n]]
        return top_n_movie_ids
    
    def save_model(self, filepath='models/cf_model.pkl'):
        """
        Save the model to a file
        """
        if not self.is_trained:
            raise ValueError("Cannot save untrained model")
            
        directory = os.path.dirname(filepath)
        if directory and not os.path.exists(directory):
            os.makedirs(directory)
            
        model_data = {
            'user_factors': self.user_factors,
            'item_factors': self.item_factors,
            'user_mapping': self.user_mapping,
            'item_mapping': self.item_mapping,
            'reverse_user_mapping': self.reverse_user_mapping,
            'reverse_item_mapping': self.reverse_item_mapping,
            'singular_values': self.model.singular_values_ if self.model else None
        }
            
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"Model saved to {filepath}")
        
    def load_model(self, filepath='models/cf_model.pkl'):
        """
        Load the model from a file
        """
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)
            
        self.user_factors = model_data['user_factors']
        self.item_factors = model_data['item_factors']
        self.user_mapping = model_data['user_mapping']
        self.item_mapping = model_data['item_mapping']
        self.reverse_user_mapping = model_data['reverse_user_mapping']
        self.reverse_item_mapping = model_data['reverse_item_mapping']
        
        # Recreate the SVD model
        self.model = TruncatedSVD(n_components=self.user_factors.shape[1], random_state=42)
        if 'singular_values' in model_data and model_data['singular_values'] is not None:
            self.model.singular_values_ = model_data['singular_values']
            
        self.is_trained = True
        print(f"Model loaded from {filepath}")

# Create directory for models if it doesn't exist
if not os.path.exists('models'):
    os.makedirs('models')

# Define model file path
cf_model_path = 'models/cf_model.pkl'

# Create and train a collaborative filtering model
cf_model = CollaborativeFiltering(train_data)
cf_model.train(n_components=50)
cf_model.save_model(cf_model_path)

Training collaborative filtering model...
Training complete!
Model saved to models/cf_model.pkl


Content Based Filtering

In [7]:
class ContentBasedFiltering:
    def __init__(self, movies_df=None, genre_df=None):
        self.movies_df = movies_df
        self.genre_df = genre_df
        self.similarity_matrix = None
        self.feature_names = None
    
    def prepare_data(self, movies_df=None, genre_df=None):
        """
        Prepare data for content-based filtering
        """
        if movies_df is not None:
            self.movies_df = movies_df
        if genre_df is not None:
            self.genre_df = genre_df
            
        if self.movies_df is None or self.genre_df is None:
            raise ValueError("Movies dataframe and genre dataframe are required")
        
        # Create a copy of the genre dataframe without the movieId column
        genre_features = self.genre_df.drop(columns=['movieId'])
        
        # Store feature names
        self.feature_names = genre_features.columns.tolist()
        
        # Include movie year in features (normalized)
        if 'year' in self.movies_df.columns:
            year_min = self.movies_df['year'].min()
            year_max = self.movies_df['year'].max()
            normalized_year = (self.movies_df['year'] - year_min) / (year_max - year_min)
            genre_features['year'] = normalized_year.values
            self.feature_names.append('year')
            
        # Include average rating in features (normalized)
        if 'avg_rating' in self.movies_df.columns:
            rating_min = self.movies_df['avg_rating'].min()
            rating_max = self.movies_df['avg_rating'].max()
            if rating_max > rating_min:
                normalized_rating = (self.movies_df['avg_rating'] - rating_min) / (rating_max - rating_min)
                genre_features['avg_rating'] = normalized_rating.values
                self.feature_names.append('avg_rating')
                
        return genre_features
    
    def train(self, movies_df=None, genre_df=None):
        """
        Train the content-based filtering model
        """
        genre_features = self.prepare_data(movies_df, genre_df)
        
        print("Computing similarity matrix for content-based filtering...")
        self.similarity_matrix = cosine_similarity(genre_features)
        print("Similarity matrix computation complete!")
        
    def get_similar_movies(self, movie_idx, n=10):
        """
        Get similar movies for a given movie index
        """
        if self.similarity_matrix is None:
            raise ValueError("Model is not trained yet")
            
        # Get similarity scores for the movie
        movie_similarities = self.similarity_matrix[movie_idx]
        
        # Get the indices of the top N similar movies (excluding itself)
        similar_indices = np.argsort(movie_similarities)[::-1][1:n+1]
        
        # Get the movieIds for these indices
        similar_movie_ids = self.movies_df.iloc[similar_indices]['movieId'].tolist()
        similarity_scores = movie_similarities[similar_indices].tolist()
        
        return list(zip(similar_movie_ids, similarity_scores))
    
    def get_recommendations_by_genres(self, genre_preferences, n=5):
        """
        Get movie recommendations based on genre preferences
        genre_preferences: dict of genres and their weights
        """
        if self.similarity_matrix is None:
            raise ValueError("Model is not trained yet")
            
        # If feature_names is not available, extract it from the genre_df
        if self.feature_names is None:
            # Generate feature names from genre_df
            self.feature_names = [col for col in self.genre_df.columns if col != 'movieId']
            
            # Add additional features if they exist in the movies_df
            if 'year' in self.movies_df.columns:
                self.feature_names.append('year')
            if 'avg_rating' in self.movies_df.columns:
                self.feature_names.append('avg_rating')
        
        # Get a list of genres from genre_df (excluding movieId)
        genre_columns = [col for col in self.genre_df.columns if col != 'movieId']
        
        # Create a user profile vector based on genre preferences
        genre_features = self.prepare_data()
        user_profile = np.zeros(genre_features.shape[1])
        
        # Set weights for each genre that exists in both genre_preferences and genre_columns
        for i, col in enumerate(genre_features.columns):
            if col in genre_preferences:
                user_profile[i] = genre_preferences[col]
        
        # Compute similarity between user profile and all movies
        similarities = cosine_similarity([user_profile], genre_features)[0]
        
        # Get top N movies
        movie_indices = np.argsort(similarities)[::-1][:n]
        recommended_movie_ids = self.movies_df.iloc[movie_indices]['movieId'].tolist()
        
        return recommended_movie_ids
    
    def save_model(self, filepath='models/cb_model.pkl'):
        """
        Save the model to a file
        """
        if self.similarity_matrix is None:
            raise ValueError("Cannot save untrained model")
            
        directory = os.path.dirname(filepath)
        if directory and not os.path.exists(directory):
            os.makedirs(directory)
            
        model_data = {
            'similarity_matrix': self.similarity_matrix,
            'feature_names': self.feature_names
        }
            
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"Model saved to {filepath}")
        
    def load_model(self, filepath='models/cb_model.pkl'):
        """
        Load the model from a file
        """
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)
            
        self.similarity_matrix = model_data['similarity_matrix']
        
        # Try to load feature_names if available in the model file
        if 'feature_names' in model_data:
            self.feature_names = model_data['feature_names']
        # Otherwise feature_names will be generated in get_recommendations_by_genres
            
        print(f"Model loaded from {filepath}")

# Define model file path
cb_model_path = 'models/cb_model.pkl'

# Create and train a content-based filtering model
cb_model = ContentBasedFiltering(movies, genre_df)
cb_model.train()
cb_model.save_model(cb_model_path)

Computing similarity matrix for content-based filtering...
Similarity matrix computation complete!
Model saved to models/cb_model.pkl


Sentiment Analysis

In [8]:
class SentimentAnalyzer:
    def __init__(self):
        try:
            nltk.data.find('vader_lexicon')
        except LookupError:
            print("Downloading NLTK resources...")
            nltk.download('vader_lexicon')
            
        self.analyzer = SentimentIntensityAnalyzer()
    
    def analyze_text(self, text):
        """
        Analyze sentiment of a text and return a score
        """
        if pd.isna(text) or text == "":
            return 0
            
        sentiment = self.analyzer.polarity_scores(text)
        return sentiment['compound']  # Compound score from -1 (negative) to 1 (positive)
    
    def analyze_tags(self, tags_df):
        """
        Analyze sentiment of movie tags
        """
        if tags_df is None:
            print("No tags data available for sentiment analysis")
            return pd.DataFrame()
            
        print("Analyzing sentiment of movie tags...")
        
        # Group tags by movieId
        grouped_tags = tags_df.groupby('movieId')['tag'].apply(list).reset_index()
        
        # Calculate sentiment for each movie's tags
        sentiment_scores = []
        
        for _, row in grouped_tags.iterrows():
            movie_id = row['movieId']
            tags = row['tag']
            
            # Calculate average sentiment score for all tags
            scores = [self.analyze_text(tag) for tag in tags]
            avg_score = np.mean(scores) if scores else 0
            
            sentiment_scores.append({
                'movieId': movie_id,
                'sentiment_score': avg_score,
                'tag_count': len(tags)
            })
            
        sentiment_df = pd.DataFrame(sentiment_scores)
        print(f"Sentiment analysis complete for {len(sentiment_df)} movies")
        
        return sentiment_df

if __name__ == "__main__":
    from data_preparation import prepare_data
    
    # Load and prepare data
    movies, ratings, tags, genre_df, train_data, test_data = prepare_data()
    
    # Initialize sentiment analyzer
    sentiment_analyzer = SentimentAnalyzer()
    
    # Analyze sentiment of tags if available
    if tags is not None:
        sentiment_df = sentiment_analyzer.analyze_tags(tags)
        
        # Print top 5 movies with positive sentiment
        print("\nTop 5 Movies with Positive Sentiment:")
        top_positive = sentiment_df.sort_values('sentiment_score', ascending=False).head(5)
        for _, row in top_positive.iterrows():
            movie_info = movies[movies['movieId'] == row['movieId']].iloc[0]
            print(f"- {movie_info['title']} (sentiment: {row['sentiment_score']:.2f})")
            
        # Print top 5 movies with negative sentiment
        print("\nTop 5 Movies with Negative Sentiment:")
        top_negative = sentiment_df.sort_values('sentiment_score').head(5)
        for _, row in top_negative.iterrows():
            movie_info = movies[movies['movieId'] == row['movieId']].iloc[0]
            print(f"- {movie_info['title']} (sentiment: {row['sentiment_score']:.2f})")

# Initialize sentiment analyzer and analyze tags
sentiment_analyzer = SentimentAnalyzer()
sentiment_df = None
if tags is not None:
    sentiment_df = sentiment_analyzer.analyze_tags(tags)
    
    # Save sentiment data
    if not os.path.exists('models'):
        os.makedirs('models')
    with open('models/sentiment_data.pkl', 'wb') as f:
        pickle.dump(sentiment_df, f)

Loading and preprocessing data...
Data preparation complete: 9742 movies, 100836 ratings
Downloading NLTK resources...
Analyzing sentiment of movie tags...
Sentiment analysis complete for 1572 movies

Top 5 Movies with Positive Sentiment:
- Casablanca (1942) (sentiment: 0.78)
- People vs. Larry Flynt, The (1996) (sentiment: 0.64)
- Voices of a Distant Star (Hoshi no koe) (2003) (sentiment: 0.64)
- Harold and Maude (1971) (sentiment: 0.56)
- Corrina, Corrina (1994) (sentiment: 0.56)

Top 5 Movies with Negative Sentiment:
- Children of a Lesser God (1986) (sentiment: -0.78)
- Widow of St. Pierre, The (Veuve de Saint-Pierre, La) (2000) (sentiment: -0.78)
- Strangers on a Train (1951) (sentiment: -0.69)
- Accused, The (1988) (sentiment: -0.69)
- Arsenic and Old Lace (1944) (sentiment: -0.69)
Downloading NLTK resources...
Analyzing sentiment of movie tags...


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/admin/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/admin/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Sentiment analysis complete for 1572 movies


Hybrid Recommender

In [9]:
class HybridRecommender:
    def __init__(self, cf_weight=0.6, cb_weight=0.4, sentiment_weight=0.1):
        """
        Initialize the hybrid recommender
        cf_weight: weight for collaborative filtering recommendations
        cb_weight: weight for content-based filtering recommendations
        sentiment_weight: weight for sentiment analysis adjustment
        """
        self.cf_weight = cf_weight
        self.cb_weight = cb_weight
        self.sentiment_weight = sentiment_weight
        
        self.movies_df = None
        self.cf_model = None
        self.cb_model = None
        self.sentiment_df = None
        
    def load_data(self, movies_df, cf_model, cb_model, sentiment_df=None):
        """
        Load all necessary data and models
        """
        self.movies_df = movies_df
        self.cf_model = cf_model
        self.cb_model = cb_model
        self.sentiment_df = sentiment_df
        
    def get_recommendations(self, user_id, genre_preferences=None, n=5):
        """
        Get hybrid recommendations for a user
        user_id: user ID for collaborative filtering
        genre_preferences: dict of genres and their weights for content-based filtering
        n: number of recommendations to return
        """
        if self.movies_df is None or self.cf_model is None or self.cb_model is None:
            raise ValueError("Data and models must be loaded first")
            
        # Get list of all movie IDs
        all_movie_ids = self.movies_df['movieId'].values
        
        # Get collaborative filtering recommendations
        cf_recommendations = self.cf_model.get_top_n_recommendations(
            user_id, all_movie_ids, n=n*2)
        
        # Get content-based recommendations if genre preferences are provided
        if genre_preferences:
            cb_recommendations = self.cb_model.get_recommendations_by_genres(
                genre_preferences, n=n*2)
        else:
            # Use the user's highest-rated movies to get content-based recs
            # This requires access to ratings data
            # For simplicity, use the same as CF recommendations
            cb_recommendations = cf_recommendations
        
        # Combine recommendations with weights
        movie_scores = {}
        
        # Add collaborative filtering scores
        for i, movie_id in enumerate(cf_recommendations):
            score = self.cf_weight * (1.0 - i/(n*2))
            if movie_id in movie_scores:
                movie_scores[movie_id] += score
            else:
                movie_scores[movie_id] = score
                
        # Add content-based filtering scores
        for i, movie_id in enumerate(cb_recommendations):
            score = self.cb_weight * (1.0 - i/(n*2))
            if movie_id in movie_scores:
                movie_scores[movie_id] += score
            else:
                movie_scores[movie_id] = score
                
        # Apply sentiment adjustment if available
        if self.sentiment_df is not None and self.sentiment_weight > 0:
            for movie_id in list(movie_scores.keys()):
                sentiment_info = self.sentiment_df[self.sentiment_df['movieId'] == movie_id]
                if not sentiment_info.empty:
                    sentiment_score = sentiment_info.iloc[0]['sentiment_score']
                    # Apply sentiment adjustment
                    movie_scores[movie_id] += self.sentiment_weight * sentiment_score
        
        # Sort movies by score (descending) and get top N
        sorted_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)[:n]
        
        # Get the final list of movie IDs
        recommended_movie_ids = [movie_id for movie_id, _ in sorted_movies]
        
        # Get movie details for the recommended movies
        recommendations = []
        for movie_id in recommended_movie_ids:
            movie_info = self.movies_df[self.movies_df['movieId'] == movie_id].iloc[0]
            recommendations.append({
                'movieId': movie_id,
                'title': movie_info['title'],
                'genres': movie_info['genres'],
                'year': movie_info['year'] if 'year' in movie_info else None,
                'avg_rating': movie_info['avg_rating'] if 'avg_rating' in movie_info else None,
            })
            
        return recommendations

if __name__ == "__main__":
    import os
    import pickle
    from data_preparation import prepare_data
    from collaborative_filtering import CollaborativeFiltering
    from content_based_filtering import ContentBasedFiltering
    from sentiment_analysis import SentimentAnalyzer
    
    # Load and prepare data
    movies, ratings, tags, genre_df, train_data, test_data = prepare_data()
    
    # Load or train collaborative filtering model
    cf_model_path = 'models/cf_model.pkl'
    if os.path.exists(cf_model_path):
        cf_model = CollaborativeFiltering()
        cf_model.load_model(cf_model_path)
    else:
        cf_model = CollaborativeFiltering(train_data)
        cf_model.train()
        cf_model.save_model()
        
    # Load or train content-based filtering model
    cb_model_path = 'models/cb_model.pkl'
    if os.path.exists(cb_model_path):
        cb_model = ContentBasedFiltering(movies, genre_df)
        cb_model.load_model(cb_model_path)
    else:
        cb_model = ContentBasedFiltering(movies, genre_df)
        cb_model.train()
        cb_model.save_model()
        
    # Generate sentiment data if tags are available
    sentiment_df = None
    if tags is not None:
        sentiment_analyzer = SentimentAnalyzer()
        sentiment_df = sentiment_analyzer.analyze_tags(tags)
        
    # Create hybrid recommender
    hybrid_recommender = HybridRecommender(
        cf_weight=0.6, 
        cb_weight=0.4, 
        sentiment_weight=0.2 if sentiment_df is not None else 0
    )
    
    hybrid_recommender.load_data(movies, cf_model, cb_model, sentiment_df)
    
    # Test with a sample user and genre preferences
    sample_user_id = 1
    sample_genre_prefs = {
        'Action': 0.8,
        'Adventure': 0.6,
        'Sci-Fi': 0.7
    }
    
    recommendations = hybrid_recommender.get_recommendations(
        sample_user_id, 
        genre_preferences=sample_genre_prefs, 
        n=5
    )
    
    print(f"\nHybrid Recommendations for User {sample_user_id}:")
    for i, rec in enumerate(recommendations):
        print(f"{i+1}. {rec['title']} ({rec['year']}) - {rec['genres']}")

# Create hybrid recommender
hybrid_recommender = HybridRecommender(
    cf_weight=0.6, 
    cb_weight=0.4, 
    sentiment_weight=0.2 if sentiment_df is not None else 0
)

hybrid_recommender.load_data(movies, cf_model, cb_model, sentiment_df)

# Test with a sample user and genre preferences
sample_user_id = 1
sample_genre_prefs = {
    'Action': 0.8,
    'Adventure': 0.6,
    'Sci-Fi': 0.7
}

recommendations = hybrid_recommender.get_recommendations(
    sample_user_id, 
    genre_preferences=sample_genre_prefs, 
    n=10
)

print(f"Hybrid Recommendations for User {sample_user_id} with Genre Preferences:")
for i, rec in enumerate(recommendations):
    print(f"{i+1}. {rec['title']} ({rec['year']}) - {rec['genres']}")

Loading and preprocessing data...
Data preparation complete: 9742 movies, 100836 ratings
Model loaded from models/cf_model.pkl
Model loaded from models/cb_model.pkl
Downloading NLTK resources...
Analyzing sentiment of movie tags...


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/admin/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Sentiment analysis complete for 1572 movies

Hybrid Recommendations for User 1:
1. Shaft (1971) (1971) - Action,Crime,Drama,Thriller
2. Best Men (1997) (1997) - Action,Comedy,Crime,Drama
3. Full Metal Jacket (1987) (1987) - Drama,War
4. Indiana Jones and the Last Crusade (1989) (1989) - Action,Adventure
5. 3 dev adam (Three Giant Men) (1973)  (1973) - Action,Adventure,Sci-Fi
Hybrid Recommendations for User 1 with Genre Preferences:
1. Shaft (1971) (1971) - Action,Crime,Drama,Thriller
2. Best Men (1997) (1997) - Action,Comedy,Crime,Drama
3. Full Metal Jacket (1987) (1987) - Drama,War
4. Indiana Jones and the Last Crusade (1989) (1989) - Action,Adventure
5. Highlander (1986) (1986) - Action,Adventure,Fantasy
6. Face/Off (1997) (1997) - Action,Crime,Drama,Thriller
7. 3 dev adam (Three Giant Men) (1973)  (1973) - Action,Adventure,Sci-Fi
8. Indiana Jones and the Temple of Doom (1984) (1984) - Action,Adventure,Fantasy
9. Double Dragon (1994) (1994) - Action,Adventure,Sci-Fi
10. Star Wars: Ep

Streamlit app

In [None]:
import streamlit as st
import pickle
from data_preparation import prepare_data
from collaborative_filtering import CollaborativeFiltering
from content_based_filtering import ContentBasedFiltering
from sentiment_analysis import SentimentAnalyzer
from hybrid_recommender import HybridRecommender

# Set page title and config
st.set_page_config(
    page_title="Movie Recommender System",
    page_icon="🎬",
    layout="wide"
)

@st.cache_resource
def load_data_and_models():
    """
    Load data and models (cached for better performance)
    """
    # Load and prepare data
    movies, ratings, tags, genre_df, train_data, test_data = prepare_data()
    
    # Load or train collaborative filtering model
    cf_model_path = 'models/cf_model.pkl'
    if os.path.exists(cf_model_path):
        cf_model = CollaborativeFiltering()
        cf_model.load_model(cf_model_path)
    else:
        cf_model = CollaborativeFiltering(train_data)
        cf_model.train()
        cf_model.save_model()
        
    # Load or train content-based filtering model
    cb_model_path = 'models/cb_model.pkl'
    if os.path.exists(cb_model_path):
        cb_model = ContentBasedFiltering(movies, genre_df)
        cb_model.load_model(cb_model_path)
    else:
        cb_model = ContentBasedFiltering(movies, genre_df)
        cb_model.train()
        cb_model.save_model()
        
    # Generate sentiment data if tags are available
    sentiment_df = None
    if tags is not None:
        sentiment_analyzer = SentimentAnalyzer()
        sentiment_df = sentiment_analyzer.analyze_tags(tags)
    
    # Extract list of genres
    all_genres = []
    for genres in movies['genres'].str.split(','):
        if isinstance(genres, list):
            all_genres.extend(genres)
    unique_genres = sorted(list(set(all_genres)))
    if '(no genres listed)' in unique_genres:
        unique_genres.remove('(no genres listed)')
    
    return movies, ratings, tags, genre_df, cf_model, cb_model, sentiment_df, unique_genres

def main():
    # Title and description
    st.title("🎬 Movie Recommendation System")
    st.write("Get personalized movie recommendations based on your preferences")
    
    # Load data and models
    with st.spinner("Loading data and models..."):
        movies, ratings, tags, genre_df, cf_model, cb_model, sentiment_df, unique_genres = load_data_and_models()
    
    # Create hybrid recommender
    hybrid_recommender = HybridRecommender(
        cf_weight=0.6, 
        cb_weight=0.4, 
        sentiment_weight=0.2 if sentiment_df is not None else 0
    )
    
    hybrid_recommender.load_data(movies, cf_model, cb_model, sentiment_df)
    
    # Sidebar for user input
    st.sidebar.header("Your Preferences")
    
    # User selection
    st.sidebar.subheader("User ID")
    user_id = st.sidebar.number_input("Enter your user ID", min_value=1, max_value=int(ratings['userId'].max()), value=1)
    
    # Genre preferences
    st.sidebar.subheader("Genre Preferences")
    st.sidebar.write("Select your preferred genres and rate them from 0 to 1")
    
    genre_preferences = {}
    for genre in unique_genres:
        if st.sidebar.checkbox(genre, key=f"genre_{genre}"):
            weight = st.sidebar.slider(f"{genre} preference", 0.0, 1.0, 0.5, 0.1, key=f"weight_{genre}")
            genre_preferences[genre] = weight
    
    # Model weight adjustment
    st.sidebar.subheader("Model Weights")
    cf_weight = st.sidebar.slider("Collaborative Filtering Weight", 0.0, 1.0, 0.6, 0.1)
    cb_weight = st.sidebar.slider("Content-Based Filtering Weight", 0.0, 1.0, 0.4, 0.1)
    sentiment_weight = st.sidebar.slider("Sentiment Weight", 0.0, 1.0, 0.2, 0.1) if sentiment_df is not None else 0.0
    
    # Update model weights
    hybrid_recommender.cf_weight = cf_weight
    hybrid_recommender.cb_weight = cb_weight
    hybrid_recommender.sentiment_weight = sentiment_weight
    
    # Number of recommendations
    num_recommendations = st.sidebar.slider("Number of Recommendations", 1, 20, 5)
    
    # Submit button
    if st.sidebar.button("Get Recommendations"):
        with st.spinner("Generating recommendations..."):
            # Get recommendations
            recommendations = hybrid_recommender.get_recommendations(
                user_id, 
                genre_preferences=genre_preferences if genre_preferences else None, 
                n=num_recommendations
            )
            
            # Display recommendations
            st.subheader("Your Personalized Movie Recommendations")
            
            # Create columns for recommendations
            cols = st.columns(min(5, num_recommendations))
            
            for i, rec in enumerate(recommendations):
                col_idx = i % len(cols)
                with cols[col_idx]:
                    st.write(f"**{i+1}. {rec['title']}**")
                    st.write(f"Genres: {rec['genres']}")
                    if 'year' in rec and rec['year'] > 0:
                        st.write(f"Year: {rec['year']}")
                    if 'avg_rating' in rec and rec['avg_rating'] > 0:
                        st.write(f"Average Rating: {rec['avg_rating']:.1f}/5.0")
                    st.write("---")
    
    # Display some popular movies if no recommendations generated yet
    else:
        st.subheader("Some Popular Movies")
        popular_movies = movies.sort_values('rating_count', ascending=False).head(10)
        
        cols = st.columns(5)
        for i, (_, movie) in enumerate(popular_movies.iterrows()):
            col_idx = i % 5
            with cols[col_idx]:
                st.write(f"**{movie['title']}**")
                st.write(f"Genres: {movie['genres']}")
                if 'year' in movie and movie['year'] > 0:
                    st.write(f"Year: {movie['year']}")
                if 'avg_rating' in movie and movie['avg_rating'] > 0:
                    st.write(f"Average Rating: {movie['avg_rating']:.1f}/5.0")
                if 'rating_count' in movie:
                    st.write(f"Ratings: {movie['rating_count']}")
                st.write("---")

if __name__ == "__main__":
    main()
z
# Export the Streamlit app to a file
with open('streamlit_app.py', 'w') as f:
    f.write('''
import streamlit as st
import pandas as pd
import numpy as np
import os
import pickle
from data_preparation import prepare_data
from collaborative_filtering import CollaborativeFiltering
from content_based_filtering import ContentBasedFiltering
from sentiment_analysis import SentimentAnalyzer
from hybrid_recommender import HybridRecommender

# Set page title and config
st.set_page_config(
    page_title="Movie Recommender System",
    layout="wide"
)

@st.cache_resource
def load_data_and_models():
    """
    Load data and models (cached for better performance)
    """
    # Load and prepare data
    movies, ratings, tags, genre_df, train_data, test_data = prepare_data()
    
    # Load or train collaborative filtering model
    cf_model_path = 'models/cf_model.pkl'
    if os.path.exists(cf_model_path):
        cf_model = CollaborativeFiltering()
        cf_model.load_model(cf_model_path)
    else:
        cf_model = CollaborativeFiltering(train_data)
        cf_model.train()
        cf_model.save_model()
        
    # Load or train content-based filtering model
    cb_model_path = 'models/cb_model.pkl'
    if os.path.exists(cb_model_path):
        cb_model = ContentBasedFiltering(movies, genre_df)
        cb_model.load_model(cb_model_path)
    else:
        cb_model = ContentBasedFiltering(movies, genre_df)
        cb_model.train()
        cb_model.save_model()
        
    # Generate sentiment data if tags are available
    sentiment_df = None
    if tags is not None:
        sentiment_analyzer = SentimentAnalyzer()
        sentiment_df = sentiment_analyzer.analyze_tags(tags)
    
    # Extract list of genres
    all_genres = []
    for genres in movies['genres'].str.split(','):
        if isinstance(genres, list):
            all_genres.extend(genres)
    unique_genres = sorted(list(set(all_genres)))
    if '(no genres listed)' in unique_genres:
        unique_genres.remove('(no genres listed)')
    
    return movies, ratings, tags, genre_df, cf_model, cb_model, sentiment_df, unique_genres

def main():
    # Title and description
    st.title("🎬 Movie Recommendation System")
    st.write("Get personalized movie recommendations based on your preferences")
    
    # Load data and models
    with st.spinner("Loading data and models..."):
        movies, ratings, tags, genre_df, cf_model, cb_model, sentiment_df, unique_genres = load_data_and_models()
    
    # Create hybrid recommender
    hybrid_recommender = HybridRecommender(
        cf_weight=0.6, 
        cb_weight=0.4, 
        sentiment_weight=0.2 if sentiment_df is not None else 0
    )
    
    hybrid_recommender.load_data(movies, cf_model, cb_model, sentiment_df)
    
    # Sidebar for user input
    st.sidebar.header("Your Preferences")
    
    # User selection
    st.sidebar.subheader("User ID")
    user_id = st.sidebar.number_input("Enter your user ID", min_value=1, max_value=int(ratings['userId'].max()), value=1)
    
    # Genre preferences
    st.sidebar.subheader("Genre Preferences")
    st.sidebar.write("Select your preferred genres and rate them from 0 to 1")
    
    genre_preferences = {}
    for genre in unique_genres:
        if st.sidebar.checkbox(genre, key=f"genre_{genre}"):
            weight = st.sidebar.slider(f"{genre} preference", 0.0, 1.0, 0.5, 0.1, key=f"weight_{genre}")
            genre_preferences[genre] = weight
    
    # Model weight adjustment
    st.sidebar.subheader("Model Weights")
    cf_weight = st.sidebar.slider("Collaborative Filtering Weight", 0.0, 1.0, 0.6, 0.1)
    cb_weight = st.sidebar.slider("Content-Based Filtering Weight", 0.0, 1.0, 0.4, 0.1)
    sentiment_weight = st.sidebar.slider("Sentiment Weight", 0.0, 1.0, 0.2, 0.1) if sentiment_df is not None else 0.0
    
    # Update model weights
    hybrid_recommender.cf_weight = cf_weight
    hybrid_recommender.cb_weight = cb_weight
    hybrid_recommender.sentiment_weight = sentiment_weight
    
    # Number of recommendations
    num_recommendations = st.sidebar.slider("Number of Recommendations", 1, 20, 5)
    
    # Submit button
    if st.sidebar.button("Get Recommendations"):
        with st.spinner("Generating recommendations..."):
            # Get recommendations
            recommendations = hybrid_recommender.get_recommendations(
                user_id, 
                genre_preferences=genre_preferences if genre_preferences else None, 
                n=num_recommendations
            )
            
            # Display recommendations
            st.subheader("Your Personalized Movie Recommendations")
            
            # Create columns for recommendations
            cols = st.columns(min(5, num_recommendations))
            
            for i, rec in enumerate(recommendations):
                col_idx = i % len(cols)
                with cols[col_idx]:
                    st.write(f"**{i+1}. {rec['title']}**")
                    st.write(f"Genres: {rec['genres']}")
                    if 'year' in rec and rec['year'] > 0:
                        st.write(f"Year: {rec['year']}")
                    if 'avg_rating' in rec and rec['avg_rating'] > 0:
                        st.write(f"Average Rating: {rec['avg_rating']:.1f}/5.0")
                    st.write("---")
    
    # Display some popular movies if no recommendations generated yet
    else:
        st.subheader("Some Popular Movies")
        popular_movies = movies.sort_values('rating_count', ascending=False).head(10)
        
        cols = st.columns(5)
        for i, (_, movie) in enumerate(popular_movies.iterrows()):
            col_idx = i % 5
            with cols[col_idx]:
                st.write(f"**{movie['title']}**")
                st.write(f"Genres: {movie['genres']}")
                if 'year' in movie and movie['year'] > 0:
                    st.write(f"Year: {movie['year']}")
                if 'avg_rating' in movie and movie['avg_rating'] > 0:
                    st.write(f"Average Rating: {movie['avg_rating']:.1f}/5.0")
                if 'rating_count' in movie:
                    st.write(f"Ratings: {movie['rating_count']}")
                st.write("---")

if __name__ == "__main__":
    main()
''')

print("Streamlit app exported to streamlit_app.py")
print("To run the app, execute: streamlit run streamlit_app.py")



Streamlit app exported to streamlit_app.py
To run the app, execute: streamlit run streamlit_app.py
