In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [102]:
movies = pd.read_csv(r"C:\Users\Surface Laptop 3\OneDrive\Documents\DataScience\Alogorithm movie_recommender\ml-latest-small\movies.csv")

In [103]:
print(movies.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None


In [104]:
#pick_movies('drama',2000,1)

In [105]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import re
from collections import Counter

class MovieDataProcessor:
    """Processes movie data into features for recommendation"""
    
    def __init__(self, movies_df):
        self.movies = movies_df.copy()
        self._prepare_data()
    
    def _prepare_data(self):
        """Extract and prepare all features"""
        # 1. Extract year from title
        self.movies['year'] = self.movies['title'].apply(self._extract_year)
        
        # 2. Extract clean title (without year)
        self.movies['title_clean'] = self.movies['title'].apply(self._clean_title)
        
        # 3. Create genre matrix (one-hot encoding)
        self.genre_matrix = self.movies['genres'].str.get_dummies(sep='|')
        
        # 4. Create decade feature (1970s, 1980s, etc.)
        self.movies['decade'] = (self.movies['year'] // 10) * 10
        
        # 5. Get all unique genres
        self.all_genres = list(self.genre_matrix.columns)
        
        # 6. Calculate movie popularity (from ratings - if available)
        self._add_popularity()
    
    def _extract_year(self, title):
        """Extract year from title like 'Movie Name (1995)'"""
        match = re.search(r'\((\d{4})\)', title)
        return int(match.group(1)) if match else 1900
    
    def _clean_title(self, title):
        """Remove year from title"""
        return re.sub(r'\s*\(\d{4}\)\s*$', '', title)
    
    def _add_popularity(self):
        """Add popularity score if ratings data available"""
        # Placeholder - we'll enhance this if you have ratings
        self.movies['popularity'] = 1.0  # Default
    
    def get_movie_vector(self, movie_id):
        """Get complete feature vector for a movie"""
        if movie_id not in self.movies['movieId'].values:
            return None
        
        # Get genre vector
        idx = self.movies[self.movies['movieId'] == movie_id].index[0]
        genre_vec = self.genre_matrix.iloc[idx].values
        
        # Normalize year to 0-1 scale
        year_min = self.movies['year'].min()
        year_max = self.movies['year'].max()
        year = self.movies.loc[idx, 'year']
        year_norm = (year - year_min) / (year_max - year_min) if year_max > year_min else 0.5
        
        # Combine features
        return np.concatenate([genre_vec, [year_norm]])
    
    def find_movie_by_title(self, title_query, exact_match=False):
        """Find movie ID by title (fuzzy matching)"""
        if exact_match:
            matches = self.movies[self.movies['title'] == title_query]
        else:
            # Case-insensitive partial match
            matches = self.movies[
                self.movies['title'].str.contains(title_query, case=False, na=False)
            ]
        
        if len(matches) == 0:
            # Try cleaned title
            matches = self.movies[
                self.movies['title_clean'].str.contains(title_query, case=False, na=False)
            ]
        
        return matches[['movieId', 'title', 'year', 'genres']] if not matches.empty else None

In [106]:
class UserProfile:
    """Represents a user's movie preferences"""
    
    def __init__(self, data_processor):
        self.data_processor = data_processor
        self.liked_movies = []  # List of (movie_id, weight)
        self.disliked_movies = []  # List of (movie_id, weight)
        self.watched_movies = set()  # All movies user has interacted with
        
    def add_preference(self, movie_id, liked=True, weight=1.0, rating=None):
        """Add a movie to user's preferences"""
        if rating is not None:
            # Convert 1-5 rating to weight (1-2: dislike, 4-5: like)
            weight = (rating - 3) / 2  # Maps 1‚Üí-1, 3‚Üí0, 5‚Üí1
        
        if liked:
            self.liked_movies.append((movie_id, weight))
        else:
            self.disliked_movies.append((movie_id, weight))
        
        self.watched_movies.add(movie_id)
    
    def get_profile_vector(self):
        """Calculate user's preference vector"""
        if not self.liked_movies and not self.disliked_movies:
            return None
        
        total_weight = 0
        profile_vector = None
        
        # Process liked movies
        for movie_id, weight in self.liked_movies:
            movie_vec = self.data_processor.get_movie_vector(movie_id)
            if movie_vec is not None:
                if profile_vector is None:
                    profile_vector = np.zeros_like(movie_vec)
                profile_vector += movie_vec * weight
                total_weight += abs(weight)
        
        # Process disliked movies (subtract from profile)
        for movie_id, weight in self.disliked_movies:
            movie_vec = self.data_processor.get_movie_vector(movie_id)
            if movie_vec is not None:
                if profile_vector is None:
                    profile_vector = np.zeros_like(movie_vec)
                profile_vector -= movie_vec * weight
                total_weight += abs(weight)
        
        if profile_vector is not None and total_weight > 0:
            profile_vector = profile_vector / total_weight
        
        return profile_vector
    
    def get_preferred_genres(self, top_n=5):
        """Get user's favorite genres based on liked movies"""
        genre_counter = Counter()
        
        for movie_id, weight in self.liked_movies:
            movie_idx = self.data_processor.movies[
                self.data_processor.movies['movieId'] == movie_id
            ].index[0]
            
            # Get genres for this movie
            genres = self.data_processor.movies.loc[movie_idx, 'genres'].split('|')
            for genre in genres:
                genre_counter[genre] += weight
        
        return [genre for genre, _ in genre_counter.most_common(top_n)]
    
    def get_preferred_decades(self):
        """Get user's preferred movie decades"""
        decade_counter = Counter()
        
        for movie_id, weight in self.liked_movies:
            movie_idx = self.data_processor.movies[
                self.data_processor.movies['movieId'] == movie_id
            ].index[0]
            
            decade = self.data_processor.movies.loc[movie_idx, 'decade']
            decade_counter[decade] += weight
        
        return decade_counter

In [107]:
class MovieRecommender:
    """Main recommendation engine with multiple strategies"""
    
    def __init__(self, data_processor):
        self.dp = data_processor
        self._precompute_similarities()
    
    def _precompute_similarities(self):
        """Precompute movie-to-movie similarities for speed"""
        print("Precomputing similarity matrix...")
        
        # Create enhanced feature matrix (genres + year)
        n_movies = len(self.dp.movies)
        n_genres = len(self.dp.all_genres)
        
        # Feature matrix: genres + normalized year
        feature_matrix = np.zeros((n_movies, n_genres + 1))
        
        for i in range(n_movies):
            # Genre features
            feature_matrix[i, :n_genres] = self.dp.genre_matrix.iloc[i].values
            
            # Year feature (normalized 0-1)
            year = self.dp.movies.iloc[i]['year']
            year_min = self.dp.movies['year'].min()
            year_max = self.dp.movies['year'].max()
            feature_matrix[i, n_genres] = (year - year_min) / (year_max - year_min) if year_max > year_min else 0.5
        
        # Compute cosine similarity
        self.similarity_matrix = cosine_similarity(feature_matrix)
        self.feature_matrix = feature_matrix
        
        print(f"Similarity matrix shape: {self.similarity_matrix.shape}")
    
    def recommend_for_user(self, user_profile, top_n=20, 
                           diversity_factor=0.3, exclude_watched=True):
        """
        Generate recommendations for a user profile
        
        Parameters:
        - user_profile: UserProfile object
        - top_n: Number of recommendations
        - diversity_factor: 0-1, higher = more diverse recommendations
        - exclude_watched: Remove movies user has already rated
        """
        # Get user's preference vector
        user_vector = user_profile.get_profile_vector()
        
        if user_vector is None:
            # Cold-start: user has no preferences
            return self._cold_start_recommendations(top_n)
        
        # 1. Calculate similarity between user vector and all movies
        similarities = cosine_similarity([user_vector], self.feature_matrix)[0]
        
        # 2. Apply adjustments
        adjusted_scores = self._adjust_scores(
            similarities, user_profile, diversity_factor
        )
        
        # 3. Get top recommendations
        recommendations = self._get_top_recommendations(
            adjusted_scores, user_profile, top_n, exclude_watched
        )
        
        return recommendations
    
    def _adjust_scores(self, similarities, user_profile, diversity_factor):
        """Adjust similarity scores with various factors"""
        adjusted = similarities.copy()
        
        # Factor 1: Year preference (users often prefer certain decades)
        decade_prefs = user_profile.get_preferred_decades()
        if decade_prefs:
            for i, movie_idx in enumerate(self.dp.movies.index):
                movie_decade = self.dp.movies.loc[movie_idx, 'decade']
                decade_score = decade_prefs.get(movie_decade, 0) / max(decade_prefs.values())
                adjusted[i] += 0.1 * decade_score  # Small bonus
        
        # Factor 2: Diversity (avoid recommending only very similar movies)
        if diversity_factor > 0:
            adjusted = self._apply_diversity(adjusted, diversity_factor)
        
        return adjusted
    
    def _apply_diversity(self, scores, diversity_factor):
        """Promote diversity in recommendations"""
        # Simple approach: penalize movies too similar to each other
        n_movies = len(scores)
        diversified = scores.copy()
        
        # Sort movies by original score
        sorted_indices = np.argsort(scores)[::-1]
        
        for rank, idx in enumerate(sorted_indices):
            if rank > 0:
                # Find similarity to higher-ranked movies
                max_similarity = 0
                for higher_idx in sorted_indices[:rank]:
                    sim = self.similarity_matrix[idx, higher_idx]
                    max_similarity = max(max_similarity, sim)
                
                # Penalize based on similarity to already-selected movies
                penalty = max_similarity * diversity_factor
                diversified[idx] *= (1 - penalty)
        
        return diversified
    
    def _get_top_recommendations(self, scores, user_profile, top_n, exclude_watched):
        """Get top movies with explanations"""
        # Create DataFrame with scores
        result_df = self.dp.movies.copy()
        result_df['score'] = scores
        
        # Exclude watched movies if requested
        if exclude_watched and user_profile.watched_movies:
            result_df = result_df[~result_df['movieId'].isin(user_profile.watched_movies)]
        
        # Sort by score and get top N
        result_df = result_df.sort_values('score', ascending=False).head(top_n)
        
        # Add explanation for each recommendation
        result_df['explanation'] = result_df.apply(
            lambda row: self._generate_explanation(row, user_profile), 
            axis=1
        )
        
        return result_df[['movieId', 'title', 'year', 'genres', 'score', 'explanation']]
    
    def _generate_explanation(self, movie_row, user_profile):
        """Generate human-readable explanation for why movie is recommended"""
        explanations = []
        
        # Get movie genres
        movie_genres = set(movie_row['genres'].split('|'))
        
        # Compare with user's preferred genres
        preferred_genres = user_profile.get_preferred_genres()
        common_genres = movie_genres.intersection(set(preferred_genres))
        
        if common_genres:
            explanations.append(f"Matches your preferred genres: {', '.join(common_genres)}")
        
        # Check decade preference
        movie_decade = (movie_row['year'] // 10) * 10
        decade_prefs = user_profile.get_preferred_decades()
        
        if decade_prefs and movie_decade in decade_prefs:
            explanations.append(f"From your preferred {movie_decade}s era")
        
        # Similar to specific liked movies
        for liked_id, weight in user_profile.liked_movies[:3]:  # Top 3 liked
            liked_idx = self.dp.movies[self.dp.movies['movieId'] == liked_id].index[0]
            movie_idx = self.dp.movies[self.dp.movies['movieId'] == movie_row['movieId']].index[0]
            
            similarity = self.similarity_matrix[movie_idx, liked_idx]
            if similarity > 0.7:
                liked_title = self.dp.movies.loc[liked_idx, 'title_clean']
                explanations.append(f"Similar to '{liked_title}' ({(similarity*100):.0f}% match)")
                break
        
        return " | ".join(explanations) if explanations else "General recommendation"
    
    def _cold_start_recommendations(self, top_n):
        """Recommendations for new users (no preferences yet)"""
        # Popular movies by genre distribution
        result_df = self.dp.movies.copy()
        
        # Simple popularity heuristic: movies with more genres are often more mainstream
        result_df['genre_count'] = result_df['genres'].apply(lambda x: len(x.split('|')))
        
        # Recent movies get a bonus
        current_year = 2024  # Update as needed
        result_df['year_score'] = 1.0 / (1 + abs(result_df['year'] - current_year) / 10)
        
        result_df['score'] = result_df['genre_count'] * result_df['year_score']
        
        return result_df.sort_values('score', ascending=False).head(top_n)[
            ['movieId', 'title', 'year', 'genres']
        ]
    
    def find_similar_movies(self, movie_id, top_n=10):
        """Find movies similar to a specific movie"""
        if movie_id not in self.dp.movies['movieId'].values:
            return None
        
        movie_idx = self.dp.movies[self.dp.movies['movieId'] == movie_id].index[0]
        
        # Get similarities for this movie
        similarities = self.similarity_matrix[movie_idx]
        
        # Sort and get top N (excluding the movie itself)
        similar_indices = np.argsort(similarities)[::-1][1:top_n+1]
        
        result = self.dp.movies.iloc[similar_indices].copy()
        result['similarity'] = similarities[similar_indices]
        
        return result[['movieId', 'title', 'year', 'genres', 'similarity']]

In [108]:
import streamlit as st
import pandas as pd

class MovieRecommenderApp:
    """Interactive web interface for the recommender"""
    
    def __init__(self, recommender):
        self.recommender = recommender
        self.dp = recommender.dp
        
    def run(self):
        st.set_page_config(page_title="MovieMatch Recommender", layout="wide")
        
        st.title("üé¨ MovieMatch: Intelligent Movie Recommendations")
        st.markdown("---")
        
        # Sidebar for user preferences
        with st.sidebar:
            st.header("Your Preferences")
            
            # Option 1: Quick start with popular movies
            if st.button("Quick Start: Rate Popular Movies"):
                self.quick_start_mode()
            
            # Option 2: Manual movie selection
            st.subheader("Rate Movies You've Seen")
            
            # Movie search
            search_query = st.text_input("Search for a movie:")
            if search_query:
                matches = self.dp.find_movie_by_title(search_query)
                if matches is not None:
                    selected_movie = st.selectbox(
                        "Select movie:", 
                        matches['title'].tolist()
                    )
                    
                    col1, col2 = st.columns(2)
                    with col1:
                        if st.button("üëç Liked"):
                            self.add_preference(selected_movie, liked=True)
                    with col2:
                        if st.button("üëé Disliked"):
                            self.add_preference(selected_movie, liked=False)
        
        # Main content area
        tab1, tab2, tab3 = st.tabs(["Recommendations", "Similar Movies", "Explore"])
        
        with tab1:
            self.show_recommendations()
        
        with tab2:
            self.show_similar_movies()
        
        with tab3:
            self.show_exploration()
    
    def quick_start_mode(self):
        """Show popular movies for quick rating"""
        popular = self.recommender._cold_start_recommendations(20)
        
        st.subheader("Rate these popular movies:")
        
        for _, movie in popular.iterrows():
            cols = st.columns([3, 1, 1])
            cols[0].write(f"**{movie['title']}** ({movie['year']})")
            cols[1].button("üëç", key=f"like_{movie['movieId']}")
            cols[2].button("üëé", key=f"dislike_{movie['movieId']}")
    
    def add_preference(self, movie_title, liked=True):
        """Add a movie preference (simplified - in real app would use session state)"""
        matches = self.dp.find_movie_by_title(movie_title, exact_match=True)
        if matches is not None:
            movie_id = matches.iloc[0]['movieId']
            # In real implementation, store in user profile
            st.success(f"Added {'liked' if liked else 'disliked'} preference for {movie_title}")
    
    def show_recommendations(self):
        """Display personalized recommendations"""
        st.header("Personalized Recommendations")
        
        # Create a sample user profile for demonstration
        sample_user = UserProfile(self.dp)
        
        # Add some sample preferences (in real app, from user input)
        sample_preferences = [
            ("Toy Story", 5),  # (title, rating 1-5)
            ("The Godfather", 4),
            ("Pulp Fiction", 5),
            ("Titanic", 2),  # Low rating = dislike
        ]
        
        for title, rating in sample_preferences:
            matches = self.dp.find_movie_by_title(title)
            if matches is not None:
                movie_id = matches.iloc[0]['movieId']
                sample_user.add_preference(movie_id, rating=rating)
        
        # Get recommendations
        with st.spinner("Finding your perfect movies..."):
            recommendations = self.recommender.recommend_for_user(
                sample_user, top_n=15, diversity_factor=0.2
            )
        
        # Display recommendations
        st.subheader(f"Top {len(recommendations)} Recommendations For You")
        
        for idx, row in recommendations.iterrows():
            with st.expander(f"**{row['title']}** ({row['year']}) | Score: {row['score']:.3f}"):
                st.write(f"**Genres:** {row['genres']}")
                st.write(f"**Why recommended:** {row['explanation']}")
                
                # Similar movies preview
                similar = self.recommender.find_similar_movies(row['movieId'], 3)
                if similar is not None:
                    st.write("**Similar movies:**")
                    for _, sim_movie in similar.iterrows():
                        st.write(f"- {sim_movie['title']} ({(sim_movie['similarity']*100):.0f}% similar)")
    
    def show_similar_movies(self):
        """Find movies similar to a selected movie"""
        st.header("Find Similar Movies")
        
        # Movie selection
        movie_titles = self.dp.movies['title'].tolist()
        selected_movie = st.selectbox("Choose a movie:", movie_titles)
        
        if selected_movie:
            matches = self.dp.find_movie_by_title(selected_movie, exact_match=True)
            if matches is not None:
                movie_id = matches.iloc[0]['movieId']
                
                # Number of recommendations
                n_recs = st.slider("Number of similar movies:", 5, 30, 10)
                
                # Get similar movies
                similar = self.recommender.find_similar_movies(movie_id, n_recs)
                
                if similar is not None:
                    st.subheader(f"Movies similar to '{selected_movie}':")
                    
                    # Display as cards
                    cols = st.columns(3)
                    for idx, (_, movie) in enumerate(similar.iterrows()):
                        with cols[idx % 3]:
                            st.markdown(f"""
                            <div style='padding: 10px; border: 1px solid #ddd; border-radius: 5px; margin: 5px;'>
                                <strong>{movie['title']}</strong><br>
                                <small>{movie['year']} | {movie['genres']}</small><br>
                                <small>Similarity: {(movie['similarity']*100):.0f}%</small>
                            </div>
                            """, unsafe_allow_html=True)
    
    def show_exploration(self):
        """Explore movies by genre and year"""
        st.header("Explore Movies")
        
        col1, col2 = st.columns(2)
        
        with col1:
            # Genre filter
            all_genres = self.dp.all_genres
            selected_genres = st.multiselect("Filter by genres:", all_genres)
        
        with col2:
            # Year range
            min_year = int(self.dp.movies['year'].min())
            max_year = int(self.dp.movies['year'].max())
            year_range = st.slider("Year range:", min_year, max_year, (1990, 2010))
        
        # Apply filters
        filtered_movies = self.dp.movies.copy()
        
        if selected_genres:
            # Filter movies that contain ALL selected genres
            for genre in selected_genres:
                filtered_movies = filtered_movies[
                    filtered_movies['genres'].str.contains(genre, case=False, na=False)
                ]
        
        # Filter by year
        filtered_movies = filtered_movies[
            (filtered_movies['year'] >= year_range[0]) & 
            (filtered_movies['year'] <= year_range[1])
        ]
        
        st.write(f"Found {len(filtered_movies)} movies matching your criteria")
        
        # Display results
        if not filtered_movies.empty:
            st.dataframe(
                filtered_movies[['title', 'year', 'genres']].head(50),
                height=400,
                use_container_width=True
            )

In [110]:
def main():
    """Main function to run the complete system"""
    print("=" * 60)
    print("MOVIE RECOMMENDER SYSTEM - Content-Based with User Profiles")
    print("=" * 60)
    
    # 1. Load data
    print("\n1. Loading data...")
    movies = pd.read_csv('movies.csv')
    print(f"   Loaded {len(movies)} movies")
    
    # 2. Process data
    print("\n2. Processing features...")
    processor = MovieDataProcessor(movies)
    
    # 3. Create recommender
    print("\n3. Building recommendation engine...")
    recommender = MovieRecommender(processor)
    
    # 4. Demo user profile
    print("\n4. Creating demo user profile...")
    user = UserProfile(processor)
    
    # Add some sample preferences
    sample_movies = [
        ("Toy Story", 5),      # Animation, Comedy
        ("The Godfather", 4),  # Crime, Drama
        ("Inception", 5),      # Action, Sci-Fi, Thriller
        ("Titanic", 2),        # Drama, Romance (disliked)
    ]
    
    for title, rating in sample_movies:
        matches = processor.find_movie_by_title(title)
        if matches is not None:
            movie_id = matches.iloc[0]['movieId']
            user.add_preference(movie_id, rating=rating)
    
    print(f"   User likes {len(user.liked_movies)} movies, dislikes {len(user.disliked_movies)}")
    print(f"   Preferred genres: {', '.join(user.get_preferred_genres(3))}")
    
    # 5. Get recommendations
    print("\n5. Generating recommendations...")
    recommendations = recommender.recommend_for_user(user, top_n=10)
    
    print("\n" + "=" * 60)
    print("TOP RECOMMENDATIONS:")
    print("=" * 60)
    
    for idx, row in recommendations.iterrows():
        print(f"\n{row['title']} ({row['year']})")
        print(f"   Score: {row['score']:.3f}")
        print(f"   Genres: {row['genres']}")
        print(f"   Why: {row['explanation']}")
    
    # 6. Test similar movies function
    print("\n" + "=" * 60)
    print("SIMILAR MOVIES TEST (Toy Story):")
    print("=" * 60)
    
    toy_story_id = processor.find_movie_by_title("Toy Story").iloc[0]['movieId']
    similar = recommender.find_similar_movies(toy_story_id, 5)
    
    for idx, row in similar.iterrows():
        print(f"{row['title']} - Similarity: {(row['similarity']*100):.0f}%")
    
    print("\n" + "=" * 60)
    print("System ready! Run the Streamlit app for interactive use.")
    print("=" * 60)
    
    # 7. Option to run Streamlit app
    run_streamlit = input("\nRun Streamlit web app? (y/n): ").lower()
    if run_streamlit == 'y':
        app = MovieRecommenderApp(recommender)
        
        # Note: In practice, you'd run this separately:
        # streamlit run app.py
        print("\nTo run the web app, save the code to app.py and run:")
        print("  streamlit run app.py")
    
    return recommender, processor

main()

MOVIE RECOMMENDER SYSTEM - Content-Based with User Profiles

1. Loading data...


FileNotFoundError: [Errno 2] No such file or directory: 'movies.csv'