In [1]:
# Movie Recommendation System
# Building a content-based and collaborative filtering recommendation engine
# Optimized for Google Colab with memory-efficient operations

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

# Display settings for better output
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("="*60)
print("MOVIE RECOMMENDATION SYSTEM")
print("="*60)

# ============================================
# STEP 1: LOAD THE DATA
# ============================================
# Dataset: Use MovieLens dataset or TMDB dataset from Kaggle
# Download links:
# - MovieLens: https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset
# - TMDB: https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata

print("\n[1/6] Loading data...")

# Load movies data
movies = pd.read_csv('/content/movie.csv')  # Contains movieId, title, genres

# Load ratings data (for collaborative filtering).
# The three numeric columns are read with 32-bit dtypes instead of pandas'
# 64-bit defaults, which halves their footprint on the multi-million-row file.
# NOTE(review): assumes ids fit in int32 and contain no missing values, which
# holds for MovieLens -- confirm if another dataset is substituted.
ratings = pd.read_csv(
    '/content/rating.csv',  # Contains userId, movieId, rating, timestamp
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)

print(f"Original Movies dataset shape: {movies.shape}")
print(f"Original Ratings dataset shape: {ratings.shape}")

# ============================================
# MEMORY OPTIMIZATION: Sample data for faster processing
# ============================================
print("\n⚡ Optimizing dataset for faster processing...")

# Use a sample of ratings for Colab efficiency
# Adjust this number based on your needs: 100000-500000 works well
SAMPLE_SIZE = 200000
if len(ratings) > SAMPLE_SIZE:
    # Uniform random sample of rating rows; the fixed seed keeps every run
    # working on the same subset.
    ratings = ratings.sample(n=SAMPLE_SIZE, random_state=42)
    print(f"✓ Sampled {SAMPLE_SIZE:,} ratings for efficient processing")
else:
    print(f"✓ Using full dataset ({len(ratings):,} ratings)")

print(f"Working with {len(ratings):,} ratings")
print("\nFirst few movies:")
print(movies.head())

# ============================================
# STEP 2: DATA EXPLORATION
# ============================================
print(f"\n{'=' * 60}")
print("[2/6] Exploring the data...")
print("=" * 60)

# Headline numbers for the two tables
print(f"\nTotal movies: {len(movies):,}")
print(f"Total ratings (sampled): {len(ratings):,}")
print(f"Total users: {ratings['userId'].nunique():,}")
print(f"Average rating: {ratings['rating'].mean():.2f}")

# Null counts per column of the movies table
print("\nMissing values in movies:")
print(movies.isnull().sum())

# How often each rating value occurs, ordered by rating value
print("\nRating distribution:")
print(ratings['rating'].value_counts().sort_index())

# Number of (sampled) ratings per movie; reused later to pick popular movies
movie_ratings_count = (
    ratings.groupby('movieId')
    .size()
    .rename('rating_count')
    .reset_index()
)

# Left join keeps every movie; movies without any sampled rating get a count of 0
movies_with_counts = pd.merge(movies, movie_ratings_count, on='movieId', how='left')
movies_with_counts['rating_count'] = movies_with_counts['rating_count'].fillna(0)

most_rated = movies_with_counts.sort_values('rating_count', ascending=False).head(10)
print("\nTop 10 Most Rated Movies:")
print(most_rated[['title', 'rating_count']])

# ============================================
# STEP 3: DATA PREPROCESSING
# ============================================
print("\n" + "="*60)
print("[3/6] Preprocessing data...")
print("="*60)

# Clean movie titles (remove year if present)
movies['clean_title'] = movies['title'].str.replace(r'\s*\(\d{4}\)', '', regex=True)

# Split genres into list
movies['genres_list'] = movies['genres'].str.split('|')

# Create a combined feature for content-based filtering.
# regex=False makes '|' a literal separator on every pandas version (as a
# regex it is the alternation operator), and fillna('') guarantees the
# vectorizer in the next step only ever receives strings.
movies['combined_features'] = (
    movies['genres'].fillna('').str.replace('|', ' ', regex=False)
)

print("Preprocessing complete!")
print("\nSample processed data:")
print(movies[['title', 'genres', 'combined_features']].head(3))

# ============================================
# STEP 4: CONTENT-BASED FILTERING
# ============================================
print("\n" + "="*60)
print("[4/6] Building Content-Based Recommendation System...")
print("="*60)

# Create TF-IDF matrix from genres.
# float32 halves the size of the dense movie-by-movie similarity matrix built
# below, which is the largest object this script allocates.
tfidf = TfidfVectorizer(stop_words='english', dtype=np.float32)
tfidf_matrix = tfidf.fit_transform(movies['combined_features'])

print(f"TF-IDF Matrix shape: {tfidf_matrix.shape}")

# Calculate cosine similarity between movies
print("Calculating cosine similarity... (this may take a moment)")
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(f"✓ Cosine Similarity Matrix shape: {cosine_sim.shape}")

# Create a mapping of movie titles to row indices.
# Series.drop_duplicates() compares *values* (the row numbers, which are all
# distinct), so it never removed anything; de-duplicate on the title index
# instead so that a title lookup always yields a single row number.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_content_based_recommendations(title, n_recommendations=10):
    """
    Get movie recommendations based on content similarity (genres).

    Parameters:
    - title: exact movie title to look up in the module-level `indices` map
    - n_recommendations: maximum number of movies to return

    Returns:
    - DataFrame with columns 'title', 'genres' and 'similarity_score', ordered
      from most to least similar (ties keep the order of `movies`), or
    - a message string when the title is unknown.
    """
    # Only the lookup can legitimately fail with KeyError, so only it is guarded.
    try:
        idx = indices[title]
    except KeyError:
        return f"Movie '{title}' not found in database."

    # A title that occurs more than once makes the lookup return a Series;
    # fall back to its first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # One row of the similarity matrix: this movie against every movie.
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Stable descending order, so equal scores stay in catalogue order.
    order = np.argsort(-scores, kind='stable')

    # Drop the query movie by row number. Slicing off the first entry is not
    # enough: any movie with identical genres also scores 1.0 and may sort
    # ahead of the query movie.
    order = order[order != idx][:max(n_recommendations, 0)]

    recommendations = movies.iloc[order][['title', 'genres']].copy()
    recommendations['similarity_score'] = scores[order]

    return recommendations

# Smoke-test the content-based recommender on the first movie in the table
test_movie = movies.iloc[0]['title']
print(f"\nContent-Based Recommendations for: '{test_movie}'")
content_demo = get_content_based_recommendations(test_movie, n_recommendations=5)
print(content_demo)

# ============================================
# STEP 5: COLLABORATIVE FILTERING (OPTIMIZED)
# ============================================
print("\n" + "="*60)
print("[5/6] Building Collaborative Filtering System...")
print("="*60)

# Create user-movie rating matrix
print("Creating user-movie matrix (optimized for Colab)...")

# Filter movies with at least 30 ratings (lowered threshold for sampled data)
MIN_RATINGS = 30
popular_movies = movie_ratings_count[movie_ratings_count['rating_count'] >= MIN_RATINGS]['movieId']
print(f"✓ Found {len(popular_movies):,} movies with at least {MIN_RATINGS} ratings")

filtered_ratings = ratings[ratings['movieId'].isin(popular_movies)]

# Further optimize: limit to top active users
MAX_USERS = 3000
top_users = filtered_ratings['userId'].value_counts().head(MAX_USERS).index
filtered_ratings = filtered_ratings[filtered_ratings['userId'].isin(top_users)]

# Report how many users were actually kept; this is below MAX_USERS whenever
# fewer users than the cap rated a popular movie.
print(f"✓ Using top {len(top_users):,} active users")
print(f"✓ Filtered to {len(filtered_ratings):,} ratings")

# Create pivot table: users as rows, movies as columns.
# Missing (user, movie) pairs become 0 so the matrix is fully numeric.
print("Creating pivot table... (this will take 1-2 minutes)")
user_movie_matrix = filtered_ratings.pivot_table(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)

print(f"✓ User-Movie Matrix shape: {user_movie_matrix.shape}")

# Calculate movie-movie similarity based on user ratings.
# Transposing puts movies on the rows, so each movie is compared through its
# vector of user ratings.
print("Calculating movie similarity matrix...")
movie_similarity = cosine_similarity(user_movie_matrix.T)
movie_similarity_df = pd.DataFrame(
    movie_similarity,
    index=user_movie_matrix.columns,
    columns=user_movie_matrix.columns
)
print("✓ Movie similarity matrix created!")

def get_collaborative_recommendations(movie_title, n_recommendations=10):
    """
    Get movie recommendations based on collaborative filtering.

    Parameters:
    - movie_title: exact title as stored in the module-level `movies` table
    - n_recommendations: maximum number of movies to return

    Returns:
    - DataFrame with columns 'title', 'genres' and 'similarity_score', ordered
      from most to least similar, or
    - a message string when the title is unknown or the movie is not part of
      the module-level `movie_similarity_df`.
    """
    # Get movieId from title (first match if the title is not unique)
    matches = movies.loc[movies['title'] == movie_title, 'movieId']
    if matches.empty:
        return f"Movie '{movie_title}' not found or has insufficient data."
    movie_id = matches.iloc[0]

    # Check if movie exists in similarity matrix
    if movie_id not in movie_similarity_df.columns:
        return f"Not enough ratings for '{movie_title}' to generate recommendations."

    # Remove the query movie by label rather than by position: another movie
    # can tie with it at the top of the ranking.
    similar_movies = (
        movie_similarity_df[movie_id]
        .drop(labels=movie_id, errors='ignore')
        .sort_values(ascending=False)
        .head(max(n_recommendations, 0))
    )

    # The isin() filter returns rows in `movies` order, not in similarity
    # order, so each score is attached through its movieId instead of being
    # assigned positionally (which paired scores with the wrong titles).
    recommendations = movies.loc[
        movies['movieId'].isin(similar_movies.index),
        ['movieId', 'title', 'genres'],
    ].copy()
    recommendations['similarity_score'] = recommendations['movieId'].map(similar_movies)
    recommendations = recommendations.sort_values(
        'similarity_score', ascending=False, kind='mergesort'
    )

    return recommendations[['title', 'genres', 'similarity_score']]

# Smoke-test collaborative filtering on the first movie that passed the
# popularity threshold
popular_movie = movies.loc[movies['movieId'].isin(popular_movies), 'title'].iloc[0]
print(f"\nCollaborative Filtering Recommendations for: '{popular_movie}'")
collab_demo = get_collaborative_recommendations(popular_movie, n_recommendations=5)
print(collab_demo)

# ============================================
# STEP 6: HYBRID RECOMMENDATION SYSTEM
# ============================================
print(f"\n{'=' * 60}")
print("[6/6] Creating Hybrid Recommendation System...")
print("=" * 60)

def get_hybrid_recommendations(movie_title, n_recommendations=10, content_weight=0.5):
    """
    Combine content-based and collaborative filtering.

    Parameters:
    - movie_title: exact title of the movie to find recommendations for
    - n_recommendations: maximum number of movies to return
    - content_weight: weight for content-based (0 to 1), remaining goes to collaborative

    Returns:
    - DataFrame with columns 'title', 'genres' and 'hybrid_score', best first.
    - If one recommender returns a message string instead of a DataFrame, the
      other recommender's result is returned unchanged (which is itself a
      message string when both fail).
    """
    # Ask each recommender for twice as many candidates as needed so the
    # blended ranking has room to reorder them.
    content_recs = get_content_based_recommendations(movie_title, n_recommendations*2)
    collab_recs = get_collaborative_recommendations(movie_title, n_recommendations*2)

    # If either fails, return the other
    if isinstance(content_recs, str):
        return collab_recs
    if isinstance(collab_recs, str):
        return content_recs

    def _weighted(recs, weight):
        # Scale scores to 0-1 by the best score, then apply the weight.
        # When the best score is 0 (or the frame is empty) the division
        # would produce NaN/inf, so those candidates simply contribute 0.
        scored = recs[['title', 'genres']].copy()
        peak = recs['similarity_score'].max()
        if pd.notna(peak) and peak > 0:
            scored['hybrid_score'] = recs['similarity_score'] / peak * weight
        else:
            scored['hybrid_score'] = 0.0
        return scored

    # Merge and aggregate scores: a movie suggested by both recommenders gets
    # the sum of its two weighted scores.
    all_recs = pd.concat([
        _weighted(content_recs, content_weight),
        _weighted(collab_recs, 1 - content_weight),
    ])

    # Never recommend the movie the user asked about.
    all_recs = all_recs[all_recs['title'] != movie_title]

    hybrid_recs = all_recs.groupby('title').agg({
        'genres': 'first',
        'hybrid_score': 'sum'
    }).reset_index()

    # Sort and return top n
    hybrid_recs = hybrid_recs.sort_values('hybrid_score', ascending=False).head(n_recommendations)

    return hybrid_recs

# Smoke-test the hybrid recommender on the same movie used for the
# content-based demo
print(f"\nHybrid Recommendations for: '{test_movie}'")
hybrid_demo = get_hybrid_recommendations(test_movie, n_recommendations=5)
print(hybrid_demo)

# ============================================
# INTERACTIVE RECOMMENDATION FUNCTION
# ============================================

def recommend_movies(movie_title, method='hybrid', n=10):
    """
    Print and return recommendations for a movie.

    Parameters:
    - movie_title: Name of the movie
    - method: 'content' or 'collaborative'; any other value selects the
      hybrid recommender
    - n: Number of recommendations

    Returns whatever the selected recommender returns (the same object that
    is printed).
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"RECOMMENDATIONS FOR: {movie_title}")
    print(f"Method: {method.upper()}")
    print(rule)

    # Dispatch table; anything not listed falls back to the hybrid recommender
    recommenders = {
        'content': get_content_based_recommendations,
        'collaborative': get_collaborative_recommendations,
    }
    recommender = recommenders.get(method, get_hybrid_recommendations)
    result = recommender(movie_title, n)

    print(result)
    return result

# ============================================
# EXAMPLE USAGE
# ============================================
print("\n" + "="*60)
print("EXAMPLE RECOMMENDATIONS")
print("="*60)

# You can test with any movie from your dataset
# Example: recommend_movies('Toy Story (1995)', method='hybrid', n=5)

# The emoji below replace mis-decoded byte sequences (UTF-8 read as cp1252)
# that were being printed verbatim.
print("\n✅ Recommendation System Built Successfully!")
print("\n📊 Performance Stats:")
print(f"   - Movies in database: {len(movies):,}")
print(f"   - Ratings used: {len(ratings):,}")
print(f"   - Users analyzed: {len(top_users):,}")
print(f"   - Movies with enough ratings: {len(popular_movies):,}")

print("\n💡 How to use:")
print("1. Call recommend_movies('Movie Title', method='content')")
print("2. Methods: 'content', 'collaborative', or 'hybrid'")
print("3. Adjust n parameter for number of recommendations")
print("\n💡 Tips:")
print("   - Use exact movie titles including year: 'Toy Story (1995)'")
print("   - 'hybrid' method usually gives best results")
print("   - Content-based works for all movies")
print("   - Collaborative needs movies with sufficient ratings")

# ============================================
# BONUS: EVALUATION METRICS
# ============================================

def evaluate_recommendations(user_id, n_recommendations=10, random_state=None):
    """
    Simple evaluation: recommend movies and check if user rated them highly.

    One movie the user rated 4 or higher is picked at random, collaborative
    recommendations are generated for it, and the recommendations are compared
    with the user's other highly rated movies.

    Parameters:
    - user_id: value to match against the 'userId' column of `ratings`
    - n_recommendations: number of recommendations to request; also the
      denominator of the reported precision
    - random_state: optional seed for the random pick. When None the global
      NumPy random state is used, exactly as before.

    Returns:
    - dict with keys 'test_movie', 'recommendations', 'hits', 'precision', or
    - a message string when the user has no highly rated movies, the picked
      movie is missing from `movies`, or no recommendations can be generated.
    """
    # Get user's highly rated movies (4+ stars)
    user_ratings = ratings[ratings['userId'] == user_id]
    liked_movies = user_ratings.loc[user_ratings['rating'] >= 4, 'movieId'].values

    if len(liked_movies) == 0:
        return "User has no highly rated movies."

    # Pick a random movie user liked
    if random_state is None:
        test_movie_id = np.random.choice(liked_movies)
    else:
        test_movie_id = np.random.RandomState(random_state).choice(liked_movies)

    # A rated movieId is not guaranteed to have a row in `movies`; report
    # that instead of failing with IndexError.
    title_matches = movies.loc[movies['movieId'] == test_movie_id, 'title']
    if title_matches.empty:
        return f"Movie id {test_movie_id} is not present in the movies table."
    test_movie_title = title_matches.iloc[0]

    # Get recommendations
    recs = get_collaborative_recommendations(test_movie_title, n_recommendations)

    if isinstance(recs, str):
        return recs

    # Check how many recommended movies user actually liked
    rec_movie_ids = movies.loc[movies['title'].isin(recs['title']), 'movieId']
    hits = len(set(rec_movie_ids) & set(liked_movies))

    # Guard the denominator so n_recommendations=0 reports 0.0 instead of
    # raising ZeroDivisionError.
    precision = hits / n_recommendations if n_recommendations > 0 else 0.0

    return {
        'test_movie': test_movie_title,
        'recommendations': recs['title'].tolist(),
        'hits': hits,
        'precision': precision
    }

print("\n" + "="*60)
print("Sample Evaluation:")
# Evaluate for whichever user owns the first row of the (sampled) ratings
sample_user = ratings['userId'].iloc[0]
eval_result = evaluate_recommendations(sample_user, 10)
print(eval_result)

print("\n" + "="*60)
# The emoji replaces a mis-decoded byte sequence (UTF-8 read as cp1252)
print("🎬 Ready to recommend movies! Try it out:")
print("   recommend_movies('Inception (2010)', method='hybrid', n=5)")
print("="*60)

MOVIE RECOMMENDATION SYSTEM

[1/6] Loading data...
Original Movies dataset shape: (27278, 3)
Original Ratings dataset shape: (20000263, 4)

⚡ Optimizing dataset for faster processing...
✓ Sampled 200,000 ratings for efficient processing
Working with 200,000 ratings

First few movies:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

[2/6] Exploring the data...

Total movies: 27,278
Total ratings (sampled): 200,000
Total users: 75,393
Average rati