<a href="https://colab.research.google.com/github/Apoorvkhanna2/CineMatch/blob/main/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!pip install -q transformers gradio scikit-learn pandas numpy matplotlib seaborn

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoTokenizer, AutoModel
import gradio as gr
import matplotlib.pyplot as plt
import seaborn as sns
import re
from tqdm.notebook import tqdm
import os

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.1/54.1 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.9/322.9 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.5/11.5 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import os


In [None]:
# Download the MovieLens dataset
!wget -q https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip -q ml-latest-small.zip

In [None]:
# Read the two CSV files used by this notebook: the movie list and the links table
movies_df, links_df = (
    pd.read_csv(csv_path)
    for csv_path in ('ml-latest-small/movies.csv', 'ml-latest-small/links.csv')
)

In [None]:
# Merge the dataframes.
# Link columns left over from an earlier run of this cell are dropped first, so
# re-running it does not create suffixed duplicates (e.g. imdbId_x / imdbId_y).
link_columns = [col for col in links_df.columns if col != 'movieId']
movies_df = pd.merge(
    movies_df.drop(columns=link_columns, errors='ignore'),
    links_df,
    on='movieId',
    validate='one_to_one',  # fail loudly if a movieId is duplicated on either side
)

# Display the first few rows
print("Dataset Overview:")
print(f"Number of movies: {len(movies_df)}")
movies_df.head()

Dataset Overview:
Number of movies: 9742


Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0


In [None]:
!pip install gradio



In [None]:
# Data preprocessing
import re

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Import AutoTokenizer and AutoModel from transformers
from transformers import AutoTokenizer, AutoModel

In [None]:
# Import the tqdm module
from tqdm.notebook import tqdm

In [None]:
# Data preprocessing
def clean_title(title):
    """Return ``title`` without a trailing parenthesised four-digit year.

    Whitespace around the year suffix is removed as well; titles that do not end
    with such a suffix are returned unchanged.
    """
    year_suffix = re.compile(r"\s*\(\d{4}\)\s*$")
    return year_suffix.sub("", title)

movies_df['clean_title'] = movies_df['title'].apply(clean_title)

# Convert genres from pipe-separated string to a clean text format.
# regex=False makes the replacement literal on every pandas version: '|' is the
# regex alternation operator and older pandas releases default to regex=True.
movies_df['genres'] = movies_df['genres'].str.replace('|', ' ', regex=False)

# Create a text field combining title and genres for better recommendations
movies_df['text_features'] = movies_df['clean_title'] + ' ' + movies_df['genres']

# TF-IDF based recommendation system
print("Building TF-IDF vectors...")
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df['text_features'])
# Pairwise cosine similarity between every pair of movies; row/column i of the
# result corresponds to the i-th row (by position) of movies_df.
tfidf_similarity = cosine_similarity(tfidf_matrix)

# Function to get TF-IDF based recommendations
def get_tfidf_recommendations(movie_title, similarity_matrix=tfidf_similarity, df=movies_df, n=10):
    """Recommend the movies most similar to the first title containing ``movie_title``.

    Args:
        movie_title: Text searched for in ``df['clean_title']``. It is matched as a
            literal, case-insensitive substring (not as a regular expression).
        similarity_matrix: Square similarity matrix whose row/column ``i``
            corresponds to the ``i``-th row (by position) of ``df``.
        df: DataFrame with ``clean_title``, ``movieId``, ``title`` and ``genres`` columns.
        n: Maximum number of recommendations to return (coerced to ``int``).

    Returns:
        DataFrame with columns ``movieId``, ``title``, ``genres`` and ``similarity``,
        ordered from most to least similar. Empty when no title matches.
    """
    result_columns = ['movieId', 'title', 'genres', 'similarity']
    n = max(int(n), 0)  # UI widgets may hand over floats; slicing needs an int

    # regex=False: user input such as "(500) Days" or "What If...?" must not be
    # interpreted (or rejected) as a regular expression.
    title_mask = df['clean_title'].str.contains(movie_title, case=False, regex=False, na=False)
    match_positions = np.flatnonzero(title_mask.to_numpy(dtype=bool))

    if len(match_positions) == 0:
        return pd.DataFrame(columns=result_columns)

    # If multiple matches, select the first one. Row positions (not index labels)
    # are used because the similarity matrix is aligned with row order, whatever
    # index df happens to carry.
    movie_pos = int(match_positions[0])

    scores = np.asarray(similarity_matrix[movie_pos]).ravel()

    # Stable descending sort: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query movie explicitly. A movie with identical text features
    # also scores 1.0 and can sort ahead of it, so dropping "the first entry"
    # would sometimes drop the wrong row and recommend the query movie itself.
    ranked = ranked[ranked != movie_pos][:n]

    recommended_movies = df.iloc[ranked].copy()
    recommended_movies['similarity'] = scores[ranked]

    return recommended_movies[result_columns]

# BERT-based recommendation system: load the tokenizer and the encoder from the
# same pretrained checkpoint.
bert_checkpoint = 'bert-base-uncased'
print("Loading BERT model...")
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
model = AutoModel.from_pretrained(bert_checkpoint)

# Function to get BERT embeddings for text
def get_bert_embedding(text, max_length=128):
    """Embed text with the module-level BERT ``tokenizer`` and ``model``.

    Args:
        text: Input passed straight to the tokenizer (a string, or a list of
            strings that is padded into one batch).
        max_length: Maximum number of tokens kept per text; longer inputs are truncated.

    Returns:
        2-D NumPy array with one row per input text, holding the final hidden
        state of the first ([CLS]) token.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
    # Inference only: disable autograd so no computation graph is built and kept
    # alive for every forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    # Use CLS token embedding as the sentence embedding
    return outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()

# Only process a subset of movies to save computation time in Colab
# In a production system, you would process all movies
print("Generating BERT embeddings for movies...")
sample_size = min(2000, len(movies_df))  # Limit to 2000 movies for demo
# Seeded generator: the same subset is drawn on every run, so the BERT
# recommendations are reproducible after a kernel restart.
sample_rng = np.random.default_rng(42)
sample_indices = sample_rng.choice(len(movies_df), size=sample_size, replace=False)
sample_movies_df = movies_df.iloc[sample_indices].copy()

# Create BERT embeddings in small batches (this might take some time).
# get_bert_embedding returns one row per input text, so a list of texts can be
# embedded in a single forward pass instead of one call per movie.
embedding_batch_size = 32
sample_texts = sample_movies_df['text_features'].tolist()
bert_embeddings = []
for start in tqdm(range(0, len(sample_texts), embedding_batch_size)):
    batch_texts = sample_texts[start:start + embedding_batch_size]
    bert_embeddings.append(get_bert_embedding(batch_texts))

# Stack the per-batch arrays into a single 2-D array (one row per sampled movie)
bert_embeddings = np.vstack(bert_embeddings)

# Compute similarity matrix for BERT embeddings
bert_similarity = cosine_similarity(bert_embeddings)

# Function to get BERT-based recommendations
def get_bert_recommendations(movie_title, similarity_matrix=bert_similarity, df=sample_movies_df, n=10):
    """Recommend the sampled movies most similar to the first title containing ``movie_title``.

    Args:
        movie_title: Text searched for in ``df['clean_title']``. It is matched as a
            literal, case-insensitive substring (not as a regular expression).
        similarity_matrix: Square similarity matrix whose row/column ``i``
            corresponds to the ``i``-th row (by position) of ``df``.
        df: DataFrame with ``clean_title``, ``movieId``, ``title`` and ``genres`` columns.
        n: Maximum number of recommendations to return (coerced to ``int``).

    Returns:
        DataFrame with columns ``movieId``, ``title``, ``genres`` and ``similarity``,
        ordered from most to least similar and keeping the index labels of ``df``.
        Empty when no title in ``df`` matches.
    """
    result_columns = ['movieId', 'title', 'genres', 'similarity']
    n = max(int(n), 0)  # UI widgets may hand over floats; slicing needs an int

    # regex=False: user input containing characters such as "(" or "?" must not
    # be interpreted (or rejected) as a regular expression.
    title_mask = df['clean_title'].str.contains(movie_title, case=False, regex=False, na=False)
    match_positions = np.flatnonzero(title_mask.to_numpy(dtype=bool))

    if len(match_positions) == 0:
        return pd.DataFrame(columns=result_columns)

    # If multiple matches, select the first one; its row position in df is also
    # its row in the similarity matrix.
    local_idx = int(match_positions[0])

    scores = np.asarray(similarity_matrix[local_idx]).ravel()

    # Stable descending sort: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query movie explicitly instead of assuming it sorts first;
    # another row with an identical embedding would tie with it.
    ranked = ranked[ranked != local_idx][:n]

    # Rows are taken from df itself (by position) rather than from the global
    # movies_df, so the function also works when a different df is passed in.
    recommended_movies = df.iloc[ranked].copy()
    recommended_movies['similarity'] = scores[ranked]

    return recommended_movies[result_columns]

# Combined recommendation function (FIXED)
def get_recommendations(movie_title, method='combined', n=10):
    """Return up to ``n`` recommendations using TF-IDF, BERT, or both.

    Args:
        movie_title: Title text forwarded to the underlying recommenders.
        method: ``'tfidf'`` or ``'bert'``; any other value selects the combined mode.
        n: Number of recommendations to return (coerced to ``int``).

    Returns:
        DataFrame with ``movieId``, ``title``, ``genres`` and ``similarity`` columns.
        In combined mode, when both recommenders produce results, a ``source``
        column records which recommender supplied each row and the rows are
        ordered by their rank within that recommender (best first), so the
        ``similarity`` column is not necessarily monotonic.
    """
    n = int(n)  # UI sliders may deliver floats; slicing and head() need an int

    if method == 'tfidf':
        return get_tfidf_recommendations(movie_title, n=n)
    if method == 'bert':
        return get_bert_recommendations(movie_title, n=n)

    # Combined: get recommendations from both methods
    tfidf_recs = get_tfidf_recommendations(movie_title, n=n * 2)
    bert_recs = get_bert_recommendations(movie_title, n=n * 2)

    # If either method fails, return results from the other
    if len(tfidf_recs) == 0:
        return bert_recs.head(n) if len(bert_recs) > 0 else pd.DataFrame(columns=['movieId', 'title', 'genres', 'similarity'])
    if len(bert_recs) == 0:
        return tfidf_recs.head(n)

    # The two cosine scores are not on a comparable scale (the BERT scores
    # cluster close to 1.0), so ordering the merged list by raw score lets one
    # method crowd out the other. Fuse by per-method rank instead: best of
    # each method first, then the runners-up, and so on.
    tfidf_recs = tfidf_recs.assign(source='tfidf', _rank=np.arange(len(tfidf_recs)))
    bert_recs = bert_recs.assign(source='bert', _rank=np.arange(len(bert_recs)))

    all_recs = pd.concat([tfidf_recs, bert_recs])

    # Stable sort keeps TF-IDF ahead of BERT at equal rank; for a movie suggested
    # by both methods only its best-ranked occurrence is kept.
    all_recs = all_recs.sort_values('_rank', kind='mergesort')
    all_recs = all_recs.drop_duplicates(subset='movieId', keep='first')

    # Return the top N recommendations without the helper column
    return all_recs.drop(columns='_rank').head(n)

# Sample test: run the combined recommender once and print the resulting table
test_movie = "Toy Story"
recommendations = get_recommendations(test_movie, method='combined', n=5)
print(f"\nTest recommendations for '{test_movie}':", recommendations, sep="\n")

# Gradio Interface
def recommend_movies(movie_title, recommendation_method, num_recommendations):
    """Gradio callback: fetch recommendations and render them as a chart plus Markdown.

    Args:
        movie_title: Text typed by the user; surrounding whitespace is ignored.
        recommendation_method: Method name forwarded to ``get_recommendations``.
        num_recommendations: Requested number of results (coerced to ``int``).

    Returns:
        ``(figure, markdown_text)``. ``figure`` is ``None`` when the title is blank
        or nothing matches; the text then carries the explanatory message.
    """
    # Local import: the Figure class is used directly instead of pyplot (see below).
    from matplotlib.figure import Figure

    movie_title = (movie_title or "").strip()
    if not movie_title:
        return None, "Please enter a movie title"

    results = get_recommendations(movie_title, method=recommendation_method, n=int(num_recommendations))

    if len(results) == 0:
        return None, f"No movie found with title containing '{movie_title}'"

    # One Markdown paragraph group per recommended movie
    movie_data = [
        f"**{row['title']}**\n\nGenres: {row['genres']}\n\nSimilarity: {row['similarity']:.2f}"
        for _, row in results.iterrows()
    ]

    # Chart data: titles without the year suffix, and their similarity scores
    movie_titles = results['title'].str.replace(r' \(\d{4}\)', '', regex=True).values
    similarity_scores = results['similarity'].values

    # Truncate long titles for better display
    shortened_titles = [t[:20] + '...' if len(t) > 20 else t for t in movie_titles]

    # Build the figure without pyplot: pyplot keeps a global reference to every
    # figure it creates until it is closed, which leaks one figure per request
    # in a long-running app.
    fig = Figure(figsize=(12, 6))
    ax = fig.subplots()

    # Numeric bar positions with explicit tick labels: using the label strings
    # as y-values would draw movies whose truncated titles coincide on the same bar.
    bar_positions = np.arange(len(shortened_titles))
    bars = ax.barh(bar_positions, similarity_scores, color=sns.color_palette("viridis", len(movie_titles)))
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(shortened_titles)
    ax.invert_yaxis()  # first (best) recommendation at the top of the chart

    # Add similarity scores to the end of each bar
    for bar, score in zip(bars, similarity_scores):
        ax.text(score + 0.01, bar.get_y() + bar.get_height()/2, f'{score:.2f}',
                va='center', fontweight='bold')

    # Add title and labels
    ax.set_title(f'Movies Similar to "{movie_title}"', fontsize=15, fontweight='bold')
    ax.set_xlabel('Similarity Score', fontsize=12)
    ax.set_xlim(0, 1.1)  # leave room to the right of the bars for the score labels

    # Remove top and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    fig.tight_layout()

    return fig, "\n\n".join(movie_data)

# Create the Gradio interface: a Blocks layout with the Soft theme. The inputs sit in
# the first column of the row and the chart / text outputs in the second column.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header and short description of the app
    gr.Markdown(
        """
        # 🎬 Movie Recommendation System

        Enter a movie title to get personalized recommendations based on BERT and TF-IDF algorithms.

        This system analyzes movie titles and genres to find films with similar content and themes.
        """
    )

    with gr.Row():
        # Input column (scale=3): title box, method selector, result count, submit button
        with gr.Column(scale=3):
            movie_input = gr.Textbox(
                label="Enter a Movie Title",
                placeholder="e.g. The Godfather, Toy Story, Inception...",
                info="Type part of a movie title to search"
            )

            with gr.Row():
                # The choices are the method strings passed on to recommend_movies
                method_select = gr.Radio(
                    ["tfidf", "bert", "combined"],
                    label="Recommendation Method",
                    value="combined",
                    info="TF-IDF focuses on keywords, BERT understands context, Combined uses both"
                )
                # Number of results to request: 5 to 20 in steps of 1, default 10
                num_recommendations = gr.Slider(
                    minimum=5,
                    maximum=20,
                    value=10,
                    step=1,
                    label="Number of Recommendations"
                )

            submit_btn = gr.Button("Get Recommendations", variant="primary")

        # Output column (scale=4): receives the two values returned by recommend_movies
        with gr.Column(scale=4):
            output_plot = gr.Plot(label="Recommendation Visualization")
            output_text = gr.Markdown(label="Recommended Movies")

    # Wire the button to recommend_movies: the three inputs are passed in this order,
    # and the two returned values go to the plot and the Markdown output respectively.
    submit_btn.click(
        recommend_movies,
        inputs=[movie_input, method_select, num_recommendations],
        outputs=[output_plot, output_text]
    )

    # Static explanatory footer
    gr.Markdown(
        """
        ### How It Works

        This system uses two complementary techniques:

        - **TF-IDF**: Analyzes word frequency in movie titles and genres to find statistical similarities
        - **BERT**: Uses advanced natural language understanding to capture semantic meaning and context
        - **Combined**: Blends both approaches for well-rounded recommendations

        *Created with ❤️ using Gradio, Transformers, and Scikit-learn*
        """
    )

# Launch the interface (with debug mode enabled)
demo.launch(debug=True)

Building TF-IDF vectors...
Loading BERT model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Generating BERT embeddings for movies...


  0%|          | 0/2000 [00:00<?, ?it/s]


Test recommendations for 'Toy Story':
      movieId                                     title  \
2355     3114                        Toy Story 2 (1999)   
8039    98243              Rise of the Guardians (2012)   
7133    71264  Cloudy with a Chance of Meatballs (2009)   
7761    91386                     Happy Feet Two (2011)   
7302    76093           How to Train Your Dragon (2010)   

                                           genres  similarity source  
2355  Adventure Animation Children Comedy Fantasy    1.000000  tfidf  
8039    Adventure Animation Children Fantasy IMAX    0.981885   bert  
7133              Animation Children Fantasy IMAX    0.977706   bert  
7761               Animation Children Comedy IMAX    0.976091   bert  
7302    Adventure Animation Children Fantasy IMAX    0.976022   bert  
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off 