# Movie Recommendation System 

## Objectives
In this project, we aim to build a **movie recommendation system** to assist users in discovering movies they are likely to enjoy. 
The system uses historical user-movie interaction data to predict preferences and recommends movies accordingly. 

We will explore popular recommendation techniques, including:
- Content based filtering
- Collaborative based filtering

The goal is to evaluate these methods and identify the most effective approach based on accuracy metrics like RMSE and precision.

Recommendation systems are critical in the era of data-driven personalization. Companies like Netflix, Amazon, and YouTube leverage recommendation algorithms to enhance user experience and increase engagement.


## Libraries imported

In [1]:
# Movie Recommendation System Notebook

# Section 1: Libraries and Setup
# Importing all necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import json
import random# Ignore warnings for cleaner outputs
import pickle
import os
from datetime import datetime

# Machine Learning Libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
# For deep learning later
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Dot, Dropout
from tensorflow.keras.regularizers import l2


from tensorflow.keras.callbacks import EarlyStopping
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from rapidfuzz import process, fuzz

### Dataset loading 

In [2]:
credits_df = pd.read_csv(r"D:\Programming\Dataset\tmdb_5000_credits.csv")
movies_df= pd.read_csv(r"D:\Programming\Dataset\tmdb_5000_movies.csv")


In [3]:
credits_df.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [4]:
movies_df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [5]:
movies_df.describe()

Unnamed: 0,budget,id,popularity,revenue,runtime,vote_average,vote_count
count,4803.0,4803.0,4803.0,4803.0,4801.0,4803.0,4803.0
mean,29045040.0,57165.484281,21.492301,82260640.0,106.875859,6.092172,690.217989
std,40722390.0,88694.614033,31.81665,162857100.0,22.611935,1.194612,1234.585891
min,0.0,5.0,0.0,0.0,0.0,0.0,0.0
25%,790000.0,9014.5,4.66807,0.0,94.0,5.6,54.0
50%,15000000.0,14629.0,12.921594,19170000.0,103.0,6.2,235.0
75%,40000000.0,58610.5,28.313505,92917190.0,118.0,6.8,737.0
max,380000000.0,459488.0,875.581305,2787965000.0,338.0,10.0,13752.0


In [6]:
credits_df.describe()

Unnamed: 0,movie_id
count,4803.0
mean,57165.484281
std,88694.614033
min,5.0
25%,9014.5
50%,14629.0
75%,58610.5
max,459488.0


In [7]:
movies_df.columns



Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [8]:
credits_df.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

In [9]:
# Rename the 'id' column in movies_df to 'movie_id'
movies_df.rename(columns={'id': 'movie_id'}, inplace=True)

# Merge the datasets
merged_df = movies_df.merge(credits_df, on='movie_id', how='left')

# Display the merged dataframe
merged_df.head()


Unnamed: 0,budget,genres,homepage,movie_id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [10]:
# Drop duplicates and unnecessary columns
merged_df = merged_df.drop(columns=['title_y'])
merged_df.rename(columns={'title_x': 'title'}, inplace=True)

In [11]:
merged_df.columns

Index(['budget', 'genres', 'homepage', 'movie_id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew'],
      dtype='object')

## Processing JSON-like Columns in the DataFrame
The provided code handles the processing of JSON-like columns in a pandas DataFrame. These columns contain structured data (often in list or dictionary formats), which need to be transformed into a more readable and usable format. The code achieves this by extracting specific fields from these JSON-like strings.



In [12]:


def parse_json_column(df, column, key, top_n=None):
    """
    Parses a JSON-like column and extracts specified keys.
    Args:
        df (pd.DataFrame): The DataFrame containing the column.
        column (str): The name of the column to process.
        key (str): The key to extract values from JSON.
        top_n (int, optional): Limit to the top N values.
    Returns:
        pd.Series: A Series of extracted values as strings.
    """
    def extract_values(row):
        try:
            data = json.loads(row)  # Parse JSON safely
            if isinstance(data, list):
                values = [item.get(key, '') for item in data if isinstance(item, dict)]
                if top_n:
                    values = values[:top_n]
                return ' '.join(values)
            return ''
        except (json.JSONDecodeError, TypeError):
            return ''
    
    return df[column].fillna('').astype(str).apply(extract_values)


def extract_from_crew(row, job):
    """
    Extracts names from the crew column based on job type.
    Args:
        row (str): JSON-like string of crew data.
        job (str): Job title to filter (e.g., "Director").
    Returns:
        str: Space-separated names matching the job.
    """
    try:
        data = json.loads(row)  # Parse JSON safely
        if isinstance(data, list):
            return ' '.join([item['name'] for item in data if isinstance(item, dict) and item.get('job', '').lower() == job.lower()])
        return ''
    except (json.JSONDecodeError, TypeError):
        return ''


# Process JSON-like columns efficiently
merged_df['genres_'] = parse_json_column(merged_df, 'genres', 'name')
merged_df['keywords_'] = parse_json_column(merged_df, 'keywords', 'name', top_n=5)
merged_df['production_companies_'] = parse_json_column(merged_df, 'production_companies', 'name', top_n=3)
merged_df['main_cast_'] = parse_json_column(merged_df, 'cast', 'name', top_n=5)
merged_df['spoken_languages_'] = parse_json_column(merged_df, 'spoken_languages', 'name')
merged_df['production_countries_'] = parse_json_column(merged_df, 'production_countries', 'name')

# Process 'crew' column for the 'Director' job
merged_df['director_'] = merged_df['crew'].fillna('').apply(lambda x: extract_from_crew(x, 'Director'))

# Display the updated DataFrame to check results
print(merged_df[['genres_', 'keywords_', 'production_companies_', 'main_cast_', 'spoken_languages_', 'production_countries_', 'director_']].head())


                                    genres_  \
0  Action Adventure Fantasy Science Fiction   
1                  Adventure Fantasy Action   
2                    Action Adventure Crime   
3               Action Crime Drama Thriller   
4          Action Adventure Science Fiction   

                                           keywords_  \
0  culture clash future space war space colony so...   
1  ocean drug abuse exotic island east india trad...   
2         spy based on novel secret agent sequel mi6   
3  dc comics crime fighter terrorist secret ident...   
4  based on novel mars medallion space travel pri...   

                               production_companies_  \
0  Ingenious Film Partners Twentieth Century Fox ...   
1  Walt Disney Pictures Jerry Bruckheimer Films S...   
2                       Columbia Pictures Danjaq B24   
3   Legendary Pictures Warner Bros. DC Entertainment   
4                               Walt Disney Pictures   

                                          m

In [13]:
merged_df.drop(columns=[
    'genres', 
    'keywords', 
    'production_companies', 
    'cast', 
    'spoken_languages', 
    'production_countries', 
    'crew'
], inplace=True)


In [14]:
merged_df.head()

Unnamed: 0,budget,homepage,movie_id,original_language,original_title,overview,popularity,release_date,revenue,runtime,...,title,vote_average,vote_count,genres_,keywords_,production_companies_,main_cast_,spoken_languages_,production_countries_,director_
0,237000000,http://www.avatarmovie.com/,19995,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,2009-12-10,2787965087,162.0,...,Avatar,7.2,11800,Action Adventure Fantasy Science Fiction,culture clash future space war space colony so...,Ingenious Film Partners Twentieth Century Fox ...,Sam Worthington Zoe Saldana Sigourney Weaver S...,English Español,United States of America United Kingdom,James Cameron
1,300000000,http://disney.go.com/disneypictures/pirates/,285,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,2007-05-19,961000000,169.0,...,Pirates of the Caribbean: At World's End,6.9,4500,Adventure Fantasy Action,ocean drug abuse exotic island east india trad...,Walt Disney Pictures Jerry Bruckheimer Films S...,Johnny Depp Orlando Bloom Keira Knightley Stel...,English,United States of America,Gore Verbinski
2,245000000,http://www.sonypictures.com/movies/spectre/,206647,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,2015-10-26,880674609,148.0,...,Spectre,6.3,4466,Action Adventure Crime,spy based on novel secret agent sequel mi6,Columbia Pictures Danjaq B24,Daniel Craig Christoph Waltz Léa Seydoux Ralph...,Français English Español Italiano Deutsch,United Kingdom United States of America,Sam Mendes
3,250000000,http://www.thedarkknightrises.com/,49026,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,2012-07-16,1084939099,165.0,...,The Dark Knight Rises,7.6,9106,Action Crime Drama Thriller,dc comics crime fighter terrorist secret ident...,Legendary Pictures Warner Bros. DC Entertainment,Christian Bale Michael Caine Gary Oldman Anne ...,English,United States of America,Christopher Nolan
4,260000000,http://movies.disney.com/john-carter,49529,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,2012-03-07,284139100,132.0,...,John Carter,6.1,2124,Action Adventure Science Fiction,based on novel mars medallion space travel pri...,Walt Disney Pictures,Taylor Kitsch Lynn Collins Samantha Morton Wil...,English,United States of America,Andrew Stanton


In [15]:
filtered_df1 = merged_df.copy()


### Filtering Movies Based on Minimum Vote Count and Popularity
This section filters the movies dataset to retain only those with a vote count of at least MIN_VOTE_COUNT and a popularity score of at least MIN_POPULARITY. The criteria can be adjusted based on the dataset's characteristics or the desired level of filtering. After applying the filters, the number of remaining movies is displayed.

In [16]:
# Apply filters based on vote count and popularity
MIN_VOTE_COUNT = 50  # Adjust as per your dataset
MIN_POPULARITY = 5.0  # Adjust as per your dataset

filtered_df = filtered_df1[
    (filtered_df1['vote_count'] >= MIN_VOTE_COUNT) &
    (filtered_df1['popularity'] >= MIN_POPULARITY)

]
print(f"Number of movies after filtering: {filtered_df1.shape[0]}")


Number of movies after filtering: 4803


### Feature Scaling

In [17]:


# Normalize vote count and popularity
scaler = MinMaxScaler()
filtered_df1['scaled_vote_average'] = scaler.fit_transform(filtered_df1[['vote_average']])
filtered_df1['scaled_popularity'] = scaler.fit_transform(filtered_df1[['popularity']])
filtered_df1['scaled_vote_count'] = scaler.fit_transform(filtered_df1[['vote_count']])


In [18]:
print(filtered_df1.dtypes)


budget                     int64
homepage                  object
movie_id                   int64
original_language         object
original_title            object
overview                  object
popularity               float64
release_date              object
revenue                    int64
runtime                  float64
status                    object
tagline                   object
title                     object
vote_average             float64
vote_count                 int64
genres_                   object
keywords_                 object
production_companies_     object
main_cast_                object
spoken_languages_         object
production_countries_     object
director_                 object
scaled_vote_average      float64
scaled_popularity        float64
scaled_vote_count        float64
dtype: object


## Feature Engineering and Weighting

In [19]:
def repeat_text(text, times):
    """Repeat the text a specified number of times with space separation"""
    return ' '.join([text] * times)

def repeat_scaled_feature(value, times):
    """Repeat 'scaled' text based on scaled numerical value"""
    return ' '.join(['scaled'] * int(round(value * times)))

# Ensure DataFrame exists and contains necessary columns
if 'combined_features' in filtered_df1.columns:
    filtered_df1.drop(columns=['combined_features'], inplace=True)

filtered_df1['combined_features'] = (
    filtered_df1['genres_'].fillna('').apply(lambda x: repeat_text(x, 3)) + ' ' +
    filtered_df1['main_cast_'].fillna('').apply(lambda x: repeat_text(x, 2)) + ' ' +
    filtered_df1['director_'].fillna('').apply(lambda x: repeat_text(x, 2)) + ' ' +
    filtered_df1['overview'].fillna('') + ' ' +
    filtered_df1['keywords_'].fillna('') + ' ' +
    filtered_df1['production_companies_'].fillna('') + ' ' +
    filtered_df1['spoken_languages_'].fillna('') + ' ' +
    filtered_df1['scaled_vote_count'].fillna(0).apply(lambda x: repeat_scaled_feature(x, 4)) + ' ' +
    filtered_df1['scaled_popularity'].fillna(0).apply(lambda x: repeat_scaled_feature(x, 3)) + ' ' +
    filtered_df1['scaled_vote_average'].fillna(0).apply(lambda x: repeat_scaled_feature(x, 2)) + ' ' +
    filtered_df1['release_date'].fillna('').apply(lambda x: repeat_text(str(x), 1))  # Adding release year
)

# Preprocess combined features
filtered_df1['combined_features'] = filtered_df1['combined_features'].str.lower()

# Fit the TFIDF Vectorizer
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    smooth_idf=True  # Ensure numerical stability
)

# Fit-transform the combined features
tfidf_matrix = tfidf.fit_transform(filtered_df1['combined_features'])

# Optional: Calculate sparsity (if needed)
sparsity = 1.0 - (tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1]))
print(f"Matrix sparsity: {sparsity:.2%}")


Matrix sparsity: 99.01%


### Matrix Sparsity

The code optionally calculates the sparsity of the TF-IDF matrix, which indicates the proportion of zero values in the matrix. Sparsity is a common characteristic of TF-IDF matrices.. A sparsity of 98.88% means that the matrix is largely empty, with only 1.12% of its elements containing meaningful data. This is common in recommendation systems, where most users have interacted with only a small fraction of the available items.

## Autoencoder-Based Movie Recommendation 

This script implements a movie recommendation system that leverages an autoencoder for dimensionality reduction and similarity-based recommendations. Below 

In [20]:
# Autoencoder Recommender Class
class AutoencoderRecommender(BaseEstimator, RegressorMixin):
    """
    A movie recommendation system using an autoencoder for dimensional reduction
    and similarity-based recommendations.
    """
    
    def __init__(self, input_dim, hidden_units=512, latent_dim=128, 
                 dropout_rate=0.2, verbose=0):
        """
        Initialize the autoencoder-based recommender.
        
        Args:
            input_dim (int): Dimension of input features
            hidden_units (int): Number of units in the first hidden layer
            latent_dim (int): Dimension of the latent space
            dropout_rate (float): Dropout rate for regularization
            verbose (int): Verbosity level for model training
        """
        self.input_dim = input_dim
        self.hidden_units = hidden_units
        self.latent_dim = latent_dim
        self.dropout_rate = dropout_rate
        self.verbose = verbose
        self.model = None
        self.encoder = None
        
    def _create_model(self):
        """Create the autoencoder architecture."""
        input_layer = Input(shape=(self.input_dim,))
        
        # Encoder
        encoded = Dense(self.hidden_units, activation='relu', kernel_regularizer=l2(1e-4))(input_layer)
        encoded = Dropout(self.dropout_rate)(encoded)
        encoded = Dense(self.hidden_units // 2, activation='relu', kernel_regularizer=l2(1e-4))(encoded)
        encoded = Dropout(self.dropout_rate)(encoded)
        
        # Latent space
        latent = Dense(self.latent_dim, activation='tanh', name='Latent_Space')(encoded)
        
        # Decoder
        decoded = Dense(self.hidden_units // 2, activation='relu', kernel_regularizer=l2(1e-4))(latent)
        decoded = Dropout(self.dropout_rate)(decoded)
        decoded = Dense(self.hidden_units, activation='relu', kernel_regularizer=l2(1e-4))(decoded)
        decoded = Dense(self.input_dim, activation='sigmoid')(decoded)
        
        # Create and compile model
        model = Model(inputs=input_layer, outputs=decoded)
        model.compile(optimizer='adam', loss='cosine_similarity')
        
        return model
    
    def fit(self, X, y=None):
        """
        Fit the autoencoder model to the data.

        Args:
            X: Input features (TF-IDF matrix)
            y: Not used, included for sklearn compatibility

        Returns:
            self: The fitted model
        """
        self.model = self._create_model()

        early_stopping = EarlyStopping(
            monitor='loss',
            patience=3,
            restore_best_weights=True
        )

        self.model.fit(
            X, X,  # Autoencoder reconstructs the input
            validation_split=0.2,
            epochs=min(50, max(5, X.shape[0] // 100)),  # Fixed usage of X.shape[0]
            batch_size=256,
            verbose=self.verbose,
            callbacks=[early_stopping]
        )

        # Create encoder model
        encoder_layer = [layer for layer in self.model.layers if 'Latent_Space' in layer.name][0]
        self.encoder = Model(inputs=self.model.inputs, outputs=encoder_layer.output)

        return self
    
    def transform(self, X):
        """
        Transform data to latent space representations.
        
        Args:
            X: Input features to transform
            
        Returns:
            array: Latent space representations
        """
        if self.encoder is None:
            raise ValueError("Model must be fitted before transform.")
        return self.encoder.predict(X, verbose=self.verbose)
    
    def predict(self, X):
        """
        Reconstruct input data through the autoencoder.
        
        Args:
            X: Input features to reconstruct
            
        Returns:
            array: Reconstructed features
        """
        if self.model is None:
            raise ValueError("Model must be fitted before predict.")
        return self.model.predict(X, verbose=self.verbose)


# Movie Recommender Class
class MovieRecommender:
    """
    Movie recommendation system using autoencoder-based similarity.
    """
    
    def __init__(self, filtered_df1, tfidf_matrix, hidden_units=512):
        """
        Initialize the movie recommender.
        
        Args:
            filtered_df1: DataFrame containing movie information
            tfidf_matrix: TF-IDF features for movies
            hidden_units: Number of hidden units for autoencoder
        """
        self.df = filtered_df1
        self.tfidf_matrix = tfidf_matrix
        self.hidden_units = hidden_units
        self.model = None
        self.similarity_matrix = None
        
        # Preprocess title column
        self.df['title'] = self.df['title'].fillna('').str.lower()
        
        # Determine available columns
        self.available_columns = ['title']  # Title is required
        for col in ['genres', 'main_cast', 'director', 'year', 'rating']:
            if col in self.df.columns:
                self.available_columns.append(col)

    def train(self):
        """Train the autoencoder and compute similarity matrix."""
        # Convert sparse TF-IDF matrix to dense
        tfidf_dense = self.tfidf_matrix.toarray() if hasattr(self.tfidf_matrix, "toarray") else self.tfidf_matrix

        # Initialize and train autoencoder
        self.model = AutoencoderRecommender(
            input_dim=tfidf_dense.shape[1],
            hidden_units=self.hidden_units
        )
        self.model.fit(tfidf_dense)

        # Generate embeddings and compute similarity
        movie_embeddings = self.model.transform(tfidf_dense)
        self.similarity_matrix = cosine_similarity(movie_embeddings)
        
    def recommend(self, title, top_n=10, similarity_threshold=80):
        """
        Recommend movies similar to the given title.
        
        Args:
            title (str): Movie title to base recommendations on
            top_n (int): Number of recommendations to return
            similarity_threshold (int): Minimum similarity score for title matching
            
        Returns:
            DataFrame: Recommended movies with their details
        """
        try:
            # Normalize input title
            title = title.lower()
            
            # Find the closest matching title
            best_match = process.extractOne(
                title, 
                self.df['title'],
                scorer=fuzz.ratio
            )
            
            if best_match is None or best_match[1] < similarity_threshold:
                return f"No close matches found for '{title}'. Did you mean: {best_match[0]}?"
                
            movie_idx = self.df.index[self.df['title'] == best_match[0]][0]
            
            # Get similarity scores and indices
            sim_scores = list(enumerate(self.similarity_matrix[movie_idx]))
            sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
            sim_indices = [i[0] for i in sim_scores[1:top_n+1]]
            
            # Return recommended movies with available columns
            recommendations = self.df.iloc[sim_indices][self.available_columns].copy()
            
            # Add similarity scores
            recommendations['similarity'] = [s[1] for s in sim_scores[1:top_n+1]]
            
            return recommendations
            
        except Exception as e:
            return f"An error occurred: {str(e)}"
            
    def get_available_columns(self):
        """Return the list of available columns in the dataset."""
        return self.available_columns


### Movie Recommendation System Using TF-IDF and Cosine Similarity
This code snippet demonstrates a simple movie recommendation system. It uses TF-IDF (Term Frequency-Inverse Document Frequency) to compute a similarity matrix based on textual data, such as movie descriptions or metadata. By calculating the cosine similarity between the target movie and all other movies, the system identifies the most similar ones.

The recommendations are refined further by adding additional criteria like vote count and popularity, ensuring the most relevant and highly-rated movies are suggested. Finally, the top 10 movies are presented as recommendations, excluding the target movie itself.

This system combines both content-based filtering (using cosine similarity) and popularity metrics for more precise results.

In [21]:


target_movie_idx = 0  # Index of the target movie
cosine_sim = cosine_similarity(tfidf_matrix)

# Get similarity scores for the target movie
similar_movies = list(enumerate(cosine_sim[target_movie_idx]))
similar_movies_sorted = sorted(similar_movies, key=lambda x: x[1], reverse=True)

# Refine recommendations by vote count and popularity
recommended_indices = [
    idx for idx, score in similar_movies_sorted if idx != target_movie_idx
]

# Add popularity and vote count as sorting criteria
final_recommendations = filtered_df1.iloc[recommended_indices].copy()
final_recommendations = final_recommendations.sort_values(
    by=['vote_count', 'popularity'], ascending=[False, False]
)

# Get top 10 recommended movies
recommended_titles = final_recommendations['title'].head(10).tolist()
print("Recommended Movies:")
print(recommended_titles)


Recommended Movies:
['Inception', 'The Dark Knight', 'The Avengers', 'Deadpool', 'Interstellar', 'Django Unchained', 'Guardians of the Galaxy', 'The Hunger Games', 'Mad Max: Fury Road', 'Fight Club']


## Movie Recommendation System with Similarity Grouping and Randomization
This advanced movie recommendation system builds on cosine similarity and introduces a unique grouping mechanism to diversify recommendations
This method ensures a balance between relevance (movies similar to the target) and variety (randomization within similarity groups). Finally, it recommends the top 10 movies by extracting titles from the dataset. This approach is particularly useful for generating fresh and varied recommendations for users.








In [22]:
# Calculate similarity scores
target_movie_idx = 0  # Example: Index of the target movie
filtered_indices = filtered_df.index.tolist()
filtered_tfidf_matrix = tfidf_matrix[filtered_indices]
cosine_sim_filtered = cosine_similarity(filtered_tfidf_matrix)

# Group and randomize similar movies
TOLERANCE = 0.05
similar_movies_filtered = list(enumerate(cosine_sim_filtered[target_movie_idx]))
similar_movies_filtered_sorted = sorted(similar_movies_filtered, key=lambda x: x[1], reverse=True)

# Group movies with similar scores
similar_groups_filtered = {}
for idx, score in similar_movies_filtered_sorted:
    rounded_score = np.round(score / TOLERANCE) * TOLERANCE
    if rounded_score not in similar_groups_filtered:
        similar_groups_filtered[rounded_score] = []
    similar_groups_filtered[rounded_score].append(idx)

# Shuffle within groups
for group in similar_groups_filtered.values():
    random.shuffle(group)

# Generate recommendations
final_recommendations = []
for score_group in sorted(similar_groups_filtered.keys(), reverse=True):
    final_recommendations.extend(similar_groups_filtered[score_group])

# Convert indices to movie titles
recommended_movies = [filtered_df.iloc[idx]["title"] for idx in final_recommendations[:10]]
print(recommended_movies)


['Avatar', 'Dragonball Evolution', 'X-Men: Days of Future Past', 'Star Trek Beyond', 'Alien', 'Aliens', 'Superman II', 'Man of Steel', 'Guardians of the Galaxy', 'Superman']


### Hybrid Movie Recommendation System Using Similarity, Popularity, and Votes
This code snippet creates a hybrid recommendation system by combining multiple factors into a weighted hybrid score
The system calculates a composite hybrid score for each movie and sorts them in descending order. The top 10 movies with the highest hybrid scores are recommended, providing a balanced mix of personalized relevance, audience appeal, and quality assurance. The final output includes movie titles and their corresponding hybrid scores.

In [23]:
# Create a hybrid score
filtered_df['similarity_score'] = cosine_sim_filtered[target_movie_idx]
filtered_df['hybrid_score'] = (
    0.6 * filtered_df['similarity_score'] + 
    0.3 * filtered_df['popularity'] + 
    0.1 * filtered_df['vote_count']
)

# Sort by hybrid score
hybrid_recommendations = filtered_df.sort_values('hybrid_score', ascending=False).head(10)
print(hybrid_recommendations[['title', 'hybrid_score']])


                       title  hybrid_score
96                 Inception   1425.547973
95              Interstellar   1304.051447
65           The Dark Knight   1256.425570
788                 Deadpool   1253.958580
0                     Avatar   1225.731273
16              The Avengers   1221.034992
94   Guardians of the Galaxy   1118.699484
127       Mad Max: Fury Road   1073.079714
287         Django Unchained   1034.559141
28            Jurassic World    991.911071


In [24]:

def jaccard_similarity(set1, set2):
    """
    Compute the Jaccard similarity between two sets.

    Args:
        set1, set2: Strings or sets to compare (e.g., genres, directors)
    """
    set1 = set(set1.split()) if isinstance(set1, str) else set()
    set2 = set(set2.split()) if isinstance(set2, str) else set()
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union != 0 else 0

# Define user preference similarity calculation
def calculate_user_preference_similarity(movie_metadata, user_profile, weights=None):
    """
    Calculate similarity between movie metadata and user profile.

    Args:
        movie_metadata: DataFrame of movie attributes (e.g., genres, directors)
        user_profile: Dictionary of user preferences (e.g., {'genres': ..., 'directors': ...})
        weights: Dictionary of weights for each attribute (e.g., {'genres': 0.5, 'directors': 0.5})
    """
    if weights is None:
        weights = {'genres_': 0.5, 'director_': 0.5}
    
    similarity = 0
    for attr, weight in weights.items():
        similarity += weight * jaccard_similarity(movie_metadata[attr], user_profile.get(attr, ""))
    return similarity

# Example User Profile
user_profile = {
    'genres': 'action comedy',
    'directors': 'steven spielberg',
}

# Assume `filtered_df` is a DataFrame with movie data
# Normalize relevant numerical columns for consistency
scaler = MinMaxScaler()
filtered_df[['similarity_score', 'popularity', 'vote_count']] = scaler.fit_transform(
    filtered_df[['similarity_score', 'popularity', 'vote_count']]
)

# Compute user preference similarity
filtered_df['user_similarity'] = filtered_df.apply(
    lambda x: calculate_user_preference_similarity(x, user_profile),
    axis=1
)

# Update hybrid score
filtered_df['hybrid_score'] = (
    0.6 * filtered_df['similarity_score'] +  # Content-based weight
    0.3 * filtered_df['popularity'] +        # Popularity-based weight
    0.1 * filtered_df['vote_count'] +        # Vote-based weight
    0.2 * filtered_df['user_similarity']     # User preference weight
)

# Recommend top 10 movies
personalized_recommendations = filtered_df.sort_values('hybrid_score', ascending=False).head(10)
print("Top 10 Personalized Recommendations:")
print(personalized_recommendations[['title', 'hybrid_score']])

Top 10 Personalized Recommendations:
                                                 title  hybrid_score
0                                               Avatar      0.735870
94                             Guardians of the Galaxy      0.404692
95                                        Interstellar      0.403907
546                                            Minions      0.366387
788                                           Deadpool      0.343068
127                                 Mad Max: Fury Road      0.312507
28                                      Jurassic World      0.303920
46                          X-Men: Days of Future Past      0.282847
14                                        Man of Steel      0.258420
199  Pirates of the Caribbean: The Curse of the Bla...      0.241425


### MovieRecommenderPersistence: Save and Load Models

This class handles saving and loading movie recommender models and their components.

#### Save Autoencoder-Based Recommender
```python
persistence.save_autoencoder_model(recommender, model_name='autoencoder_recommender')
```
- Saves the model (`autoencoder_model.h5`) and components (`df`, TF-IDF matrix, similarity matrix, etc.).

#### Save Hybrid Recommender
```python
persistence.save_hybrid_model(filtered_df, tfidf_matrix, cosine_sim, model_name='hybrid_recommender')
```
- Saves components like `filtered_df`, `tfidf_matrix`, and `cosine_sim`.

#### Load Autoencoder-Based Recommender
```python
recommender = persistence.load_autoencoder_model(model_path='path/to/model')
```
- Loads the saved autoencoder model and its components into a `MovieRecommender` instance.

#### Load Hybrid Recommender
```python
filtered_df, tfidf_matrix, cosine_sim = persistence.load_hybrid_model(model_path='path/to/model')
```
- Loads the saved components for a hybrid recommender.
```

In [25]:


class MovieRecommenderPersistence:
    """
    Handles saving and loading of different movie recommender models and their associated data.
    """

    def __init__(self, models_dir='saved_models'):
        """
        Initialize the persistence manager.

        Args:
            models_dir (str): Directory to store saved models
        """
        self.models_dir = models_dir
        os.makedirs(models_dir, exist_ok=True)

    def save_autoencoder_model(self, recommender, model_name='autoencoder_recommender.h5'):
        """
        Save autoencoder-based recommender model and its components.

        Args:
            recommender: MovieRecommender instance
            model_name: Name for the saved model
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        save_path = os.path.join(self.models_dir, f'{model_name}_{timestamp}')
        os.makedirs(save_path, exist_ok=True)

        # Save keras model
        recommender.model.model.save(os.path.join(save_path, 'autoencoder_model.h5'))  # Added .h5 extension

        # Save other components
        components = {
            'df': recommender.df,
            'tfidf_matrix': recommender.tfidf_matrix,
            'similarity_matrix': recommender.similarity_matrix,
            'available_columns': recommender.available_columns
        }

        with open(os.path.join(save_path, 'components.pkl'), 'wb') as f:
            pickle.dump(components, f)

        print(f"Model saved at: {save_path}")
        return save_path

    def save_hybrid_model(self, filtered_df, tfidf_matrix, cosine_sim, model_name='hybrid_recommender'):
        """
        Save hybrid recommendation model components.

        Args:
            filtered_df: DataFrame with movie data and scores
            tfidf_matrix: TF-IDF matrix
            cosine_sim: Cosine similarity matrix
            model_name: Name for the saved model
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        save_path = os.path.join(self.models_dir, f'{model_name}_{timestamp}')
        os.makedirs(save_path, exist_ok=True)

        components = {
            'filtered_df': filtered_df,
            'tfidf_matrix': tfidf_matrix,
            'cosine_sim': cosine_sim
        }

        with open(os.path.join(save_path, 'components.pkl'), 'wb') as f:
            pickle.dump(components, f)

        print(f"Model saved at: {save_path}")
        return save_path

    def load_autoencoder_model(self, model_path):
        """
        Load autoencoder-based recommender model.

        Args:
            model_path: Path to the saved model directory

        Returns:
            MovieRecommender instance
        """
        from tensorflow.keras.models import load_model

        # Load Keras model
        keras_model = load_model(os.path.join(model_path, 'autoencoder_model.h5'))

        # Load other components
        with open(os.path.join(model_path, 'components.pkl'), 'rb') as f:
            components = pickle.load(f)

        # Recreate MovieRecommender instance
        recommender = MovieRecommender(
            filtered_df1=components['df'],
            tfidf_matrix=components['tfidf_matrix']
        )
        recommender.model = keras_model
        recommender.similarity_matrix = components['similarity_matrix']
        recommender.available_columns = components['available_columns']

        return recommender

    def load_hybrid_model(self, model_path):
        """
        Load hybrid recommendation model components.

        Args:
            model_path: Path to the saved model directory

        Returns:
            tuple: (filtered_df, tfidf_matrix, cosine_sim)
        """
        with open(os.path.join(model_path, 'components.pkl'), 'rb') as f:
            components = pickle.load(f)

        return (
            components['filtered_df'],
            components['tfidf_matrix'],
            components['cosine_sim']
        )


In [26]:
# Import required libraries



# First create and train your recommender
recommender = MovieRecommender(filtered_df1=filtered_df1, tfidf_matrix=tfidf_matrix)
recommender.train()

# Save the internal Keras model (the autoencoder)
recommender.model.model.save('autoencoder_model.keras')

# Save other components using the persistence manager
persistence = MovieRecommenderPersistence()
save_path = persistence.save_autoencoder_model(recommender)
print(f"Complete model saved at: {save_path}")



Model saved at: saved_models\autoencoder_recommender.h5_20250102_125114
Complete model saved at: saved_models\autoencoder_recommender.h5_20250102_125114


In [27]:
# Save the model
persistence = MovieRecommenderPersistence()
save_path = persistence.save_hybrid_model(filtered_df, tfidf_matrix, cosine_sim)

# Load the model later
loaded_df, loaded_tfidf, loaded_cosine_sim = persistence.load_hybrid_model(save_path)

Model saved at: saved_models\hybrid_recommender_20250102_125115
