# Movie Recommendation Project

# Import Libraries

In [17]:
import os
from ast import literal_eval # For safely converting stringified lists

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# Location of the movie dataset (parquet). The default is the path used on the
# original author's machine; set the CBF_MOVIE_PARQUET environment variable to
# point at the file elsewhere instead of editing this cell.
DATASET_PATH = os.environ.get(
    "CBF_MOVIE_PARQUET",
    "D:/Github_ThisPC/TMDB_RecoFlow/airflow/data/cbf_movie.parquet",
)

df = pd.read_parquet(DATASET_PATH)
df.head(2)

Unnamed: 0,movie_id,title,genres,keywords,overview
0,223195,The Making of a Legend: Gone with the Wind,documentary,"cinemahistory, makingof, moviebusiness, behind...",This documentary revisits the making of Gone w...
1,49343,Dream Demon,horror,"london, dreamdemon, bride-to-be, mirror, dream...",As her marriage to decorated war hero Oliver d...


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135886 entries, 0 to 135885
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   movie_id  135886 non-null  int64 
 1   title     135886 non-null  object
 2   genres    135886 non-null  object
 3   keywords  135886 non-null  object
 4   overview  135886 non-null  object
dtypes: int64(1), object(4)
memory usage: 5.2+ MB


In [20]:
df.shape

(135886, 5)

In [21]:
df.columns

Index(['movie_id', 'title', 'genres', 'keywords', 'overview'], dtype='object')

# Data Preprocessing

In [22]:
df.isnull().sum()

movie_id    0
title       0
genres      0
keywords    0
overview    0
dtype: int64

In [23]:
# Count the number of duplicate rows
df.duplicated().sum()

np.int64(0)

Download necessary NLTK data
The next cell makes sure the NLTK resources needed for text processing are available: the English stop-word list and the WordNet corpus used by the lemmatizer. Each resource is downloaded only if it is not already installed locally.

In [24]:
# Make sure the NLTK data used by this notebook is present (downloads happen
# only when a resource is missing, so this is a one-off cost per machine).
def _ensure_nltk_resource(probe, package):
    """Call ``probe``; if it raises LookupError, fetch ``package`` with nltk.download."""
    try:
        probe()
    except LookupError:
        nltk.download(package)


# Each probe touches one resource; the checks run in the same order as before.
_ensure_nltk_resource(lambda: stopwords.words('english'), 'stopwords')
_ensure_nltk_resource(lambda: WordNetLemmatizer().lemmatize('test'), 'wordnet')
# Additional explicit lookup of the zipped WordNet corpus.
_ensure_nltk_resource(lambda: nltk.data.find('corpora/wordnet.zip'), 'wordnet')

In [25]:
# Shared resources for the text-preprocessing helper functions
stop_words = set(stopwords.words('english'))  # English stop words, held in a set for fast membership tests
lemmatizer = WordNetLemmatizer()  # WordNet lemmatizer (reduces words to their lemma; this is lemmatization, not stemming)

Text Preprocessing Functions

In [26]:
def remove_stopwords(text):
    """Return ``text`` without its stop words.

    The text is split on whitespace; a token is dropped when its lower-cased
    form is in the module-level ``stop_words`` set. Surviving tokens keep
    their original casing and are re-joined with single spaces.
    Anything that is not a ``str`` yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    kept_tokens = (token for token in text.split() if token.lower() not in stop_words)
    return " ".join(kept_tokens)

def lemmatize_text(text):
    """Return ``text`` with each whitespace-separated token lemmatized.

    Every token is passed to the module-level ``lemmatizer`` and the results
    are re-joined with single spaces. Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    lemmas = (lemmatizer.lemmatize(token) for token in text.split())
    return " ".join(lemmas)

In [27]:
def jaccard_similarity(set1, set2):
    """Calculate the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Args:
        set1: Any iterable of hashable items (a set, list, tuple, ...).
        set2: Any iterable of hashable items.

    Returns:
        float: A value in [0.0, 1.0]. Two empty collections give 0.0 so the
        division by an empty union is avoided.
    """
    # Coerce to sets so lists/tuples work and duplicates do not affect the result.
    first, second = set(set1), set(set2)
    union_size = len(first | second)
    if union_size == 0:
        return 0.0
    return len(first & second) / union_size

The safe_literal_eval function safely evaluates strings that may represent data structures like lists, tuples, or dictionaries. If the value is a string, it attempts to parse it using literal_eval, and if that fails, it splits the string by commas into a list. If the value is None or NaN, it returns an empty list. For other types of input, the function simply returns the value as-is. The function is applied to the keywords and genres columns of a DataFrame, ensuring that string representations of lists or similar structures are properly converted into Python objects.

In [28]:
from ast import literal_eval
import numpy as np

def safe_literal_eval(value):
    """Convert a stringified collection into a Python object without using eval.

    Args:
        value: A cell value. Strings are parsed; None and NaN become an empty
            list; anything else is passed through untouched.

    Returns:
        - For a string that is a literal list, tuple, set or dict: that object.
        - For any other string: the string split on commas (items are not
          stripped), or ``[]`` for the empty string. This also covers strings
          that happen to be valid scalar literals such as "2001" or "None",
          which would otherwise come back as an int / None instead of a
          one-item list.
        - For None or float NaN: ``[]``.
        - Otherwise: ``value`` unchanged.
    """
    if isinstance(value, str):
        try:
            parsed = literal_eval(value)
        except (ValueError, SyntaxError, TypeError):
            # TypeError covers literals such as "{[1]: 2}" (unhashable dict key).
            parsed = None
        # Only container literals are accepted; scalars fall through to the split.
        if isinstance(parsed, (list, tuple, set, dict)):
            return parsed
        return value.split(',') if value else []
    if value is None:
        return []
    if isinstance(value, float) and np.isnan(value):
        return []
    return value

# Parse the raw keyword / genre cells into Python objects, one column at a time
features = ['keywords', 'genres']
for column_name in features:
    df[column_name] = df[column_name].map(safe_literal_eval)

The following cells first normalise the keywords and genres (lower-cased, with spaces removed). The data is then limited to the first 25,000 rows for efficiency, because of the RAM needed by the pairwise similarity matrices, and missing values in the overview and keywords columns are filled with empty strings. Next, TF-IDF vectorization is applied to the overviews and CountVectorizer to the keywords (after each keyword list has been joined into a single string). Cosine similarity is computed for both overviews and keywords to measure the similarity between movies. This setup allows a recommendation system to compare movies based on either their content or their keywords.

In [29]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    """Normalise a list of tokens: remove every space and lower-case each item.

    Args:
        x: A cell value, expected to be a list of tokens.

    Returns:
        list[str]: The cleaned tokens when ``x`` is a list. Non-string items
        are converted with ``str()`` rather than raising AttributeError, and
        ``None`` items are skipped.
        str: The empty string ``''`` when ``x`` is not a list.
    """
    if not isinstance(x, list):
        return ''
    return [str(item).replace(" ", "").lower() for item in x if item is not None]
        

# Normalise the tokens of each list-valued feature column with clean_data
features = ['keywords', 'genres']

for column_name in features:
    df[column_name] = df[column_name].map(clean_data)

In [30]:
# ==== Testing aid only ====
# Titles that are kept in the working set regardless of the row cut-off below
# (presumably so the example query further down has known neighbours to find).
recommended_titles = [
    "The Dark Knight Rises",
    "The Dark Knight",
    "Batman Begins",
    "Batman: The Long Halloween, Part One",
    "Batman: The Long Halloween, Part Two",
    "Batman vs. Two-Face",
    "The Siege",
    "The Batman",
    "Class of 1984",
    "The Negotiator",
    "Double Impact"
]
# Select the pinned titles from the frame BEFORE it is truncated.
df_recommendations = df[df["title"].isin(recommended_titles)]
df = df[:25000] # Keep only the first 25,000 rows
# Add the pinned rows back; rows already inside the slice are de-duplicated by movie_id.
# NOTE(review): this cell rebinds `df`, so re-running it without re-running the
# load/cleaning cells starts from the already-truncated frame.
df = pd.concat([df_recommendations, df]).drop_duplicates(subset=["movie_id"])

# Import necessary libraries
# NOTE(review): these names are already imported in the first cell of the notebook.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Assuming you already have 'df' as your DataFrame
# Handle missing values in overview and keywords columns
df['overview'] = df['overview'].fillna('')  # Replace NaN in overview
df['keywords'] = df['keywords'].fillna('')  # Replace NaN in keywords

# Sort by title and renumber rows from 0 so that positions in the lists and
# matrices built below correspond to DataFrame row positions.
df = df.sort_values("title").reset_index(drop=True)

# Feature Extraction
# Create TF-IDF matrix for movie overviews (English stop words removed)
tfidf_overview = TfidfVectorizer(stop_words='english')
tfidf_overview_matrix = tfidf_overview.fit_transform(df['overview'])

# Convert keywords into strings (if they are lists) and create a token-count
# (bag-of-words) matrix for them; note this uses CountVectorizer, not TF-IDF.
# List values are joined with spaces; non-list values are passed through as-is.
keywords_ = [' '.join(keywords) if isinstance(keywords, list) else keywords for keywords in df['keywords']]
vector = CountVectorizer(stop_words='english')
vector_keywords_matrix = vector.fit_transform(keywords_)

# Extract genres, titles and ids as plain lists; they are indexed by row position
# in the same order as the rows of the two matrices above.
movie_genres = df['genres'].tolist()
movie_titles = df['title'].tolist()
movie_ids = df['movie_id'].tolist()

# Calculate pairwise cosine similarity (every row against every row) for both
# overview and keywords.
# NOTE(review): cosine_similarity returns a dense n x n array by default, so
# memory grows quadratically with the row count; presumably this is the RAM
# issue behind the 25,000-row cut-off above.
cosine_sim_overview = cosine_similarity(tfidf_overview_matrix)
cosine_sim_keywords = cosine_similarity(vector_keywords_matrix)

In [31]:
def combined_similarity(movie1_index, movie2_index, genre_weight=0.2, overview_weight=0.5, keyword_weight=0.3):
    """
    Weighted blend of three pairwise similarity signals for two movies.

    The movies are identified by their row positions. The score is
    genre_weight * Jaccard similarity of their genre collections
    + overview_weight * precomputed overview cosine similarity
    + keyword_weight * precomputed keyword cosine similarity.
    """
    pair = (movie1_index, movie2_index)

    # Genres: set overlap between the two movies' genre collections.
    genres_a = set(movie_genres[movie1_index])
    genres_b = set(movie_genres[movie2_index])
    genre_term = genre_weight * jaccard_similarity(genres_a, genres_b)

    # Overview and keywords: look the pair up in the precomputed matrices.
    overview_term = overview_weight * cosine_sim_overview[pair]
    keyword_term = keyword_weight * cosine_sim_keywords[pair]

    return genre_term + overview_term + keyword_term


def _print_ranked_recommendations(movie_index, top_n, header):
    """Score every other movie against ``movie_index`` and print the best ``top_n``.

    Shared by get_recommendations_by_title and get_recommendations_by_id so the
    ranking and the output format stay identical for both entry points.
    """
    scored = [
        (other_index, combined_similarity(movie_index, other_index))
        for other_index in range(len(movie_titles))
        if other_index != movie_index
    ]
    # list.sort is stable, so movies with equal scores keep their dataset order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # A negative top_n would slice from the end of the list (printing almost
    # everything); treat it as "no recommendations" instead.
    limit = None if top_n is None else max(top_n, 0)

    print(header)
    for rank, (idx, sim) in enumerate(scored[:limit], 1):
        print(f"{rank}. Movie: {movie_titles[idx]} (ID: {movie_ids[idx]}), Similarity: {sim:.4f}")


def get_recommendations_by_title(movie_title, top_n=10):
    """
    Print the top N recommendations for a movie title, ranked by combined similarity.

    Args:
        movie_title: Exact title to look up (case-sensitive match).
        top_n: Number of recommendations to print; values below 0 print none.

    Returns:
        None when recommendations were printed, otherwise an error message
        string (non-string input, or title not in the dataset).
        If the title occurs more than once, a warning is printed and the first
        occurrence is used.
    """
    # Check if the input is a string
    if not isinstance(movie_title, str):
        return "Error: Movie title must be a string."

    # Positions of every row carrying exactly this title
    matching_indices = [i for i, title in enumerate(movie_titles) if title == movie_title]

    if not matching_indices:
        return f"Error: '{movie_title}' not found in the dataset."

    if len(matching_indices) > 1:
        print(f"⚠️ Warning: The title '{movie_title}' appears {len(matching_indices)} times. Using the first occurrence.")

    movie_index = matching_indices[0]

    _print_ranked_recommendations(
        movie_index,
        top_n,
        f"\nRecommendations for '{movie_title}' (index: {movie_index}):",
    )


def get_recommendations_by_id(input_id, top_n=10):
    """
    Print the top N recommendations for a movie ID, ranked by combined similarity.

    Args:
        input_id: Movie ID; anything ``int()`` accepts (e.g. 155 or "155").
        top_n: Number of recommendations to print; values below 0 print none.

    Returns:
        None when recommendations were printed, otherwise an error message
        string (ID not convertible to int, or ID not in the dataset).
    """
    try:
        # Try to convert the input to integer
        movie_id = int(input_id)
    except (ValueError, TypeError):
        return "Error: Movie ID must be an integer."

    try:
        movie_index = movie_ids.index(movie_id)
    except ValueError:
        return f"Error: Movie ID {movie_id} not found in the dataset."

    _print_ranked_recommendations(
        movie_index,
        top_n,
        f"\nRecommendations for Movie ID {movie_id} ({movie_titles[movie_index]}):",
    )

In [32]:
# Example Usage
recommendations = get_recommendations_by_title('The Dark Knight Rises')  # Get recommendations for 'The Dark Knight Rises'
print(recommendations)

recommendations = get_recommendations_by_title('Non Existent Movie')  # Test with a movie not in the dataset
print(recommendations)

recommendations = get_recommendations_by_title(12345)  # Test with a non-string input
print(recommendations)


Recommendations for 'The Dark Knight Rises' (index: 19065):
1. Movie: The Dark Knight (ID: 155), Similarity: 0.4503
2. Movie: Batman Begins (ID: 272), Similarity: 0.3405
3. Movie: Batman: The Long Halloween, Part One (ID: 736073), Similarity: 0.3232
4. Movie: Batman: The Long Halloween, Part Two (ID: 736074), Similarity: 0.3024
5. Movie: Batman vs. Two-Face (ID: 464882), Similarity: 0.2858
6. Movie: The Batman (ID: 414906), Similarity: 0.2651
7. Movie: The Siege (ID: 9882), Similarity: 0.2562
8. Movie: Class of 1984 (ID: 11564), Similarity: 0.2443
9. Movie: The Bullet Train (ID: 47634), Similarity: 0.2397
10. Movie: The Negotiator (ID: 9631), Similarity: 0.2391
None
Error: 'Non Existent Movie' not found in the dataset.
Error: Movie title must be a string.
