# Movie Recommendation Project

# Import Libraries

In [15]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.neighbors import NearestNeighbors
from ast import literal_eval # For safely converting stringified lists

In [16]:
# NOTE(review): absolute, machine-specific path — the notebook only runs where this
# exact file location exists; consider a configurable data directory.
DATASET_PATH  = "D:/Github_ThisPC/TMDB_RecoFlow/airflow/data/cbf_movie.csv"

# Load the movie table into `df` and preview its first two rows.
df = pd.read_csv(DATASET_PATH)
df.head(2)

Unnamed: 0,movie_id,title,genres,keywords,overview
0,353021,The Lankworm,drama,"womandirector, condom",Film about a couple of which the man is confro...
1,325023,The Datcha,adventure,womandirector,Jordy is an eleven year old boy who's family h...


In [17]:
# Summarise the columns: names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135886 entries, 0 to 135885
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   movie_id  135886 non-null  int64 
 1   title     135886 non-null  object
 2   genres    135886 non-null  object
 3   keywords  135886 non-null  object
 4   overview  135886 non-null  object
dtypes: int64(1), object(4)
memory usage: 5.2+ MB


In [18]:
# (rows, columns) of the loaded table.
df.shape

(135886, 5)

In [19]:
# Column labels available for feature extraction.
df.columns

Index(['movie_id', 'title', 'genres', 'keywords', 'overview'], dtype='object')

# Data Preprocessing

In [20]:
# Number of missing values per column.
df.isnull().sum()

movie_id    0
title       0
genres      0
keywords    0
overview    0
dtype: int64

In [21]:
# Count rows that are exact duplicates (all columns equal) of an earlier row.
df.duplicated().sum()

np.int64(0)

Download necessary NLTK data
The code ensures that all required resources from the NLTK library are available for text processing. This is often necessary when working with stopwords or lemmatization. The resources are downloaded only if they are not already available.

In [22]:
# Make sure the NLTK resources used by this notebook are installed.
# Each entry pairs a probe with the NLTK package to download when that probe
# raises LookupError. The probes run in order; the last one looks specifically
# for the zipped WordNet corpus.
_nltk_requirements = (
    (lambda: stopwords.words('english'), 'stopwords'),
    (lambda: WordNetLemmatizer().lemmatize('test'), 'wordnet'),
    (lambda: nltk.data.find('corpora/wordnet.zip'), 'wordnet'),
)

for _probe, _package in _nltk_requirements:
    try:
        _probe()
    except LookupError:
        nltk.download(_package)

In [23]:
# Shared text-preprocessing resources (read as globals by the helper functions).
stop_words = set(stopwords.words('english'))  # English stop words, as a set for fast membership tests
lemmatizer = WordNetLemmatizer()  # WordNet lemmatizer (lemmatization, not stemming)

Text Preprocessing Functions

In [24]:
def remove_stopwords(text):
    """Return ``text`` with stop words dropped.

    The text is split on whitespace and each token is compared, lower-cased,
    against the module-level ``stop_words`` set. Surviving tokens keep their
    original casing and are re-joined with single spaces.
    Anything that is not a ``str`` yields an empty string.
    """
    if not isinstance(text, str):
        return ""

    kept_tokens = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

def lemmatize_text(text):
    """Return ``text`` with every whitespace-separated token lemmatized.

    Each token is passed through the module-level ``lemmatizer`` and the
    results are re-joined with single spaces.
    Anything that is not a ``str`` yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    return " ".join(lemmatizer.lemmatize(token) for token in text.split())

In [25]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard index |set1 ∩ set2| / |set1 ∪ set2| of two sets.

    When both sets are empty the union is empty too; 0 is returned in that
    case instead of dividing by zero.
    """
    shared_count = len(set1.intersection(set2))
    total_count = len(set1.union(set2))
    if total_count == 0:
        return 0
    return shared_count / total_count

The safe_literal_eval function safely evaluates strings that may represent data structures like lists, tuples, or dictionaries. If the value is a string, it attempts to parse it using literal_eval, and if that fails, it splits the string by commas into a list. If the value is None or NaN, it returns an empty list. For other types of input, the function simply returns the value as-is. The function is applied to the keywords and genres columns of a DataFrame, ensuring that string representations of lists or similar structures are properly converted into Python objects.

In [26]:
from ast import literal_eval
import numpy as np

def safe_literal_eval(value):
    """Convert a cell value into a list of items without ever raising.

    Behaviour by input type:

    * ``None`` or a float NaN -> ``[]``.
    * A string holding a Python list/tuple/set literal -> that container as a list.
    * A string holding a dict literal -> the dict, unchanged.
    * Any other string (plain text such as ``"drama, comedy"``, or a literal that
      evaluates to a scalar such as ``"1984"``) -> split on commas, with each
      item stripped of surrounding whitespace and empty items dropped.
    * Any non-string value (e.g. an already-parsed list) -> returned as-is, so
      applying the function twice is harmless.
    """
    if value is None:
        return []
    if isinstance(value, float) and np.isnan(value):
        return []
    if not isinstance(value, str):
        return value

    text = value
    try:
        parsed = literal_eval(value)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (the common case for plain "a, b" text);
        # literal_eval documents all of these for malformed input.
        pass
    else:
        if isinstance(parsed, (list, tuple, set)):
            return list(parsed)
        if isinstance(parsed, dict):
            return parsed
        if isinstance(parsed, str):
            # A quoted string literal: split its content, not the quoted source.
            text = parsed
        # Other scalars (numbers, booleans, None, bytes) fall through and are
        # treated as plain text so callers always receive a list of strings.

    # Strip each item so "a, b" yields ['a', 'b'] rather than ['a', ' b'];
    # otherwise identical tokens would not compare equal in set-based matching.
    return [item.strip() for item in text.split(',') if item.strip()]

# Apply the function to the specified columns
features = ['keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(safe_literal_eval)

In [27]:
# # Function to convert all strings to lower case and strip names of spaces
# def clean_data(x):
#     if isinstance(x, list):
#         return [str.lower(i.replace(" ", "")) for i in x]
#     else:
#         return ''
    
# # Apply clean_data function to your features.
# features = ['keywords', 'genres']

# for feature in features:
#     df[feature] = df[feature].apply(clean_data)

1. Replaced cosine_similarity(...) with NearestNeighbors

    cosine_similarity(tfidf_overview_matrix) \
    This computes a full pairwise similarity matrix for all movies. \
    If you had 60,000 movies, that’s 60,000 × 60,000 = 3.6 billion float64 values (8 bytes each) → ~28.8 GB in memory → 💥 MemoryError 

    Instead of calculating similarity between every pair, NearestNeighbors only finds: \
    The K most similar movies for a given movie.

2. Introduced get_similarity_from_knn(...)
    Since we no longer have access to [i, j] entries in a cosine matrix, I added this helper:
    ```
    def get_similarity_from_knn(model, matrix, idx1, idx2):
        distances, indices = model.kneighbors(matrix[idx1], n_neighbors=50)
        if idx2 in indices[0]:
            position = list(indices[0]).index(idx2)
            return 1 - distances[0][position]
        return 0.0
    ```

    This:
    Finds top-50 most similar movies to idx1 \
    Checks if idx2 is among them \
    Returns the similarity value (1 - cosine distance), or 0.0 if idx2 is not among them \
    ✅ This lets you still compute pairwise similarity on demand, but only for the movies you're actually comparing (not the full matrix).

3. Updated combined_similarity() to Use This Helper
    Instead of accessing: \
    cosine_sim_overview[i, j]

    I now use: \
    overview_sim = get_similarity_from_knn(overview_nn, tfidf_overview_matrix, i, j) \
    So your combined scoring logic still works the same — it just pulls similarities differently now.





In [28]:
# # Import necessary libraries
# from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# import pandas as pd

# # Assuming you already have 'df' as your DataFrame
# # Handle missing values in overview and keywords columns
# df['overview'] = df['overview'].fillna('')  # Replace NaN in overview
# df['keywords'] = df['keywords'].fillna('')  # Replace NaN in keywords

# # Feature Extraction
# # Create TF-IDF matrix for movie overviews
# tfidf_overview = TfidfVectorizer(stop_words='english')
# tfidf_overview_matrix = tfidf_overview.fit_transform(df['overview'])

# # Convert keywords into strings (if they are lists) and create a TF-IDF matrix for keywords
# # Ensure keywords are properly joined into a string for vectorization
# keywords_ = [' '.join(keywords) if isinstance(keywords, list) else keywords for keywords in df['keywords']]
# vector = CountVectorizer(stop_words='english')
# vector_keywords_matrix = vector.fit_transform(keywords_)

# # Extract genres and titles as lists for similarity computation
# movie_genres = df['genres'].tolist()
# movie_titles = df['title'].tolist()
# movie_ids = df['movie_id'].tolist()

# # Calculate cosine similarity for both overview and keywords
# cosine_sim_overview = cosine_similarity(tfidf_overview_matrix)
# cosine_sim_keywords = cosine_similarity(vector_keywords_matrix)

### MEMORY ERROR

In [None]:
# Replace any missing overview/keywords entries with an empty string before vectorizing.
df['overview'] = df['overview'].fillna('')
df['keywords'] = df['keywords'].fillna('')

# TF-IDF representation of the overview text (English stop words removed).
tfidf_overview = TfidfVectorizer(stop_words='english')
tfidf_overview_matrix = tfidf_overview.fit_transform(df['overview'])

# Keyword lists are joined into one space-separated string per movie; non-list values are passed through.
# Despite its name, `vector_keywords_matrix` is also a TF-IDF matrix.
keywords_ = [' '.join(kw) if isinstance(kw, list) else kw for kw in df['keywords']]
tfidf_keywords = TfidfVectorizer(stop_words='english')
vector_keywords_matrix = tfidf_keywords.fit_transform(keywords_)

# Brute-force cosine nearest-neighbour index over the overview vectors,
# then the 100 nearest neighbours of every movie.
# NOTE(review): `overview_distances` / `overview_indices` are not referenced in the
# visible part of the notebook — confirm they are needed before paying for this query.
overview_nn = NearestNeighbors(n_neighbors=100, metric='cosine', algorithm='brute')
overview_nn.fit(tfidf_overview_matrix)
overview_distances, overview_indices = overview_nn.kneighbors(tfidf_overview_matrix)

# Same index and bulk query for the keyword vectors.
# NOTE(review): `keyword_distances` / `keyword_indices` likewise appear unused here.
keyword_nn = NearestNeighbors(n_neighbors=100, metric='cosine', algorithm='brute')
keyword_nn.fit(vector_keywords_matrix)
keyword_distances, keyword_indices = keyword_nn.kneighbors(vector_keywords_matrix)

# Plain Python lists for positional lookups; position i corresponds to row i of the matrices above.
movie_genres = df['genres'].tolist()
movie_titles = df['title'].tolist()
movie_ids = df['movie_id'].tolist()

In [30]:
def _neighbor_similarities(model, matrix, idx, n_neighbors=50):
    """Run one neighbour query for row ``idx`` and map neighbour index -> similarity (1 - distance)."""
    # Asking for more neighbours than the model was fitted on makes kneighbors
    # raise, so cap the request on small datasets. Models without the
    # attribute (e.g. test doubles) are queried as requested.
    n_fitted = getattr(model, "n_samples_fit_", None)
    if n_fitted is not None:
        n_neighbors = min(n_neighbors, n_fitted)

    distances, indices = model.kneighbors(matrix[idx], n_neighbors=n_neighbors)
    similarities = {}
    for neighbor, distance in zip(indices[0], distances[0]):
        # setdefault keeps the first occurrence should an index ever repeat.
        similarities.setdefault(int(neighbor), 1 - distance)
    return similarities


def _weighted_score(genre_sim, overview_sim, keyword_sim, genre_weight, overview_weight, keyword_weight):
    """Weighted sum of the three per-feature similarities."""
    return (genre_weight * genre_sim) + (overview_weight * overview_sim) + (keyword_weight * keyword_sim)


def get_similarity_from_knn(model, matrix, idx1, idx2, n_neighbors=50):
    """Similarity between rows ``idx1`` and ``idx2`` according to a fitted neighbour model.

    Args:
        model: fitted object exposing ``kneighbors(row, n_neighbors=...)``.
        matrix: feature matrix the model is queried with; ``matrix[idx1]`` is the query row.
        idx1: row index of the query item.
        idx2: row index of the item to compare against.
        n_neighbors: how many nearest neighbours of ``idx1`` to inspect (default 50).

    Returns:
        ``1 - distance`` when ``idx2`` is among the inspected neighbours of ``idx1``,
        otherwise ``0.0``. The result is therefore not symmetric in general.
    """
    return _neighbor_similarities(model, matrix, idx1, n_neighbors).get(idx2, 0.0)


def combined_similarity(movie1_index, movie2_index, genre_weight=0.2, overview_weight=0.5, keyword_weight=0.3):
    """Weighted similarity of two movies from genre overlap, overview text and keywords.

    Genre similarity is the Jaccard index of the two genre collections; overview and
    keyword similarities come from the module-level neighbour models and are 0.0
    when the second movie is not a near neighbour of the first.
    """
    genre_sim = jaccard_similarity(set(movie_genres[movie1_index]), set(movie_genres[movie2_index]))
    overview_sim = get_similarity_from_knn(overview_nn, tfidf_overview_matrix, movie1_index, movie2_index)
    keyword_sim = get_similarity_from_knn(keyword_nn, vector_keywords_matrix, movie1_index, movie2_index)
    return _weighted_score(genre_sim, overview_sim, keyword_sim, genre_weight, overview_weight, keyword_weight)


def _rank_similar_movies(movie_index, top_n, genre_weight=0.2, overview_weight=0.5, keyword_weight=0.3):
    """Return the ``top_n`` ``(index, score)`` pairs most similar to ``movie_index``.

    Produces the same scores and ordering as calling ``combined_similarity`` for
    every other movie, but queries each neighbour model once instead of once per
    candidate. The default weights mirror those of ``combined_similarity``.
    """
    overview_sims = _neighbor_similarities(overview_nn, tfidf_overview_matrix, movie_index)
    keyword_sims = _neighbor_similarities(keyword_nn, vector_keywords_matrix, movie_index)
    query_genres = set(movie_genres[movie_index])

    scored = [
        (
            i,
            _weighted_score(
                jaccard_similarity(query_genres, set(movie_genres[i])),
                overview_sims.get(i, 0.0),
                keyword_sims.get(i, 0.0),
                genre_weight,
                overview_weight,
                keyword_weight,
            ),
        )
        for i in range(len(movie_titles))
        if i != movie_index
    ]
    # Stable sort: ties keep dataset order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:top_n]


def _print_recommendations(header, ranked):
    """Print ``header`` followed by one numbered line per ``(index, score)`` pair."""
    print(header)
    for rank, (idx, sim) in enumerate(ranked, 1):
        print(f"{rank}. Movie: {movie_titles[idx]} (ID: {movie_ids[idx]}), Similarity: {sim:.4f}")


def get_recommendations_by_title(movie_title, top_n=10):
    """Print the ``top_n`` movies most similar to the movie with this exact title.

    Returns an error string when ``movie_title`` is not a string or is not in the
    dataset; otherwise prints the recommendations and returns ``None``. When the
    title occurs more than once, a warning is printed and the first occurrence is used.
    """
    if not isinstance(movie_title, str):
        return "Error: Movie title must be a string."

    matching_indices = [i for i, title in enumerate(movie_titles) if title == movie_title]
    if not matching_indices:
        return f"Error: '{movie_title}' not found in the dataset."
    if len(matching_indices) > 1:
        print(f"⚠️ Warning: The title '{movie_title}' appears {len(matching_indices)} times. Using the first occurrence.")

    movie_index = matching_indices[0]
    _print_recommendations(
        f"\nRecommendations for '{movie_title}' (index: {movie_index}):",
        _rank_similar_movies(movie_index, top_n),
    )


def get_recommendations_by_id(input_id, top_n=10):
    """Print the ``top_n`` movies most similar to the movie with this ID.

    ``input_id`` is converted with ``int()``. Returns an error string when the
    conversion fails or the ID is not in the dataset; otherwise prints the
    recommendations and returns ``None``.
    """
    try:
        movie_id = int(input_id)
    except (ValueError, TypeError):
        return "Error: Movie ID must be an integer."

    try:
        movie_index = movie_ids.index(movie_id)
    except ValueError:
        return f"Error: Movie ID {movie_id} not found in the dataset."

    _print_recommendations(
        f"\nRecommendations for Movie ID {movie_id} ({movie_titles[movie_index]}):",
        _rank_similar_movies(movie_index, top_n),
    )