In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import kagglehub
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [None]:
# Read the TMDB movies CSV from the Kaggle input path and preview the first rows.
TMDB_CSV_PATH = "/kaggle/input/tmdb-movies-dataset-2023-930k-movies/TMDB_movie_dataset_v11.csv"
df = pd.read_csv(TMDB_CSV_PATH)
df.head()

In [None]:
# Tally the missing values per column of the loaded frame and print the result.
null_counts = df.isna().sum()
print(null_counts)

In [None]:

# Drop rows that are missing a title or an overview; the overview is the text
# that is cleaned and embedded further down, so such rows are unusable.
df = df.dropna(subset=['title', 'overview'])

# Keep only rows whose 'original_language' is English ('en').
df_clean = df[df['original_language'] == 'en']

# Sanity check: after the filter every remaining row should be 'en', so the
# "other" and "null" counts below are expected to be 0.
en = df_clean['original_language'] == 'en'
other = df_clean['original_language'] != 'en'

print("english movies:", en.sum())
print("other language movies:", other.sum())
null_counts_lang = df_clean['original_language'].isnull().sum()
print("null language values:", null_counts_lang)

In [None]:
# Display the column names of the English-only frame.
df_clean.columns

In [None]:
# Drop the columns listed below from the English-only frame.
columns_to_drop = ['vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title',
       'popularity', 'poster_path', 'tagline',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords']
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
df_clean = df_clean.drop(columns=columns_to_drop, errors='ignore')
df_clean.head()

In [None]:
# Re-count the missing values per column, this time on the reduced frame.
null_counts = df_clean.isna().sum()
print(null_counts)

In [None]:
# Compare row counts. `df` is the frame after rows with a missing title or
# overview were dropped; `df_clean` is that frame restricted to English movies.
# (A leading `df_clean.head()` was removed: it was not the cell's last
# expression, so its value was discarded and never displayed.)
num_rows_before = df.shape[0]
num_rows = df_clean.shape[0]
print(f"Number of rows in the dataset before: {num_rows_before}")
print(f"Number of rows in the dataset: {num_rows}")

In [None]:
# Print one randomly chosen movie overview in full (before cleaning).
# The display option is scoped with a context manager so the previous
# 'display.max_colwidth' value is restored afterwards, even if sampling raises.
with pd.option_context('display.max_colwidth', None):
    print(df_clean['overview'].sample(1).iloc[0])

# Data Cleaning

In [None]:
import re
# To reduce the vocabulary size, strip unnecessary symbols and boilerplate from the overviews.
# Define a cleaning function
# Patterns are compiled once at cell level and reused for every row.
_BRACKET_TAG_RE = re.compile(r'\[.*?\]')      # "[...]" on a single line
_NON_WORD_RE = re.compile(r'[^\w\s]')         # anything that is not word/whitespace
_HSPACE_RUN_RE = re.compile(r'[^\S\n]+')      # runs of whitespace other than "\n"
_BLANK_LINES_RE = re.compile(r'\n\s*\n')      # one or more blank lines


def clean_overview(overview):
    """Normalise a movie overview before tokenisation.

    Steps, in order:
      1. remove bracketed tags such as ``[...]``;
      2. keep only the text before the first ``---`` marker;
      3. remove every character that is neither a word character nor whitespace;
      4. collapse runs of spaces/tabs into a single space;
      5. collapse blank lines into a single line break and strip both ends.

    Args:
        overview: The raw overview text. Non-string values (``None``, ``NaN``)
            are treated as missing text.

    Returns:
        The cleaned string, or ``''`` when ``overview`` is not a string.
    """
    # Missing values reach .apply() as None/NaN (a float); re.sub would raise
    # TypeError on them, so treat them as empty text instead.
    if not isinstance(overview, str):
        return ''

    cleaned = _BRACKET_TAG_RE.sub('', overview)

    # Drop everything after the first "---" separator.
    cleaned = cleaned.split('---', 1)[0]

    # Remove symbols; line breaks are whitespace and therefore kept.
    cleaned = _NON_WORD_RE.sub('', cleaned)

    # Removing symbols such as " - " leaves double spaces behind; squeeze them.
    cleaned = _HSPACE_RUN_RE.sub(' ', cleaned)

    # Collapse blank lines and trim leading/trailing whitespace.
    return _BLANK_LINES_RE.sub('\n', cleaned).strip()

# Run the cleaner over every overview, store the result back in the frame,
# and show the first five cleaned entries.
cleaned_overviews = df_clean['overview'].apply(clean_overview)
df_clean['overview'] = cleaned_overviews
print(cleaned_overviews.head(5))

In [None]:
# Print one randomly chosen movie overview in full (after cleaning).
# The display option is scoped with a context manager so the previous
# 'display.max_colwidth' value is restored afterwards, even if sampling raises.
with pd.option_context('display.max_colwidth', None):
    print(df_clean['overview'].sample(1).iloc[0])

# Tokenization & Embedding
`sentence-transformers/all-mpnet-base-v2` is a sentence-transformers model: it maps sentences and paragraphs to a 768-dimensional dense vector space and can be used for tasks such as clustering or semantic search.

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings, weighting each token by its attention mask.

    Args:
        model_output: Model output whose first element holds the token embeddings.
        attention_mask: Mask with one entry per token; it is broadcast over the
            embedding dimension before averaging.

    Returns:
        One pooled vector per sequence: the masked sum of token embeddings
        divided by the mask total (clamped to at least 1e-9 to avoid dividing
        by zero).
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = torch.sum(hidden_states * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / mask_total

In [None]:
# Fetch the tokenizer and encoder for the sentence-embedding checkpoint from the
# HuggingFace Hub, wrap the encoder in DataParallel, and place it on `device`.
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = torch.nn.DataParallel(AutoModel.from_pretrained(MODEL_NAME))
model.to(device)
model.eval()  # Switch the model to evaluation mode

In [None]:
def compute_embedding(text):
    """Embed one text (or a list of texts) with the loaded sentence model.

    The text is tokenised, run through ``model`` without gradient tracking,
    mean-pooled with the attention mask and L2-normalised.

    Args:
        text: A string, or a list of strings, to embed.

    Returns:
        The embedding as a plain Python list of floats (nested when more than
        one text is given), suitable for Parquet storage.
    """
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Put the inputs on the same device as the model. Without this the
    # attention mask stays on the CPU while the model output may be on the GPU,
    # and the pooling step fails with a device-mismatch error.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    with torch.no_grad():
        model_output = model(**encoded_input)
    sentence_embedding = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embedding = F.normalize(sentence_embedding, p=2, dim=1)
    # .cpu() is required before .numpy() when the tensor lives on the GPU.
    return sentence_embedding.squeeze().cpu().numpy().tolist()

In [None]:
def compute_embeddings_batch(texts, batch_size=128):
    """Embed a collection of texts in fixed-size batches.

    Each batch is tokenised, moved to ``device``, run through ``model`` without
    gradient tracking, mean-pooled with the attention mask and L2-normalised.

    Args:
        texts: Iterable of strings to embed. It is materialised into a list so
            that non-list sequences are also accepted.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A list with one embedding (a list of floats) per input text, in input
        order. Empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently return no embeddings.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    texts = list(texts)
    total = len(texts)
    embeddings_list = []
    for i in range(0, total, batch_size):
        batch_texts = texts[i:i + batch_size]
        # Tokenize batch of texts
        encoded_input = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt')
        # Move tensors to the model's device
        encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
        # Compute embeddings without gradient tracking
        with torch.no_grad():
            model_output = model(**encoded_input)
        batch_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)
        # Move embeddings back to CPU and convert to list
        embeddings_list.extend(batch_embeddings.cpu().numpy().tolist())
        # Report progress once every 100 batches to keep the output short.
        if (i // batch_size) % 100 == 0:
            print(f"Processed {i + len(batch_texts)} / {total} texts")
    return embeddings_list

In [None]:
# Embed the overviews chunk by chunk so that only one slice of the frame is
# tokenised and pushed through the model at a time.
chunk_size = 100_000  # rows per chunk; adjust to the available memory and GPU
embedding_results = []
chunks = []
total_rows = len(df_clean)
for start in range(0, total_rows, chunk_size):
    end = min(start + chunk_size, total_rows)
    df_chunk = df_clean.iloc[start:end].copy()
    print(f"Processing rows {start} to {end}")
    movies_list = df_chunk['overview'].tolist()
    chunk_embeddings = compute_embeddings_batch(movies_list, batch_size=128)
    df_chunk['embedding'] = chunk_embeddings
    # Keep the finished chunk (consider writing each chunk to disk instead if memory is tight)
    chunks.append(df_chunk)

# Stitch the processed chunks back together with a fresh 0..n-1 index.
df_embeddings = pd.concat(chunks, ignore_index=True)

# Persist metadata and embeddings (stored as lists of floats) to Parquet.
df_embeddings.to_parquet('movies_embeddings.parquet', engine='pyarrow')
print("Embeddings and metadata saved as a Parquet file.")

# Final Stat:

Total Time 1h 6m 47s · GPU T4 x2

