In [None]:
# Colab-only setup: install the Kaggle API token and fetch the dataset.
# Create the directory the copy below places kaggle.json into.
!mkdir -p ~/.kaggle
from google.colab import files
# Prompts for a file upload; the cell expects the uploaded file to be named
# kaggle.json (the copy below uses that exact name). `uploaded` is not read
# again in this cell.
uploaded = files.upload()

# Move the token into place and make it readable/writable by the owner only.
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
print("Kaggle API setup complete.")

# Download the dataset archive and extract it quietly into the working directory.
!kaggle datasets download -d rounakbanik/the-movies-dataset
!unzip -q the-movies-dataset.zip -d ./
print("Dataset downloaded and unzipped.")

Saving kaggle.json to kaggle.json
Kaggle API setup complete.
Dataset URL: https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset
License(s): CC0-1.0
Downloading the-movies-dataset.zip to /content
 53% 121M/228M [00:00<00:00, 1.26GB/s]
100% 228M/228M [00:00<00:00, 620MB/s] 
Dataset downloaded and unzipped.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import ast
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import gc

# Download the NLTK resources that go with the `stopwords` and
# `WordNetLemmatizer` imports above. Each call fetches one named package;
# 'omw-1.4' is presumably the multilingual WordNet data used alongside
# 'wordnet' -- TODO confirm it is still required.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Transformers
from transformers import pipeline
import torch

# Sklearn (for Content-Based)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel # This is faster than cosine_similarity

# Global plotting defaults for the rest of the notebook: seaborn's
# "whitegrid" style with the "muted" palette, and a 14 x 8 default figure size.
sns.set(style="whitegrid", palette="muted")
plt.rcParams['figure.figsize'] = (14, 8)

# Marker so the cell output shows that every import above succeeded.
print("All libraries imported.")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


All libraries imported.


In [None]:
print("--- Loading and Cleaning Data ---")
COLS_TO_LOAD = ['id', 'title', 'overview', 'popularity', 'vote_average',
                'vote_count', 'runtime', 'revenue', 'genres']

# Read only the columns this notebook uses from the metadata CSV.
df = pd.read_csv('movies_metadata.csv', usecols=COLS_TO_LOAD, low_memory=False)
print(f"Original data shape (with fewer columns): {df.shape}")

# Keep only rows whose 'id' parses as a number, then store the ids as integers.
has_numeric_id = pd.to_numeric(df['id'], errors='coerce').notna()
df = df[has_numeric_id]
df['id'] = df['id'].astype(int)

# Coerce every numeric column in one pass; unparseable values become NaN.
numeric_cols = ['popularity', 'vote_average', 'vote_count', 'runtime', 'revenue']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# A revenue or runtime of 0 is treated as "not recorded", i.e. NaN.
zero_means_missing = ['revenue', 'runtime']
df[zero_means_missing] = df[zero_means_missing].replace(0, np.nan)

# Rows without popularity/vote figures are dropped; missing text fields get
# neutral placeholders ('' for the overview, '[]' for the genre string).
df = df.dropna(subset=['popularity', 'vote_average', 'vote_count'])
df = df.assign(
    overview=df['overview'].fillna(''),
    genres=df['genres'].fillna('[]'),
)
# --- Parse 'genres' Column ---
def parse_genres(genres_str):
    """Extract genre names from a stringified list of genre dicts.

    Args:
        genres_str: Text such as "[{'id': 18, 'name': 'Drama'}]", i.e. the
            Python-literal form of a list of dicts.

    Returns:
        list: The 'name' value of every dict element that has one, in their
        original order. An empty list is returned when the input is not a
        string, cannot be parsed as a Python literal, or does not evaluate
        to a list.
    """
    # Non-string cells can never hold a parseable literal.
    if not isinstance(genres_str, str):
        return []

    try:
        parsed = ast.literal_eval(genres_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Malformed or pathological literal: treat as "no genres".
        return []

    if not isinstance(parsed, list):
        return []

    # Skip non-dict elements instead of failing on `'name' in <int>` and
    # similar, so a single odd entry cannot abort the whole column parse.
    return [entry['name'] for entry in parsed
            if isinstance(entry, dict) and 'name' in entry]

# Derive a list of genre names per movie, plus the first listed genre
# ('Unknown' when the list is empty).
df['genre_list'] = df['genres'].apply(parse_genres)
df['primary_genre'] = df['genre_list'].apply(lambda x: x[0] if len(x) > 0 else 'Unknown')

# Keep only the columns used downstream; copy so df_clean is independent of df.
df_clean = df[['id', 'title', 'overview', 'popularity', 'vote_average', 'vote_count',
               'runtime', 'revenue', 'primary_genre', 'genre_list']].copy()

# Remove duplicates
# NOTE(review): de-duplicating on 'title' keeps only the first row for each
# title, so different films that share a title collapse into one -- confirm
# this is intended rather than de-duplicating on 'id'.
df_clean = df_clean.drop_duplicates(subset='title')

print(f"Cleaned data shape: {df_clean.shape}")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df_clean.info()

# Release the intermediate frame. The trailing semicolon stops the notebook
# from echoing gc.collect()'s integer return value as the cell result.
del df
gc.collect();

--- Loading and Cleaning Data ---
Original data shape (with fewer columns): (45466, 9)
Cleaned data shape: (42277, 10)
<class 'pandas.core.frame.DataFrame'>
Index: 42277 entries, 0 to 45465
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             42277 non-null  int64  
 1   title          42277 non-null  object 
 2   overview       42277 non-null  object 
 3   popularity     42277 non-null  float64
 4   vote_average   42277 non-null  float64
 5   vote_count     42277 non-null  float64
 6   runtime        40545 non-null  float64
 7   revenue        6944 non-null   float64
 8   primary_genre  42277 non-null  object 
 9   genre_list     42277 non-null  object 
dtypes: float64(5), int64(1), object(4)
memory usage: 3.5+ MB
None


739

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

print("Loading Models...")

# MODEL 1: sentence encoder. Its embeddings are what find_movie_by_vibe
# compares (cosine similarity) between the user's query and movie overviews.
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

# MODEL 2: emotion classifier used by get_top_moods to produce mood tags.
# top_k=None asks the pipeline for a score for every label rather than only
# the best one; truncation/max_length cap the input at 512 tokens.
mood_classifier = pipeline(
    "text-classification",
    model="bsingh/roberta_goEmotion",
    top_k=None,
    truncation=True,
    max_length=512
)


# Emotion names indexed by class id. get_top_moods uses this list to translate
# generic "LABEL_<n>" outputs into names, so the order must match the model's
# class ids -- presumably the standard GoEmotions ordering; verify against the
# model's config.
GOEMOTIONS_LABELS = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring",
    "confusion", "curiosity", "desire", "disappointment", "disapproval",
    "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
    "joy", "love", "nervousness", "optimism", "pride", "realization",
    "relief", "remorse", "sadness", "surprise", "neutral"
]

# Work on the first 100 cleaned movies only (independent copy of that slice).
target_df = df_clean.head(100).copy()

print("Step 1: Generating Semantic Vectors (The 'Vibe' Encoding)...")
# One embedding per overview, in target_df row order; find_movie_by_vibe
# relies on that positional alignment when it looks results up with .iloc.
movie_vectors = semantic_model.encode(target_df['overview'].tolist(), show_progress_bar=True)

print("Step 2: Generating Mood Labels (Top 3 per movie)...")
# Accumulates one record per movie; filled by the tagging loop further down.
multi_label_data = []

def get_top_moods(text, classifier=None, labels=None):
    """Score `text` for emotions and return (label, score) pairs, best first.

    Args:
        text: The text to classify (here, a movie overview).
        classifier: Callable taking the text and returning label/score dicts.
            Defaults to the module-level `mood_classifier`.
        labels: Sequence used to translate generic "LABEL_<n>" outputs into
            names. Defaults to the module-level `GOEMOTIONS_LABELS`.

    Returns:
        list[tuple[str, float]]: Pairs sorted by descending score, e.g.
        [('joy', 0.9), ('optimism', 0.5), ...]. An empty list is returned,
        with a RuntimeWarning, if classification fails for any reason, so
        one bad overview cannot abort a long batch run.
    """
    if classifier is None:
        classifier = mood_classifier
    if labels is None:
        labels = GOEMOTIONS_LABELS

    try:
        results = classifier(text)
        # For a single string the classifier may hand back either a flat
        # list of dicts or that list wrapped in an outer list (this appears
        # to vary with how top_k is supplied); accept both shapes.
        if results and not isinstance(results[0], dict):
            results = results[0]

        cleaned_moods = []
        for item in results:
            label = item['label']
            score = item['score']
            if label.startswith("LABEL_"):
                suffix = label.split("_", 1)[1]
                # Translate only indices that exist; otherwise keep the raw
                # label rather than discarding every mood for this text.
                if suffix.isdigit() and int(suffix) < len(labels):
                    label = labels[int(suffix)]
            cleaned_moods.append((label, score))

        # Callers slice the head of this list for the "top" moods, so make
        # the ranking explicit instead of relying on the classifier's order.
        cleaned_moods.sort(key=lambda pair: pair[1], reverse=True)
        return cleaned_moods
    except Exception as exc:
        # `Exception` (not a bare except) keeps the best-effort behaviour
        # while letting KeyboardInterrupt/SystemExit stop the run.
        import warnings
        warnings.warn(f"Mood classification failed: {exc!r}", RuntimeWarning)
        return []

# Tag every movie in the working sample with the names of its three
# highest-ranked moods, keeping title and genre for later display.
for _, movie in tqdm(target_df.iterrows(), total=len(target_df)):
    ranked_moods = get_top_moods(movie['overview'])
    multi_label_data.append(dict(
        title=movie['title'],
        mood_tags=[mood[0] for mood in ranked_moods[:3]],  # List of strings
        primary_genre=movie['primary_genre'],
    ))

# One row per movie, in the same order as target_df.
df_results = pd.DataFrame(multi_label_data)

def find_movie_by_vibe(user_input, top_n=5):
    """Print the movies whose overviews are semantically closest to a query.

    Reads the module-level `semantic_model`, `movie_vectors` and
    `df_results`; row i of `movie_vectors` must describe row i of
    `df_results`, because matches are looked up by position.

    Args:
        user_input: Free-text description of what the user wants to watch.
        top_n: Maximum number of matches to print. Values of 0 or less print
            the header but no matches.

    Returns:
        None. Results are written to stdout.
    """
    print(f"\nUser Query: '{user_input}'")

    # 1. Encode User Input
    user_vector = semantic_model.encode([user_input])

    # 2. Compute Similarity (User Vector vs All Movie Vectors)
    similarities = cosine_similarity(user_vector, movie_vectors)[0]

    # 3. Find Top N Indices, best match first.
    # Guard the slice: with top_n == 0, `[-0:]` would select every movie
    # rather than none.
    if top_n > 0:
        top_indices = similarities.argsort()[-top_n:][::-1]
    else:
        top_indices = []

    print(f"--- Top {top_n} Semantic Matches ---")
    for idx in top_indices:
        score = similarities[idx]
        movie_data = df_results.iloc[idx]

        title = movie_data['title']
        genres = movie_data['primary_genre']
        tags = ", ".join(movie_data['mood_tags'])

        print(f"Movie: {title} (Score: {score:.4f})")
        print(f"   Generated Tags: [{tags}]")
        print(f"   Genre: {genres}")
        print("-" * 30)

# Demo queries, run in order:
#   1. a complex vibe, 2. a specific topic, 3. an emotional state.
demo_queries = (
    "I want a complex story about love that ends tragically and makes me cry",
    "Space adventure with aliens and futuristic battles",
    "I need something inspiring to lift my mood",
)
for demo_query in demo_queries:
    find_movie_by_vibe(demo_query)

Loading Models...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Device set to use cpu


Step 1: Generating Semantic Vectors (The 'Vibe' Encoding)...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Step 2: Generating Mood Labels (Top 3 per movie)...


100%|██████████| 100/100 [00:32<00:00,  3.11it/s]


User Query: 'I want a complex story about love that ends tragically and makes me cry'
--- Top 5 Semantic Matches ---
Movie: Big Bully (Score: 0.4283)
   Generated Tags: [sadness, neutral, disappointment]
   Genre: Comedy
------------------------------
Movie: Dead Man Walking (Score: 0.3588)
   Generated Tags: [neutral, caring, admiration]
   Genre: Drama
------------------------------
Movie: Bed of Roses (Score: 0.3504)
   Generated Tags: [love, admiration, neutral]
   Genre: Drama
------------------------------
Movie: Restoration (Score: 0.3310)
   Generated Tags: [neutral, admiration, love]
   Genre: Drama
------------------------------
Movie: Eye for an Eye (Score: 0.3235)
   Generated Tags: [neutral, approval, annoyance]
   Genre: Drama
------------------------------

User Query: 'Space adventure with aliens and futuristic battles'
--- Top 5 Semantic Matches ---
Movie: Mortal Kombat (Score: 0.3406)
   Generated Tags: [neutral, approval, optimism]
   Genre: Action
-----------------


