In [None]:
import os
from ast import literal_eval
from typing import List

import kagglehub
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def download_movies_dataset() -> str:
    """
    Fetch the Kaggle dataset 'rounakbanik/the-movies-dataset' through kagglehub.

    Returns:
        The value returned by ``kagglehub.dataset_download`` for that dataset
        handle, i.e. the local path where the dataset is stored.
    """
    return kagglehub.dataset_download("rounakbanik/the-movies-dataset")


def _safe_literal_eval(val):
    if pd.isna(val):
        return []
    if isinstance(val, list):
        return val
    try:
        return literal_eval(val)
    except (ValueError, SyntaxError):
        return []


def load_and_prepare_movies(dataset_path: str) -> pd.DataFrame:
    """
    Load movie data from the Kaggle dataset and create a 'metadata_text' column
    suitable for building a content-based recommender.

    Reads ``movies_metadata.csv`` and ``keywords.csv`` from ``dataset_path``,
    joins them on ``id``, and concatenates title, genre names, keyword names,
    overview and tagline (space-separated) into ``metadata_text``.

    Args:
        dataset_path: Directory containing the two CSV files.

    Returns:
        A DataFrame with a fresh 0..n-1 index and exactly these columns:
          - id (numeric; rows whose id could not be parsed are dropped)
          - title (rows without a title are dropped)
          - metadata_text

        Rows sharing an identical title are collapsed to the first occurrence,
        so distinct movies with the same title are not distinguished.

    Raises:
        KeyError: if a required column (id, title, genres, keywords) is absent.
    """
    movies = pd.read_csv(
        os.path.join(dataset_path, "movies_metadata.csv"), low_memory=False
    )
    keywords = pd.read_csv(os.path.join(dataset_path, "keywords.csv"))

    # Ensure IDs are numeric so the two files join on the same dtype.
    movies["id"] = pd.to_numeric(movies["id"], errors="coerce")
    keywords["id"] = pd.to_numeric(keywords["id"], errors="coerce")

    # Rows without a usable id cannot be joined, and rows without a title are
    # excluded from the result, so discard both before doing any further work.
    movies = movies.dropna(subset=["id", "title"])

    # Keep one keywords row per id: a repeated id on the right side of a left
    # merge would otherwise multiply the matching movie rows.
    keywords = keywords.dropna(subset=["id"]).drop_duplicates(subset=["id"])

    movies = movies.merge(keywords, on="id", how="left", suffixes=("", "_kw"))

    def names_as_text(raw) -> str:
        # Cells are usually a stringified list[dict{name: "..."}]; anything
        # else contributes no text.
        items = _safe_literal_eval(raw)
        if not isinstance(items, list):
            return ""
        return " ".join(
            it["name"] for it in items if isinstance(it, dict) and "name" in it
        )

    genres_text = movies["genres"].apply(names_as_text)
    keywords_text = movies["keywords"].apply(names_as_text)

    # Optional free-text fields: tolerate a missing column or missing values.
    for col in ("overview", "tagline"):
        movies[col] = movies[col].fillna("") if col in movies.columns else ""

    # Build one combined text field.
    movies["metadata_text"] = (
        movies["title"]
        + " "
        + genres_text
        + " "
        + keywords_text
        + " "
        + movies["overview"]
        + " "
        + movies["tagline"]
    )

    # Keep only useful columns, one row per title (first occurrence wins).
    movies = movies[["id", "title", "metadata_text"]].drop_duplicates(
        subset=["title"]
    )

    return movies.reset_index(drop=True)


class MovieRecommender:
    """
    Content-based movie recommender using TF-IDF over movie metadata text.

    Attributes set by ``__init__``:
        movies: private copy of the input frame (needs 'title' and
            'metadata_text' columns).
        vectorizer: the fitted ``TfidfVectorizer``.
        movie_matrix: TF-IDF matrix with one row per row of ``movies``.
        title_to_idx: normalized title -> row position in ``movies``.
    """

    def __init__(self, movies_df: pd.DataFrame):
        """Fit the TF-IDF model on ``movies_df['metadata_text']`` and index titles."""
        self.movies = movies_df.copy()
        self.vectorizer = TfidfVectorizer(stop_words="english", max_features=50000)
        self.movie_matrix = self.vectorizer.fit_transform(self.movies["metadata_text"])

        # Map normalized title to row position. When several rows normalize to
        # the same key (e.g. titles differing only by case), the first row wins.
        self.title_to_idx = {}
        for idx, title in enumerate(self.movies["title"]):
            self.title_to_idx.setdefault(self._normalize_title(title), idx)

    @staticmethod
    def _normalize_title(title) -> str:
        """Return the lookup key for a title: stripped and lowercased."""
        return str(title).strip().lower()

    def recommend_from_likes(self, liked_titles: List[str], top_n: int = 10) -> pd.DataFrame:
        """
        Given a list of movie titles the user likes, return a DataFrame of recommended movies.

        Titles are matched case-insensitively, ignoring surrounding whitespace.
        Titles that are not found are skipped as long as at least one matches.

        Args:
            liked_titles: Titles the user likes; must not be empty.
            top_n: Maximum number of recommendations; must be >= 0.

        Returns:
            A DataFrame with columns ``title`` and ``similarity``, ordered from
            most to least similar. It never contains the liked movies and has
            at most ``top_n`` rows (fewer if not enough other movies exist).

        Raises:
            ValueError: if ``liked_titles`` is empty, ``top_n`` is negative, or
                none of the titles are found.
        """
        if not liked_titles:
            raise ValueError("liked_titles must contain at least one title.")
        if top_n < 0:
            raise ValueError("top_n must be non-negative.")

        # Resolve titles to row positions; dict.fromkeys drops repeats (so a
        # title listed twice is not double-weighted) while keeping order.
        keys = (self._normalize_title(t) for t in liked_titles)
        indices = list(
            dict.fromkeys(
                self.title_to_idx[k] for k in keys if k in self.title_to_idx
            )
        )

        if not indices:
            raise ValueError(
                "None of the liked titles were found in the dataset. "
                "Check spelling or try different titles."
            )

        if top_n == 0:
            # A "-0" slice would select everything; return an empty result
            # without computing any similarities.
            empty = self.movies.iloc[:0][["title"]].copy()
            empty["similarity"] = np.array([], dtype=float)
            return empty

        # User profile as average of liked movie vectors. The sparse mean is
        # an np.matrix; convert to a plain ndarray for cosine_similarity.
        user_vec = np.asarray(self.movie_matrix[indices].mean(axis=0))

        # Similarity to all movies
        sims = cosine_similarity(user_vec, self.movie_matrix).ravel()

        # Rank only movies the user did not list, so liked movies can never
        # appear in the output even when top_n exceeds the candidate count.
        is_liked = np.zeros(sims.shape[0], dtype=bool)
        is_liked[indices] = True
        candidates = np.flatnonzero(~is_liked)

        ranked = np.argsort(sims[candidates])[::-1][:top_n]
        top_indices = candidates[ranked]

        results = self.movies.iloc[top_indices][["title"]].copy()
        results["similarity"] = sims[top_indices]
        return results


def build_recommender() -> MovieRecommender:
    """
    One-call setup for a ready-to-use recommender.

    Chains the three preparation steps: obtain the dataset path via
    ``download_movies_dataset``, turn it into a prepared DataFrame with
    ``load_and_prepare_movies``, and wrap that frame in a ``MovieRecommender``.
    """
    prepared = load_and_prepare_movies(download_movies_dataset())
    return MovieRecommender(prepared)


def main():
    """
    Interactive entry point.

    Builds the recommender, reads a comma-separated list of liked titles from
    standard input, and prints up to ten recommendations with their scores.
    Exits quietly when nothing is typed; prints the error message when the
    recommender raises ValueError.
    """
    print("Downloading and preparing movie dataset (first run may take a while)...")
    recommender = build_recommender()
    print(f"Loaded {len(recommender.movies)} movies.")

    print("\nEnter some movie titles you like, separated by commas.")
    print("Example: The Matrix, Inception, Toy Story")
    raw_titles = input("Your liked movies: ").strip()

    if raw_titles == "":
        print("No titles entered. Exiting.")
        return

    # Split on commas and discard empty fragments (e.g. from "a,,b").
    liked_titles = []
    for piece in raw_titles.split(","):
        cleaned = piece.strip()
        if cleaned:
            liked_titles.append(cleaned)

    try:
        recs = recommender.recommend_from_likes(liked_titles, top_n=10)
    except ValueError as err:
        print(f"Error: {err}")
        return

    print("\nBecause you like:")
    for liked in liked_titles:
        print(f"  - {liked}")

    print("\nYou might also like:")
    for title, score in zip(recs["title"], recs["similarity"]):
        print(f"  - {title} (score: {score:.3f})")


# Start the interactive prompt only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Downloading and preparing movie dataset (first run may take a while)...
Using Colab cache for faster access to the 'the-movies-dataset' dataset.
Loaded 42277 movies.

Enter some movie titles you like, separated by commas.
Example: The Matrix, Inception, Toy Story
Your liked movies: Spider-man, The Notebook, Twilight

Because you like:
  - Spider-man
  - The Notebook
  - Twilight

You might also like:
  - Spider-Man 3 (score: 0.270)
  - Earth vs. the Spider (score: 0.212)
  - Spider-Man 2 (score: 0.208)
  - The Amazing Spider-Man 2 (score: 0.206)
  - The Amazing Spider-Man (score: 0.205)
  - Arachnophobia (score: 0.194)
  - Spider-Plant Man (score: 0.189)
  - Officer Down (score: 0.159)
  - The Red Spider (score: 0.153)
  - The Great Sinner (score: 0.151)
