<a href="https://colab.research.google.com/github/ArchanaMahto/movie-recommendation-system/blob/main/Untitled35.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import re
import unicodedata

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def _safe_json_loads(x):
    if pd.isna(x):
        return []
    if isinstance(x, (list, dict)):
        return x
    try:
        return json.loads(x)
    except Exception:
        return []


def _get_names(json_str, key="name", top_n=None):
    """Collect the ``key`` value of each dict entry in a JSON-encoded cell.

    Args:
        json_str: Cell value handed to ``_safe_json_loads``.
        key: Field to read from every dict entry.
        top_n: When not ``None``, keep only the first ``top_n`` values.

    Returns:
        A list of strings, in source order. Entries that are not dicts, lack
        ``key``, or hold a falsy value under ``key`` are skipped.
    """
    entries = _safe_json_loads(json_str)
    found = [
        str(entry[key])
        for entry in entries
        if isinstance(entry, dict) and entry.get(key)
    ]
    return found if top_n is None else found[:top_n]


def _get_director(crew_json):
    """Return the name of the first crew entry whose ``job`` is ``"Director"``.

    Args:
        crew_json: Cell value handed to ``_safe_json_loads``.

    Returns:
        The ``name`` field of the first matching dict entry (``""`` when that
        entry has no ``name`` key), or ``""`` when no entry matches.
    """
    director_names = (
        member.get("name", "")
        for member in _safe_json_loads(crew_json)
        if isinstance(member, dict) and member.get("job") == "Director"
    )
    # Only the first match is consumed; the default covers "no director".
    return next(director_names, "")


def _normalize_text(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s


class ContentBasedRecommender:
    """Recommend movies whose TF-IDF "tags" text is closest to a given title.

    ``fit`` builds one tags string per title (overview, genres, keywords,
    top cast, director), vectorizes it with TF-IDF and stores the pairwise
    cosine-similarity matrix. ``recommend`` ranks the other titles by their
    similarity to the requested one.
    """

    def __init__(self):
        self.df = None  # DataFrame with movie_id / title / tags, set by fit()
        self.title_to_idx = None  # lowercased, stripped title -> row position in df
        self.tfidf = None  # fitted TfidfVectorizer
        self.sim = None  # square cosine-similarity matrix aligned with df rows

    def fit(self, movies_csv: str, credits_csv: str):
        """Load the two CSV files and build the similarity model.

        Args:
            movies_csv: Path to the movies CSV; the columns ``title``,
                ``genres``, ``keywords`` and ``overview`` are read from the
                merged frame.
            credits_csv: Path to the credits CSV; the columns ``title``,
                ``movie_id``, ``cast`` and ``crew`` are read from the merged
                frame.

        Returns:
            ``self``, so calls can be chained.
        """
        movies = pd.read_csv(movies_csv)
        credits = pd.read_csv(credits_csv)

        # Join datasets (TMDB 5000 movies + credits).
        # NOTE(review): joining on title pairs every movie with every credits
        # row of the same title; only the first row per title survives the
        # drop_duplicates below. Joining on an id column would be stricter -
        # confirm the CSV schema before changing it.
        df = movies.merge(credits, on="title", how="inner")

        # Extract structured fields from the JSON-encoded columns.
        df["genres"] = df["genres"].apply(_get_names)
        df["keywords"] = df["keywords"].apply(_get_names)
        df["cast"] = df["cast"].apply(lambda cell: _get_names(cell, top_n=5))
        df["director"] = df["crew"].apply(_get_director)

        # Build the tags text: overview followed by the extracted names.
        df["overview"] = df["overview"].fillna("")
        df["tags"] = (
            df["overview"].astype(str)
            + " "
            + df["genres"].apply(" ".join)
            + " "
            + df["keywords"].apply(" ".join)
            + " "
            + df["cast"].apply(" ".join)
            + " "
            + df["director"].fillna("")
        )
        df["tags"] = df["tags"].apply(_normalize_text)

        # Keep only what we need, one row per title, with a 0..n-1 index so
        # row labels equal positions in the similarity matrix.
        df = df[["movie_id", "title", "tags"]].drop_duplicates("title").reset_index(drop=True)

        # Vectorize + similarity.
        self.tfidf = TfidfVectorizer(stop_words="english", max_features=50000, ngram_range=(1, 2))
        X = self.tfidf.fit_transform(df["tags"])

        # Dense n x n cosine-similarity matrix - OK for ~5k movies.
        self.sim = cosine_similarity(X, X)

        self.df = df
        # Keys are normalized exactly like the lookup key in recommend().
        self.title_to_idx = {str(t).lower().strip(): i for i, t in enumerate(df["title"])}
        return self

    def recommend(self, title: str, top_n: int = 10):
        """Return the ``top_n`` titles most similar to ``title``.

        The title is matched case-insensitively after stripping surrounding
        whitespace; when there is no exact match, the first stored title
        containing it as a substring is used.

        Args:
            title: Title (or part of a title) of the movie to start from.
            top_n: Maximum number of recommendations to return.

        Returns:
            A list of ``(title, similarity)`` tuples sorted by descending
            similarity, never containing the query movie itself.

        Raises:
            RuntimeError: If ``fit`` has not been called.
            ValueError: If ``top_n`` is negative, or ``title`` is empty or
                matches no stored title.
        """
        if self.df is None or self.sim is None:
            raise RuntimeError("Model not fitted. Call fit(...) first.")
        if top_n < 0:
            # A negative bound would slice from the tail and return nearly
            # the whole catalogue.
            raise ValueError(f"top_n must be non-negative, got {top_n}")

        key = title.lower().strip()
        if not key:
            # "" is a substring of every title; refuse instead of silently
            # recommending for an arbitrary movie.
            raise ValueError(f"Title not found: {title}")
        if key not in self.title_to_idx:
            # Simple fallback: first stored title containing the query.
            match = next((t for t in self.title_to_idx if key in t), None)
            if match is None:
                raise ValueError(f"Title not found: {title}")
            key = match

        idx = self.title_to_idx[key]
        row = np.asarray(self.sim[idx])
        # Stable sort on the negated scores: descending similarity, ties kept
        # in row order.
        order = np.argsort(-row, kind="stable")
        # Drop the query by index: it is not guaranteed to sort first when
        # another movie ties with it or when its own row is all zeros.
        order = order[order != idx][:top_n]

        titles = self.df["title"]
        return [(titles.iloc[i], float(row[i])) for i in order]


def main(argv=None):
    """Command-line entry point: fit the recommender and print suggestions.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``. Pass an explicit list such as
            ``["--title", "Avatar"]`` when calling from a notebook, where
            the process arguments are not this script's options and the
            required ``--title`` would otherwise be reported as missing.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--movies", default="data/tmdb_5000_movies.csv")
    parser.add_argument("--credits", default="data/tmdb_5000_credits.csv")
    parser.add_argument("--title", required=True)
    parser.add_argument("--topn", type=int, default=10)
    args = parser.parse_args(argv)

    if args.topn < 0:
        parser.error("--topn must be non-negative")

    model = ContentBasedRecommender().fit(args.movies, args.credits)
    try:
        recs = model.recommend(args.title, top_n=args.topn)
    except ValueError as exc:
        # e.g. unknown title: report it as a usage error, not a traceback.
        parser.error(str(exc))

    print(f"\nBecause you liked: {args.title}\n")
    for rank, (t, s) in enumerate(recs, start=1):
        print(f"{rank:02d}. {t} (similarity={s:.3f})")


if __name__ == "__main__":
    main()


usage: colab_kernel_launcher.py [-h] [--movies MOVIES] [--credits CREDITS]
                                --title TITLE [--topn TOPN]
colab_kernel_launcher.py: error: the following arguments are required: --title
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/lib/python3.12/argparse.py", line 1943, in _parse_known_args2
    namespace, args = self._parse_known_args(args, namespace, intermixed)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/argparse.py", line 2230, in _parse_known_args
    raise ArgumentError(None, _('the following arguments are required: %s') %
argparse.ArgumentError: the following arguments are required: --title

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipython-input-1237563816.py", line 139, in <cell line: 0>
    main()
  File "/tmp/ipython-input-1237563816.py", line 128, in main
    args = parser.parse_args()
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/argparse.py", line 190

TypeError: object of type 'NoneType' has no len()