In [2]:
# === Install lightweight deps (few minutes on first run) ===
!pip -q install pandas numpy scipy scikit-learn gradio

# === Imports ===
import re, zipfile, urllib.request, os, gc, numpy as np, pandas as pd
from pathlib import Path
from typing import Optional, Tuple
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

# -----------------------------
# Dataset loader: MovieLens 25M
# (robust to nested folders, filters years 2015–2025)
# -----------------------------
# Per-user cache holding the downloaded archive and its extracted CSV files.
CACHE_DIR = Path.home() / ".cache" / "recsys25m"
CACHE_DIR.mkdir(parents=True, exist_ok=True)
ML25M_URL = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
# The release year is the "(YYYY)" group that ends a title. Trailing whitespace
# after the closing parenthesis is tolerated so such titles are not dropped.
YEAR_RE = re.compile(r"\((\d{4})\)\s*$")

def _download_ml25m() -> Path:
    """Return the path of the cached MovieLens 25M zip, downloading it if absent.

    The archive is fetched into a temporary ``.part`` file and only renamed to
    its final name once the transfer finished. An interrupted download therefore
    never leaves a truncated ``ml-25m.zip`` that later runs would mistake for a
    complete archive.
    """
    zpath = CACHE_DIR / "ml-25m.zip"
    if not zpath.exists():
        print("Downloading MovieLens 25M (~250MB)…")
        partial = zpath.with_name(zpath.name + ".part")
        try:
            urllib.request.urlretrieve(ML25M_URL, partial)
            partial.replace(zpath)  # same directory, so this is a plain rename
        finally:
            # Only present here if the download or the rename failed.
            if partial.exists():
                partial.unlink()
    return zpath

def _extract_root_containing_csvs() -> Path:
    """Return the cache folder that holds both ``movies.csv`` and ``ratings.csv``.

    The cache is searched recursively first. If no such folder exists, the
    archive (downloaded on demand) is extracted into ``CACHE_DIR`` and the
    search is repeated. When several folders qualify, the one with the shortest
    path string wins.

    Raises:
        FileNotFoundError: if the CSV pair is still missing after extraction.
    """
    def _find_roots():
        # Every directory below the cache where movies.csv sits next to ratings.csv.
        return [
            movies_csv.parent
            for movies_csv in CACHE_DIR.rglob("movies.csv")
            if (movies_csv.parent / "ratings.csv").exists()
        ]

    roots = _find_roots()
    if not roots:
        # Not extracted yet → extract zip to CACHE_DIR, then look again.
        with zipfile.ZipFile(_download_ml25m(), "r") as archive:
            archive.extractall(CACHE_DIR)
        roots = _find_roots()
        if not roots:
            raise FileNotFoundError("Could not find movies.csv/ratings.csv after extraction.")

    # min() keeps the first of equally short paths, like a stable sort's head.
    return min(roots, key=lambda folder: len(str(folder)))

def _year_from_title(title: str) -> Optional[int]:
    """Return the year from a trailing ``(YYYY)`` in ``title``, or None.

    Anything that is not a string yields None. This includes ``None`` and the
    float NaN that pandas uses for missing cells; NaN is truthy, so a plain
    ``title or ""`` guard would hand it to the regex and raise ``TypeError``.
    Trailing whitespace is ignored before matching.
    """
    if not isinstance(title, str):
        return None
    m = YEAR_RE.search(title.rstrip())
    return int(m.group(1)) if m else None

def load_ml25m_2015_2025(
    year_min=2015, year_max=2025,
    min_user_ratings=10,
    max_users=50_000,              # cap most active users (speed)
    fast_mode_max_ratings=2_000_000 # optional downsample (speed)
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load MovieLens 25M ratings restricted to movies released in a year window.

    Steps, in order:
      1. Read ``movies.csv``, parse the year from each title, drop titles
         without a year and keep ``year_min <= year <= year_max``.
      2. Read ``ratings.csv`` and keep ratings of the surviving movies.
      3. Keep users with at least ``min_user_ratings`` of those ratings.
      4. If ``max_users`` is set and exceeded, keep only the users with the
         most ratings.
      5. If ``fast_mode_max_ratings`` is set and exceeded, draw that many
         ratings at random (fixed seed 42).
      6. Drop movies that are left without any rating.

    Returns:
        ``(ratings, movies)`` where ratings has columns
        ``user_id, item_id, rating, timestamp`` and movies has columns
        ``item_id, title, year, genres``.
    """
    data_dir = _extract_root_containing_csvs()

    # --- movies: derive the year and apply the window ---
    movies = pd.read_csv(data_dir / "movies.csv", usecols=["movieId","title","genres"])
    movies["year"] = movies["title"].map(_year_from_title)
    movies = movies.dropna(subset=["year"]).astype({"year":"int16"})
    movies = movies[movies["year"].between(year_min, year_max)]
    movies = movies.rename(columns={"movieId":"item_id"})

    # --- ratings: compact dtypes, then restrict to the movies kept above ---
    ratings = pd.read_csv(
        data_dir / "ratings.csv",
        usecols=["userId","movieId","rating","timestamp"],
        dtype={"userId":"int32","movieId":"int32","rating":"float32","timestamp":"int64"},
    ).rename(columns={"userId":"user_id","movieId":"item_id"})
    ratings = ratings[ratings["item_id"].isin(movies["item_id"])]

    # --- users: activity threshold is counted after the year filter ---
    ratings_per_user = ratings["user_id"].value_counts()
    active_users = ratings_per_user[ratings_per_user >= min_user_ratings].index
    ratings = ratings[ratings["user_id"].isin(active_users)]

    # Optional cap on the number of users, keeping the most active ones.
    over_user_cap = max_users is not None and len(active_users) > max_users
    if over_user_cap:
        heaviest_users = ratings_per_user.loc[active_users].nlargest(max_users).index
        ratings = ratings[ratings["user_id"].isin(heaviest_users)]

    # Optional random downsample of individual ratings.
    over_rating_cap = fast_mode_max_ratings is not None and len(ratings) > fast_mode_max_ratings
    if over_rating_cap:
        ratings = ratings.sample(n=fast_mode_max_ratings, random_state=42)

    # Movies whose ratings were all filtered away are dropped from the catalogue.
    rated_item_ids = ratings["item_id"].unique()
    movies = movies[movies["item_id"].isin(rated_item_ids)]

    print(f"[ML-25M] After filters: ratings={len(ratings):,}, users={ratings.user_id.nunique():,}, movies={len(movies):,}")
    return ratings[["user_id","item_id","rating","timestamp"]], movies[["item_id","title","year","genres"]]

# -----------------------------
# Recommender: TruncatedSVD MF
# -----------------------------
class MFRecommender:
    """Item-to-item recommender backed by a truncated SVD of the rating matrix.

    ``fit`` factorises the mean-centred sparse user x item matrix and stores
    row-normalised item factors (``V``) and user factors (``U``).
    ``recommend_from_liked`` ranks items by the dot product between each item's
    factor row and the normalised mean of the liked items' rows, shifted by the
    global mean rating.
    """

    def __init__(self, n_components=100, random_state=42):
        """Create an unfitted model; both arguments are forwarded to TruncatedSVD."""
        self.svd = TruncatedSVD(n_components=n_components, random_state=random_state)
        self.global_mean = 0.0   # mean of the observed ratings, set by fit()
        self.user_index = {}     # user_id -> matrix row
        self.item_index = {}     # item_id -> matrix column
        self.index_user = {}     # matrix row -> user_id
        self.index_item = {}     # matrix column -> item_id
        self.U = None            # users x k, row-normalised (set by fit)
        self.V = None            # items x k, row-normalised (set by fit)

    def _build_maps(self, ratings: pd.DataFrame):
        """Assign consecutive indices to the sorted distinct user and item IDs."""
        users = ratings["user_id"].astype(int).unique()
        items = ratings["item_id"].astype(int).unique()
        self.user_index = {u:i for i,u in enumerate(sorted(users))}
        self.item_index = {m:i for i,m in enumerate(sorted(items))}
        self.index_user = {i:u for u,i in self.user_index.items()}
        self.index_item = {i:m for m,i in self.item_index.items()}

    def _to_sparse(self, ratings: pd.DataFrame) -> csr_matrix:
        """Build the users x items CSR matrix of ratings using the current maps."""
        rows = ratings["user_id"].map(self.user_index).to_numpy()
        cols = ratings["item_id"].map(self.item_index).to_numpy()
        data = ratings["rating"].astype(float).to_numpy()
        return csr_matrix((data, (rows, cols)), shape=(len(self.user_index), len(self.item_index)))

    def fit(self, ratings: pd.DataFrame):
        """Fit the SVD on ``ratings`` (columns user_id, item_id, rating); returns self."""
        self._build_maps(ratings)
        # _to_sparse returns a fresh matrix and astype copies it, so no further
        # defensive copy is needed before replacing R.data below.
        R = self._to_sparse(ratings).astype(np.float32)
        self.global_mean = (R.data.mean() if R.nnz > 0 else 0.0)
        if R.nnz > 0:
            # Centre only the stored (observed) entries; implicit zeros stay zero.
            R.data = R.data - self.global_mean
        self.svd.fit(R)
        # NOTE(review): V holds the right singular vectors as-is; they are not
        # weighted by svd.singular_values_. Confirm that unweighted factors are
        # the intended item embedding for similarity ranking.
        V = self.svd.components_.T   # items x k
        U = R @ V                    # users x k
        # normalize for stability (epsilon guards all-zero rows)
        V = V / (np.linalg.norm(V, axis=1, keepdims=True) + 1e-8)
        U = U / (np.linalg.norm(U, axis=1, keepdims=True) + 1e-8)
        self.V, self.U = V, U
        return self

    def recommend_from_liked(self, liked_item_ids, topk=10, exclude_item_ids=None):
        """Return up to ``topk`` ``(item_id, score)`` pairs, best first.

        Args:
            liked_item_ids: item IDs the user likes; unknown IDs are ignored and
                ``None`` is treated as empty.
            topk: number of results. Integral floats such as ``10.0`` are
                accepted, negative values give an empty list, and ``None``
                returns every item.
            exclude_item_ids: item IDs to leave out of the result (optional).

        If none of the liked IDs is known to the model, every item gets the
        global mean as its score, so the returned order carries no preference.
        """
        liked_idx = [self.item_index[i] for i in (liked_item_ids or []) if i in self.item_index]
        if not liked_idx:
            scores = np.zeros(len(self.index_item)) + self.global_mean
        else:
            u_vec = self.V[liked_idx, :].mean(axis=0)
            u_vec = u_vec / (np.linalg.norm(u_vec) + 1e-8)
            scores = u_vec @ self.V.T + self.global_mean
        order = np.argsort(-scores)

        exclude = {self.item_index[i] for i in (exclude_item_ids or []) if i in self.item_index}
        ranked = [i for i in order.tolist() if i not in exclude]

        # Slices need an int; a float topk would raise, and a negative one would
        # silently drop the last item instead of returning nothing.
        limit = None if topk is None else max(int(topk), 0)
        return [(self.index_item[i], float(scores[i])) for i in ranked[:limit]]

# -----------------------------
# Train on 2015–2025 subset
# -----------------------------
# Load the filtered ratings and movie catalogue, then fit the factor model on them.
ratings, movies = load_ml25m_2015_2025(
    year_min=2015,
    year_max=2025,
    min_user_ratings=10,
    max_users=50_000,              # you may lower to 30_000 if RAM is tight
    fast_mode_max_ratings=2_000_000,
)

# fit() returns the model itself, so building and fitting can be two steps.
model = MFRecommender(n_components=100, random_state=42)
model.fit(ratings)

# Helper: quick title lookup
def find_titles(q, k=10):
    """Return up to ``k`` movie titles containing ``q`` (case-insensitive).

    The query is matched as literal text, so characters such as ``(``, ``+``
    or ``.`` search for themselves instead of being parsed as a regular
    expression. An empty, blank or ``None`` query returns an empty list.
    """
    q = (q or "").strip().lower()
    if not q:
        return []
    # regex=False: literal substring match; na=False: a missing title never matches.
    hits = movies["title"].str.lower().str.contains(q, regex=False, na=False)
    return movies.loc[hits, "title"].head(k).tolist()

# Quick smoke test (change titles as you like)
sample_titles = [
    "Mad Max: Fury Road (2015)",
    "La La Land (2016)",
    "Spider-Man: Into the Spider-Verse (2018)",
]
# Titles that are absent from `movies` simply contribute no liked item.
is_sample = movies["title"].isin(sample_titles)
liked_ids = movies.loc[is_sample, "item_id"].tolist()

# The liked movies themselves are excluded from the recommendations.
recs = model.recommend_from_liked(liked_ids, topk=10, exclude_item_ids=liked_ids)
recs_frame = pd.DataFrame(recs, columns=["item_id","score"])
out = recs_frame.merge(movies, on="item_id", how="left")[["title","year","score"]]

print("\nSample recommendations for:", sample_titles, "\n")
print(out.to_string(index=False))

gc.collect();


Downloading MovieLens 25M (~250MB)…
[ML-25M] After filters: ratings=736,601, users=16,939, movies=10,187

Sample recommendations for: ['Mad Max: Fury Road (2015)', 'La La Land (2016)', 'Spider-Man: Into the Spider-Verse (2018)'] 

                                                   title  year    score
                                        Excursion (2018)  2018 3.907298
                                         Stickman (2017)  2017 3.889499
                My Extraordinary Summer with Tess (2019)  2019 3.827049
              Keith Richards: Under the Influence (2015)  2015 3.822817
       Digimon Adventure Tri. - Chapter 6: Future (2018)  2018 3.818893
Digimon Adventure Tri. - Chapter 2: Determination (2016)  2016 3.818892
                              Birthday Wonderland (2019)  2019 3.818892
  Digimon Adventure Tri. - Chapter 5: Coexistence (2017)  2017 3.818892
              Lupin the Third: Lie of Fujiko Mine (2019)  2019 3.818887
                              The Relative Worlds

In [3]:
import gradio as gr
import pandas as pd

# Pick top-N most-rated modern movies as choices (keeps dropdown snappy)
topN = 1000
# Number of ratings per movie, most-rated first.
ratings_per_movie = ratings.groupby("item_id").size().sort_values(ascending=False)
top_movies = ratings_per_movie.head(topN).index.tolist()
# Dropdown entries: titles of those movies in alphabetical order.
is_top_movie = movies["item_id"].isin(top_movies)
choices = movies[is_top_movie].sort_values("title")["title"].tolist()

def recommend_ui(selected_titles, topk):
    """UI callback: turn the selected titles into a table of recommendations.

    Args:
        selected_titles: titles picked in the dropdown; empty or ``None``
            produces an empty table.
        topk: number of rows requested. It is coerced to ``int`` because a
            numeric widget may hand back a float (e.g. ``10.0``), which the
            model would otherwise use as a slice bound.

    Returns:
        DataFrame with columns ``title, year, score``. The selected movies
        are excluded from the result.
    """
    if not selected_titles:
        return pd.DataFrame(columns=["title","year","score"])
    liked_ids = movies.loc[movies["title"].isin(selected_titles), "item_id"].tolist()
    recs = model.recommend_from_liked(liked_ids, topk=int(topk), exclude_item_ids=liked_ids)
    df = pd.DataFrame(recs, columns=["item_id","score"]).merge(
        movies, on="item_id", how="left")[["title","year","score"]]
    return df

# Input widgets: a multi-select of liked titles and the number of results wanted.
liked_dropdown = gr.Dropdown(
    choices=choices,
    label="Pick a few movies you like (2015–2025)",
    multiselect=True,
)
topk_slider = gr.Slider(5, 30, value=10, step=1, label="Top-K")

# Output widget: the table returned by recommend_ui.
results_table = gr.Dataframe(label="Recommendations")

demo = gr.Interface(
    fn=recommend_ui,
    inputs=[liked_dropdown, topk_slider],
    outputs=results_table,
    title="🎬 Movie Recommender (2015–2025, MovieLens 25M)",
    description="Select a few modern movies; get similar recommendations. Model: MF with TruncatedSVD.",
)

# share=True gives you a public URL; queue() handles concurrent clicks nicely
demo.queue().launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2ccf3e97c6feb71575.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


