In [1]:
# ============================================================
# Block 0 ‚Äî Setup & Paths (READ raw, WRITE only to ARTIFACT_DIR)
# Purpose:
#   - Define file paths
#   - Define key hyperparameters (GENOME_DIM, hybrid weights, CF params)
#   - Keep raw dataset read-only
# ============================================================
import os, re, math, warnings
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.decomposition import TruncatedSVD

warnings.filterwarnings("ignore")

# ---- RAW DATA (READ-ONLY) ----
DATA_DIR = "/home/student10/large_dataset"
RATINGS_CSV = os.path.join(DATA_DIR, "ratings.csv")
MOVIES_CSV  = os.path.join(DATA_DIR, "movies.csv")
G_SCORES    = os.path.join(DATA_DIR, "genome-scores.csv")
G_TAGS      = os.path.join(DATA_DIR, "genome-tags.csv")

# ---- ARTIFACTS DIR (WRITE HERE ONLY if needed) ----
ARTIFACT_DIR = "/home/student10/Movie_Recommend"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# NOTE: movie-genres.csv (one-hot) must already exist here
M_GENRES = os.path.join(ARTIFACT_DIR, "movie-genres.csv")

# Content embedding dim for genome
GENOME_DIM = 32

# Final hybrid item-item weights
HYB_W_CONTENT = 0.3
HYB_W_CF      = 0.7

# CF filtering / AE hyperparams (used only to build CF item-item similarity)
CF_ITEM_MIN_RATINGS = 50
CF_USER_MIN_RATINGS = 20
CF_MAX_ITEMS        = 5000
CF_MAX_USERS        = 20000
CF_HIDDEN_DIM       = 256
CF_LATENT_DIM       = 64
CF_BATCH_SIZE       = 256
CF_EPOCHS           = 20

pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 160)

print("RAW DATA_DIR (read-only):", DATA_DIR)
print("ARTIFACT_DIR (write):    ", ARTIFACT_DIR)
print("M_GENRES PATH:           ", M_GENRES)
print("Files exist?",
      os.path.exists(RATINGS_CSV),
      os.path.exists(MOVIES_CSV),
      os.path.exists(M_GENRES),
      os.path.exists(G_SCORES),
      os.path.exists(G_TAGS))


RAW DATA_DIR (read-only): /home/student10/large_dataset
ARTIFACT_DIR (write):     /home/student10/Movie_Recommend
M_GENRES PATH:            /home/student10/Movie_Recommend/movie-genres.csv
Files exist? True True True True True


In [2]:
# =============================
# Block 1 ‚Äî Load CSVs (FAST, does NOT reduce data)
# =============================
movies = pd.read_csv(
    MOVIES_CSV,
    dtype={"movieId": "int64", "title": "string"}
)

genome_tags = pd.read_csv(
    G_TAGS,
    dtype={"tagId": "int64", "tag": "string"}
)

genome_scores = pd.read_csv(
    G_SCORES,
    dtype={"movieId": "int64", "tagId": "int64", "relevance": "float32"}
)

ratings = pd.read_csv(
    RATINGS_CSV,
    dtype={"userId": "int64", "movieId": "int64", "rating": "float32"}
) if os.path.exists(RATINGS_CSV) else None

# clean columns
for df in [movies, genome_tags, genome_scores, ratings]:
    if df is not None:
        df.columns = [c.strip() for c in df.columns]

# year (optional)
def extract_year(title: str):
    if isinstance(title, str):
        m = re.search(r"\((\d{4})\)", title)
        if m: return int(m.group(1))
    return np.nan

if "year" not in movies.columns:
    movies["year"] = movies["title"].astype(str).apply(extract_year)

print("movies       :", movies.shape); display(movies.head(3))
print("genome_tags  :", genome_tags.shape); display(genome_tags.head(3))
print("genome_scores:", genome_scores.shape); display(genome_scores.head(3))
if ratings is not None:
    print("ratings      :", ratings.shape); display(ratings.head(3))


movies       : (86537, 4)


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995.0
2,3,Grumpier Old Men (1995),Comedy|Romance,1995.0


genome_tags  : (1128, 2)


Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century


genome_scores: (18472128, 3)


Unnamed: 0,movieId,tagId,relevance
0,1,1,0.032
1,1,2,0.02225
2,1,3,0.07


ratings      : (33832162, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503


In [3]:
# =============================
# Block 1.5 ‚Äî Option A Movie Universe (movies ‚à© genres only)
# Purpose: keep max movies for CONTENT without requiring genome/ratings
# =============================
assert os.path.exists(M_GENRES), "movie-genres.csv (one-hot) not found"

genres_oh_raw = pd.read_csv(M_GENRES)
genres_oh_raw.columns = [c.strip() for c in genres_oh_raw.columns]
genres_oh_raw["movieId"] = pd.to_numeric(genres_oh_raw["movieId"], errors="coerce").astype("int64")

movie_ids_movies = set(movies["movieId"].astype(int).unique())
movie_ids_genres = set(genres_oh_raw["movieId"].astype(int).unique())

common_movie_ids = sorted(list(movie_ids_movies & movie_ids_genres))
print("Movie universe (Option A: movies ‚à© genres):", len(common_movie_ids))

# Reorder movies ONCE using this universe
movies = movies[movies["movieId"].isin(common_movie_ids)].copy()
movies = movies.set_index("movieId").loc[common_movie_ids].reset_index()

movie_ids = movies["movieId"].astype(int).to_numpy()
id2row = {int(mid): i for i, mid in enumerate(movie_ids)}


Movie universe (Option A: movies ‚à© genres): 86537


In [4]:
# =============================
# Block 2 ‚Äî Genres matrix (aligned to Option A universe)
# =============================
genres_oh = genres_oh_raw.set_index("movieId").loc[movie_ids].reset_index()
genre_cols = [c for c in genres_oh.columns if c != "movieId"]

G = genres_oh[genre_cols].astype(np.float32).to_numpy()
G = G / np.maximum(np.linalg.norm(G, axis=1, keepdims=True), 1e-8)

print("G shape:", G.shape, "| #genres columns:", len(genre_cols))
display(genres_oh.head(3))


G shape: (86537, 20) | #genres columns: 20


Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [5]:
# =============================
# Block 3 ‚Äî Genome tag matrix (KEEP ALL TAGS)
# Notes:
#   - Keeps ALL tag columns from genome_scores
#   - Movies without genome rows become all-zero rows after reindex
# =============================
gs = genome_scores[genome_scores["movieId"].isin(movie_ids)].copy()

tag_mat_df = gs.pivot(index="movieId", columns="tagId", values="relevance").fillna(0.0)
tag_mat_df = tag_mat_df.reindex(movie_ids, fill_value=0.0)

T = tag_mat_df.to_numpy(dtype=np.float32)

print("T shape:", T.shape,
      "| min/max:", float(T.min()), "/", float(T.max()),
      "| density:", f"{(T!=0).sum()/T.size*100:.2f}%")
display(tag_mat_df.iloc[:3, :10])


T shape: (86537, 1128) | min/max: 0.0 / 1.0 | density: 18.92%


tagId,1,2,3,4,5,6,7,8,9,10
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.032,0.02225,0.07,0.059,0.123,0.131,0.06175,0.1955,0.26625,0.033
2,0.0325,0.032,0.0405,0.051,0.1005,0.0635,0.19825,0.07525,0.0975,0.06325
3,0.0415,0.05525,0.02125,0.07225,0.05125,0.044,0.03,0.10375,0.023,0.0325


In [6]:
# =============================
# Block 4 ‚Äî Learn genome embedding: DAE if available, else SVD
# + RMSE on genome_scores (before/after AE)
# Inputs:
#   - T (genome tag matrix) from Block 3 (rows aligned with movies)
# Outputs:
#   - Z (movie embeddings from genome)
#   - (optional) dae, enc, T_hat, rmse metrics
# =============================
USE_TF = False
try:
    import tensorflow as tf
    from tensorflow.keras import layers, models, callbacks
    USE_TF = True
    print("TensorFlow available:", tf.__version__)
except Exception as e:
    print("TensorFlow not available -> using TruncatedSVD:", e)

if USE_TF:
    input_dim = T.shape[1]
    inp = layers.Input(shape=(input_dim,), name="tag_input")

    # Encoder
    x = layers.Dropout(0.2)(inp)
    x = layers.Dense(512, activation="relu")(x)
    z = layers.Dense(GENOME_DIM, activation="relu", name="bottleneck")(x)

    # Decoder
    x = layers.Dense(512, activation="relu")(z)
    out = layers.Dense(input_dim, activation="sigmoid", name="recon")(x)

    dae = models.Model(inp, out, name="dae")
    enc = models.Model(inp, z,  name="encoder")

    dae.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss="mse")

    # Random train/val split by movie rows (better than taking first 90%)
    rng = np.random.default_rng(42)
    idx_all = np.arange(T.shape[0])
    rng.shuffle(idx_all)
    split = int(len(idx_all) * 0.9)
    tr_idx, va_idx = idx_all[:split], idx_all[split:]
    T_train, T_val = T[tr_idx], T[va_idx]

    es = callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

    hist = dae.fit(
        T_train, T_train,
        validation_data=(T_val, T_val),
        epochs=20,
        batch_size=256,
        shuffle=True,
        callbacks=[es],
        verbose=1
    )

    # Movie embedding from encoder
    Z = enc.predict(T, batch_size=512, verbose=1).astype(np.float32)

    # Reconstruction for RMSE check
    T_hat = dae.predict(T, batch_size=512, verbose=0).astype(np.float32)

    rmse_all = float(np.sqrt(((T - T_hat) ** 2).mean()))
    mask_nz = (T != 0.0)
    if mask_nz.any():
        rmse_nz = float(np.sqrt(((T[mask_nz] - T_hat[mask_nz]) ** 2).mean()))
        print(f"üåç AE RMSE on genome_scores (ALL entries)     : {rmse_all:.4f}")
        print(f"üåç AE RMSE on genome_scores (NON-ZERO only) : {rmse_nz:.4f}")
    else:
        print(f"üåç AE RMSE on genome_scores (ALL entries): {rmse_all:.4f}")

else:
    # Fallback: SVD embedding (no AE RMSE)
    svd = TruncatedSVD(n_components=GENOME_DIM, random_state=42)
    Z = svd.fit_transform(T).astype(np.float32)
    print("Explained variance (SVD on genome_scores):",
          float(svd.explained_variance_ratio_.sum()))

print("Z shape:", Z.shape)


2025-12-25 22:52:47.558945: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-12-25 22:52:47.559259: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-25 22:52:47.599798: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-25 22:52:48.791825: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To tur

TensorFlow available: 2.20.0


E0000 00:00:1766677969.251469  354231 cuda_executor.cc:1309] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
E0000 00:00:1766677969.281855  354231 cuda_executor.cc:1309] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
W0000 00:00:1766677969.283779  354231 gpu_device.cc:2342] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 1/20
[1m305/305[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m6s[0m 16ms/step - loss: 0.0221 - val_loss: 0.0067
Epoch 2/20
[1m305/305[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m5s[0m 16ms/step - loss: 0.0065 - val_loss: 0.0066
Epoch 3/20
[1m305/305[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m5s[0m 16ms/step - loss: 0.0065 - val_loss: 0.0066
Epoch 4/20
[1m305/305[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m5s[0m 16ms/step - loss: 0.0065 - val_loss: 0.0066
Epoch 5/20
[1m305/305[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m5s[0m 16ms/step - loss: 0.0065 - val_loss: 0.0066
[1m170/170[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 5ms/step
üåç AE RMSE on genome_scores (ALL entries)     : 0.0806
üåç AE RMSE on genome_

In [7]:
# =============================
# Block 5 ‚Äî Fuse content vectors and index (genres + genome-embedding)
# Inputs:
#   - G from Block 2 (genres vectors, normalized)
#   - Z from Block 4 (genome embedding)
# Outputs:
#   - X (final content vectors, normalized)
#   - movie_ids, id2row
# =============================
ALPHA = 0.3
BETA = 0.7
X = np.concatenate([ALPHA * G, BETA * Z], axis=1).astype(np.float32)
X = X / np.maximum(np.linalg.norm(X, axis=1, keepdims=True), 1e-8)

movie_ids = movies["movieId"].astype(int).to_numpy()
id2row = {int(mid): i for i, mid in enumerate(movie_ids)}

print("X shape:", X.shape)
display(pd.DataFrame(X[:5, :12], columns=[f"f{i}" for i in range(12)]))


X shape: (86537, 52)


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11
0,0.0,0.0,0.001525,0.001525,0.001525,0.001525,0.0,0.0,0.0,0.001525,0.0,0.0
1,0.0,0.0,0.002649,0.0,0.002649,0.0,0.0,0.0,0.0,0.002649,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.004388,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.003888,0.0,0.0,0.003888,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.006362,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# =============================
# Block 6 ‚Äî Similar items + interpretability (content)
# Inputs:
#   - tag_mat_df from Block 3 (movieId √ó tagId)
#   - genome_tags from Block 1
#   - X from Block 5
# Outputs:
#   - top_tags_for_movie()
#   - similar_items_content()
# =============================
tagid_to_name = dict(zip(genome_tags["tagId"].astype(int), genome_tags["tag"].astype(str)))

def top_tags_for_movie(movie_id: int, k=10):
    movie_id = int(movie_id)
    if movie_id not in tag_mat_df.index:
        return pd.DataFrame(columns=["tagId", "tag", "relevance"])
    row = tag_mat_df.loc[movie_id]
    top = row.sort_values(ascending=False).head(k)
    return pd.DataFrame({
        "tagId": top.index.astype(int),
        "tag": [tagid_to_name.get(int(t), str(t)) for t in top.index.astype(int)],
        "relevance": top.values
    })

def similar_items_content(mid: int, topk=10):
    mid = int(mid)
    if mid not in id2row:
        return pd.DataFrame(columns=["movieId","title","content_sim"])
    i = id2row[mid]
    v = X[i]
    sims = (X @ v).astype(np.float32)
    sims[i] = -np.inf

    k = min(topk, len(sims))
    idx = np.argpartition(-sims, kth=k-1)[:k]
    idx = idx[np.argsort(sims[idx])[::-1]]

    return pd.DataFrame({
        "movieId": movies.iloc[idx]["movieId"].astype(int).values,
        "title": movies.iloc[idx]["title"].values,
        "content_sim": sims[idx]
    })

# Demo content
movie_cov = genome_scores.groupby("movieId").size().sort_values(ascending=False)
DEMO_MOVIE_ID = int(movie_cov.index[0]) if not movie_cov.empty else int(movies.iloc[0]["movieId"])
demo_title = movies.loc[movies["movieId"] == DEMO_MOVIE_ID, "title"].iloc[0]
print(f"üé¨ Demo movie: {DEMO_MOVIE_ID} ‚Äî {demo_title}")
display(top_tags_for_movie(DEMO_MOVIE_ID, 10))
display(similar_items_content(DEMO_MOVIE_ID, 10))


üé¨ Demo movie: 288167 ‚Äî Extraction 2 (2023)


Unnamed: 0,tagId,tag,relevance
0,128,betrayal,0.898
1,29,adventure,0.8905
2,195,chase,0.82675
3,481,gunfight,0.81125
4,382,fast paced,0.79525
5,389,fight scenes,0.7915
6,777,pg-13,0.777
7,19,action,0.77625
8,299,destiny,0.76175
9,646,mentor,0.7465


Unnamed: 0,movieId,title,content_sim
0,1036,Die Hard (1988),0.99999
1,192389,Venom (2018),0.999988
2,255335,Shang-Chi and the Legend of the Ten Rings (2021),0.999988
3,165831,Marvel One-Shot: All Hail the King (2014),0.999988
4,494,Executive Decision (1996),0.999988
5,111781,Mission: Impossible - Rogue Nation (2015),0.999987
6,122916,Thor: Ragnarok (2017),0.999987
7,263007,Spider-Man: No Way Home (2021),0.999987
8,195163,Bumblebee (2018),0.999986
9,2353,Enemy of the State (1998),0.999986


In [9]:
# =============================
# Block 7 ‚Äî Content-based item-item (input: movie title OR movieId)
# Notes:
#   - NO timestamp helpers (you don't use timestamp)
# Outputs:
#   - resolve_movie()
#   - similar_by_title_content()
# =============================
def resolve_movie(title_or_mid):
    """
    Resolve input (title string OR movieId int) -> (movieId, row_idx_in_movies)
    movies and movie_ids are aligned, so row_idx matches id2row[mid].
    """
    if isinstance(title_or_mid, str):
        # exact match
        rows = movies.index[movies["title"] == title_or_mid]
        if len(rows) > 0:
            row_idx = int(rows[0])
            mid = int(movies.iloc[row_idx]["movieId"])
            return mid, row_idx

        # case-insensitive contains fallback
        t = title_or_mid.lower()
        rows = movies.index[movies["title"].astype(str).str.lower().str.contains(t, na=False)]
        if len(rows) > 0:
            row_idx = int(rows[0])
            mid = int(movies.iloc[row_idx]["movieId"])
            return mid, row_idx

        return None, None
    else:
        mid = int(title_or_mid)
        row_idx = id2row.get(mid, None)
        if row_idx is None:
            return None, None
        return mid, row_idx

def similar_by_title_content(title_or_mid, topk: int = 10) -> pd.DataFrame:
    """
    Content-based item-item recommendation using X (already normalized).
    """
    mid, row_idx = resolve_movie(title_or_mid)
    if mid is None:
        print("Movie not found:", title_or_mid)
        return pd.DataFrame(columns=["movieId", "title", "content_sim"])

    print(f"üé¨ Query movie (content): {mid} ‚Äî {movies.iloc[row_idx]['title']}")

    i = id2row.get(mid, None)
    if i is None:
        return pd.DataFrame(columns=["movieId", "title", "content_sim"])

    v = X[i]
    sims = (X @ v).astype(np.float32)
    sims[i] = -np.inf

    k = min(topk, len(sims))
    idx = np.argpartition(-sims, kth=k-1)[:k]
    idx = idx[np.argsort(sims[idx])[::-1]]

    return pd.DataFrame({
        "movieId": movies.iloc[idx]["movieId"].astype(int).values,
        "title":   movies.iloc[idx]["title"].values,
        "content_sim": sims[idx],
    })

# Demo
DEMO_TITLE = "Toy Story (1995)"
display(similar_by_title_content(DEMO_TITLE, topk=10))


üé¨ Query movie (content): 1 ‚Äî Toy Story (1995)


Unnamed: 0,movieId,title,content_sim
0,4886,"Monsters, Inc. (2001)",0.999995
1,2355,"Bug's Life, A (1998)",0.999994
2,3114,Toy Story 2 (1999),0.999994
3,78499,Toy Story 3 (2010),0.999992
4,152081,Zootopia (2016),0.999991
5,2294,Antz (1998),0.999991
6,213207,Onward (2020),0.999991
7,108932,The Lego Movie (2014),0.99999
8,247988,Luca (2021),0.99999
9,166461,Moana (2016),0.99999


In [10]:
# =============================
# Block 8 ‚Äî Prepare rating data for CF autoencoder (U-AutoRec) + normalization
# Purpose:
#   - Build CF subset from ratings (still item-based hybrid later)
#   - Uses only (userId, movieId, rating) (NO timestamp)
# Outputs:
#   - cf_ratings, cf_items, cf_users
#   - normalize_rating / denormalize_rating
# =============================
CF_ITEM_MIN_RATINGS = 50
CF_USER_MIN_RATINGS = 20
CF_MAX_ITEMS        = 5000
CF_MAX_USERS        = 20000
CF_LATENT_DIM       = 64
CF_HIDDEN_DIM       = 256
CF_BATCH_SIZE       = 256
CF_EPOCHS           = 20

cf_ratings = None
cf_items   = None
cf_users   = None

R_MIN = None
R_MAX = None
R_SCALE = None

if ratings is None or ratings.empty:
    print("No ratings.csv loaded; skipping CF autoencoder.")
else:
    cf_ratings = ratings.dropna(subset=["userId", "movieId", "rating"]).copy()
    cf_ratings["userId"]  = cf_ratings["userId"].astype(int)
    cf_ratings["movieId"] = cf_ratings["movieId"].astype(int)

    # keep only movies in content universe (movie_ids)
    cf_ratings = cf_ratings[cf_ratings["movieId"].isin(movie_ids)].copy()

    item_counts = cf_ratings["movieId"].value_counts()
    user_counts = cf_ratings["userId"].value_counts()

    good_items = item_counts[item_counts >= CF_ITEM_MIN_RATINGS].index
    good_users = user_counts[user_counts >= CF_USER_MIN_RATINGS].index

    cf_ratings = cf_ratings[
        cf_ratings["movieId"].isin(good_items) &
        cf_ratings["userId"].isin(good_users)
    ].copy()

    if cf_ratings.empty:
        print("After filtering by min ratings, no data left. Lower thresholds.")
        cf_ratings = None
    else:
        top_items = cf_ratings["movieId"].value_counts().index[:CF_MAX_ITEMS]
        top_users = cf_ratings["userId"].value_counts().index[:CF_MAX_USERS]

        cf_ratings = cf_ratings[
            cf_ratings["movieId"].isin(top_items) &
            cf_ratings["userId"].isin(top_users)
        ].copy()

        cf_items = np.sort(cf_ratings["movieId"].unique())
        cf_users = np.sort(cf_ratings["userId"].unique())

        num_items_cf = len(cf_items)
        num_users_cf = len(cf_users)

        print("CF subset:")
        print("  users  :", num_users_cf)
        print("  items  :", num_items_cf)
        print("  ratings:", len(cf_ratings))
        display(cf_ratings.head())

        R_MIN = float(cf_ratings["rating"].min())
        R_MAX = float(cf_ratings["rating"].max())
        R_SCALE = max(1e-8, (R_MAX - R_MIN))
        print(f"Rating range in CF subset: {R_MIN} to {R_MAX}")

        def normalize_rating(r):
            return 0.1 + 0.9 * ((r - R_MIN) / R_SCALE)

        def denormalize_rating(rn):
            rn = np.clip(rn, 0.0, 1.0)
            return R_MIN + ((rn - 0.1) / 0.9) * R_SCALE


CF subset:
  users  : 20000
  items  : 5000
  ratings: 12919097


Unnamed: 0,userId,movieId,rating,timestamp
1538,22,16,3.5,1685231200
1539,22,18,5.0,1536148639
1540,22,32,4.5,1536427843
1541,22,47,4.5,1536155548
1542,22,70,4.5,1536694302


Rating range in CF subset: 0.5 to 5.0


In [11]:
# =============================
# Block 9 ‚Äî Build dense CF matrix R_cf (FAST, vectorized)
# Outputs:
#   - mid2col_cf, uid2row_cf
#   - R_cf (num_users_cf √ó num_items_cf)
# =============================
if cf_ratings is None:
    print("cf_ratings is None ‚Äî skipping Block 9.")
else:
    mid2col_cf = {int(mid): i for i, mid in enumerate(cf_items)}
    uid2row_cf = {int(uid): i for i, uid in enumerate(cf_users)}

    num_items_cf = len(cf_items)
    num_users_cf = len(cf_users)

    R_cf = np.zeros((num_users_cf, num_items_cf), dtype=np.float32)

    u_idx = cf_ratings["userId"].map(uid2row_cf).to_numpy(np.int32)
    i_idx = cf_ratings["movieId"].map(mid2col_cf).to_numpy(np.int32)
    vals  = normalize_rating(cf_ratings["rating"].to_numpy(np.float32))

    R_cf[u_idx, i_idx] = vals

    print("R_cf shape:", R_cf.shape,
          "| density:", f"{(R_cf>0).sum()/R_cf.size*100:.2f}%")


R_cf shape: (20000, 5000) | density: 12.92%


In [12]:
# =============================
# Block 10 ‚Äî Train/val split & generator
# Outputs:
#   - train_users, val_users
#   - cf_batch_generator()
# =============================
if cf_ratings is None:
    print("cf_ratings is None ‚Äî skipping Block 10.")
else:
    rng = np.random.default_rng(42)
    all_users_idx = np.arange(num_users_cf, dtype=np.int32)
    rng.shuffle(all_users_idx)

    split = int(0.9 * num_users_cf)
    train_users = all_users_idx[:split]
    val_users   = all_users_idx[split:]

    print(f"Train users: {len(train_users)}, Val users: {len(val_users)}")

    def cf_batch_generator(user_indices, batch_size=CF_BATCH_SIZE, shuffle=True):
        user_indices = np.array(user_indices, dtype=np.int32)
        n = len(user_indices)
        while True:
            if shuffle:
                np.random.shuffle(user_indices)
            for start in range(0, n, batch_size):
                batch = user_indices[start:start+batch_size]
                Xb = R_cf[batch]
                yield Xb, Xb


Train users: 18000, Val users: 2000


In [13]:
# =============================
# Block 11 ‚Äî Train CF autoencoder (masked MSE)
# Output:
#   - cf_ae
# =============================
if cf_ratings is None:
    print("cf_ratings is None ‚Äî skipping Block 11.")
else:
    if not USE_TF:
        try:
            import tensorflow as tf
            from tensorflow.keras import layers, models, callbacks
            USE_TF = True
            print("TensorFlow available:", tf.__version__)
        except Exception as e:
            USE_TF = False
            print("TensorFlow not available:", e)

    if not USE_TF:
        print("TensorFlow not available; cannot train CF AE.")
    else:
        import tensorflow as tf
        from tensorflow.keras import layers, models, callbacks, regularizers

        def masked_mse(y_true, y_pred):
            mask = tf.cast(tf.greater(y_true, 0.0), tf.float32)
            se = tf.square((y_true - y_pred) * mask)
            se_sum = tf.reduce_sum(se, axis=-1)
            mask_sum = tf.reduce_sum(mask, axis=-1)
            return se_sum / (mask_sum + 1e-8)

        input_dim = num_items_cf

        inp_cf = layers.Input(shape=(input_dim,), name="user_rating_vector_norm")
        h = layers.Dropout(0.5)(inp_cf)
        h = layers.Dense(CF_HIDDEN_DIM, activation="relu",
                         kernel_regularizer=regularizers.l2(1e-4))(h)
        h = layers.Dense(CF_LATENT_DIM, activation="relu",
                         kernel_regularizer=regularizers.l2(1e-4))(h)
        h = layers.Dense(CF_HIDDEN_DIM, activation="relu",
                         kernel_regularizer=regularizers.l2(1e-4))(h)
        out_cf = layers.Dense(input_dim, activation="linear")(h)

        cf_ae = models.Model(inp_cf, out_cf, name="cf_autoencoder")
        cf_ae.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss=masked_mse)

        steps_per_epoch = max(1, math.ceil(len(train_users) / CF_BATCH_SIZE))
        val_steps       = max(1, math.ceil(len(val_users)   / CF_BATCH_SIZE))

        es = callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

        print("Training CF autoencoder...")
        hist_cf = cf_ae.fit(
            cf_batch_generator(train_users, batch_size=CF_BATCH_SIZE, shuffle=True),
            steps_per_epoch=steps_per_epoch,
            validation_data=cf_batch_generator(val_users, batch_size=CF_BATCH_SIZE, shuffle=False),
            validation_steps=val_steps,
            epochs=CF_EPOCHS,
            callbacks=[es],
            verbose=1
        )

        print("Final train loss:", float(hist_cf.history["loss"][-1]))
        print("Final val loss  :", float(hist_cf.history["val_loss"][-1]))


Training CF autoencoder...
Epoch 1/20
[1m71/71[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m2s[0m 21ms/step - loss: 0.1290 - val_loss: 0.0694
Epoch 2/20
[1m71/71[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 19ms/step - loss: 0.0596 - val_loss: 0.0558
Epoch 3/20
[1m71/71[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 19ms/step - loss: 0.0486 - val_loss: 0.0493
Epoch 4/20
[1m71/71[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 19ms/step - loss: 0.0432 - val_loss: 0.0444
Epoch 5/20
[1m71/71[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 19ms/step - loss: 0.0399 - val_loss: 0.0418
Epoch 6/20
[1m71/71[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 19ms/step - loss: 0.0380 - val_loss: 0.0417
Epoch 7/20
[1m71/7

In [14]:
# =============================
# Block RMSE ‚Äî CF AutoEncoder RMSE (Observed entries only)
# Purpose:
#   - Compute RMSE between true ratings (R_cf) and predicted ratings (R_hat)
#   - Only on observed entries (where R_cf > 0)
# Notes:
#   - RMSE is computed in:
#       (1) normalized space  [0.1..1.0]
#       (2) original rating scale (after denormalize)
# Requirements:
#   - R_cf, cf_ae, denormalize_rating must exist
# Outputs:
#   - rmse_norm, rmse_orig
# =============================
import numpy as np

required = ["R_cf", "cf_ae", "denormalize_rating"]
missing = [x for x in required if x not in globals()]
if missing:
    print("‚ö†Ô∏è Missing required variables for RMSE block:", missing)
else:
    # Predict full matrix if not already done
    if "R_hat" not in globals():
        print("R_hat not found ‚Üí predicting full reconstruction...")
        R_hat = cf_ae.predict(R_cf, batch_size=256, verbose=1).astype(np.float32)
    else:
        print("Using existing R_hat with shape:", R_hat.shape)

    # Mask observed ratings
    mask_obs = (R_cf > 0.0)
    n_obs = int(mask_obs.sum())
    print("Observed ratings count:", n_obs)

    if n_obs == 0:
        print("‚ö†Ô∏è No observed ratings found in R_cf (>0). Cannot compute RMSE.")
    else:
        # RMSE in normalized space
        diff2_norm = (R_cf[mask_obs] - R_hat[mask_obs]) ** 2
        rmse_norm = float(np.sqrt(diff2_norm.mean()))

        # RMSE in original rating scale
        true_orig = denormalize_rating(R_cf[mask_obs])
        pred_orig = denormalize_rating(R_hat[mask_obs])
        diff2_orig = (true_orig - pred_orig) ** 2
        rmse_orig = float(np.sqrt(diff2_orig.mean()))

        print(f"üåç RMSE (normalized, observed only): {rmse_norm:.4f}")
        print(f"üåç RMSE (original scale, observed only): {rmse_orig:.4f}")


R_hat not found ‚Üí predicting full reconstruction...
[1m79/79[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 11ms/step
Observed ratings count: 12919097
üåç RMSE (normalized, observed only): 0.1634
üåç RMSE (original scale, observed only): 0.8168


In [15]:
# =============================
# Block 12 ‚Äî Build CF item embedding (ITEM_EMB_CF) + mapping for hybrid
# Outputs:
#   - R_hat, ITEM_EMB_CF
#   - cf_item_rows_valid, cf_valid
# =============================
if cf_ratings is None or (not USE_TF) or ("cf_ae" not in globals()):
    ITEM_EMB_CF = None
    print("CF AE not ready ‚Äî hybrid will be content-only.")
else:
    print("Predicting full reconstruction R_hat...")
    R_hat = cf_ae.predict(R_cf, batch_size=256, verbose=1).astype(np.float32)

    ITEM_EMB_CF = R_hat.T.astype(np.float32)
    ITEM_EMB_CF = ITEM_EMB_CF / np.maximum(np.linalg.norm(ITEM_EMB_CF, axis=1, keepdims=True), 1e-8)

    cf_item_rows = np.array([id2row.get(int(mid), -1) for mid in cf_items], dtype=np.int32)
    cf_valid = cf_item_rows >= 0
    cf_item_rows_valid = cf_item_rows[cf_valid]

    print("ITEM_EMB_CF shape:", ITEM_EMB_CF.shape)


Predicting full reconstruction R_hat...
[1m79/79[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 11ms/step
ITEM_EMB_CF shape: (5000, 20000)


In [16]:
# =============================
# Block 13 ‚Äî Hybrid ITEM-based recommendation (Content + CF)
# Key:
#   - CF scores for movies not in CF subset are NaN (NOT 0)
#   - Normalization ignores NaN
# Output:
#   - hybrid_similar_by_movie()
# =============================
def _minmax_norm(arr: np.ndarray) -> np.ndarray:
    out = np.zeros_like(arr, dtype=np.float32)
    mask = np.isfinite(arr)
    if not mask.any():
        return out
    v = arr[mask]
    vmin, vmax = float(v.min()), float(v.max())
    if vmax - vmin < 1e-8:
        out[mask] = 0.0
        return out
    out[mask] = (arr[mask] - vmin) / (vmax - vmin)
    return out

def hybrid_similar_by_movie(title_or_mid, topk=10, w_content=ALPHA, w_cf=BETA):
    # resolve
    mid, row_idx = resolve_movie(title_or_mid)
    if mid is None:
        print("Movie not found:", title_or_mid)
        return pd.DataFrame()

    i = id2row[mid]
    print(f"üé¨ Query movie (hybrid): {mid} ‚Äî {movies.iloc[i]['title']}")

    # content similarity
    v = X[i]
    content_scores = (X @ v).astype(np.float32)
    content_scores[i] = np.nan

    # CF similarity (NaN for missing)
    cf_scores_full = np.full(len(movies), np.nan, dtype=np.float32)
    if ITEM_EMB_CF is not None and mid in mid2col_cf:
        j = mid2col_cf[mid]
        v_cf = ITEM_EMB_CF[j]
        sim_cf_items = (ITEM_EMB_CF @ v_cf).astype(np.float32)
        cf_scores_full[cf_item_rows_valid] = sim_cf_items[cf_valid]
        cf_scores_full[i] = np.nan

    # normalize + hybrid
    content_norm = _minmax_norm(content_scores)
    cf_norm      = _minmax_norm(cf_scores_full)

    hybrid = w_content * content_norm + w_cf * cf_norm
    hybrid[i] = -np.inf

    k = min(topk, len(hybrid))
    idx = np.argpartition(-hybrid, kth=k-1)[:k]
    idx = idx[np.argsort(hybrid[idx])[::-1]]

    return pd.DataFrame({
        "movieId": movies.iloc[idx]["movieId"].astype(int).values,
        "title":   movies.iloc[idx]["title"].values,
        "hybrid_score": hybrid[idx],
        "content_norm": content_norm[idx],
        "cf_norm":      cf_norm[idx],
    })

# DEMO
DEMO_TITLE = "Toy Story (1995)"
display(hybrid_similar_by_movie(DEMO_TITLE, topk=10, w_content=ALPHA, w_cf=BETA))


üé¨ Query movie (hybrid): 1 ‚Äî Toy Story (1995)


Unnamed: 0,movieId,title,hybrid_score,content_norm,cf_norm
0,3114,Toy Story 2 (1999),0.999999,0.999996,1.0
1,78499,Toy Story 3 (2010),0.999063,0.999987,0.998667
2,50872,Ratatouille (2007),0.995675,0.999948,0.993844
3,6377,Finding Nemo (2003),0.995346,0.999897,0.993396
4,201588,Toy Story 4 (2019),0.995124,0.999976,0.993044
5,8961,"Incredibles, The (2004)",0.995091,0.99993,0.993017
6,4886,"Monsters, Inc. (2001)",0.995074,1.0,0.992963
7,6863,School of Rock (2003),0.994466,0.999869,0.992151
8,68954,Up (2009),0.994382,0.999947,0.991998
9,2640,Superman (1978),0.993437,0.999775,0.99072


In [17]:
# ===== Export inference artifacts (robust) =====
import os, joblib
import numpy as np

# If ARTIFACT_DIR wasn't defined (e.g., you restarted kernel), define it here:
ARTIFACT_DIR = globals().get("ARTIFACT_DIR", "/home/student10/Movie_Recommend")
os.makedirs(ARTIFACT_DIR, exist_ok=True)

ART_PATH = os.path.join(ARTIFACT_DIR, "hybrid_reco_artifacts.joblib")

# Safety: allow CF to be missing
_mid2col_cf = mid2col_cf if "mid2col_cf" in globals() else None
_cf_item_rows_valid = cf_item_rows_valid if "cf_item_rows_valid" in globals() else None
_cf_valid = cf_valid if "cf_valid" in globals() else None

# Make sure required variables exist
required = ["movies", "X", "ALPHA", "BETA"]
missing = [v for v in required if v not in globals()]
if missing:
    raise NameError(f"Missing variables (run previous blocks first): {missing}")

artifacts = {
    # universe
    "movie_ids": movies["movieId"].astype(np.int64).to_numpy(),
    "titles": movies["title"].astype(str).to_list(),

    # content vectors
    "X": X.astype(np.float32),

    # default hybrid weights
    "w_content": float(ALPHA),
    "w_cf": float(BETA),

    # CF optional
    "ITEM_EMB_CF": None if "ITEM_EMB_CF" not in globals() or ITEM_EMB_CF is None else ITEM_EMB_CF.astype(np.float32),
    "mid2col_cf": _mid2col_cf,
    "cf_item_rows_valid": None if _cf_item_rows_valid is None else _cf_item_rows_valid.astype(np.int32),
    "cf_valid": _cf_valid,
}

joblib.dump(artifacts, ART_PATH, compress=3)
print("‚úÖ Saved artifacts to:", ART_PATH)
print("   X shape:", artifacts["X"].shape)
print("   CF ready?:", artifacts["ITEM_EMB_CF"] is not None)


‚úÖ Saved artifacts to: /home/student10/Movie_Recommend/hybrid_reco_artifacts.joblib
   X shape: (86537, 52)
   CF ready?: True


In [18]:
pip install fastapi uvicorn numpy pandas joblib

Collecting fastapi
  Downloading fastapi-0.127.0-py3-none-any.whl.metadata (30 kB)
Collecting uvicorn
  Downloading uvicorn-0.40.0-py3-none-any.whl.metadata (6.7 kB)
Collecting starlette<0.51.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.50.0-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic>=2.7.0 (from fastapi)
  Using cached pydantic-2.12.5-py3-none-any.whl.metadata (90 kB)
Collecting annotated-doc>=0.0.2 (from fastapi)
  Downloading annotated_doc-0.0.4-py3-none-any.whl.metadata (6.6 kB)
Collecting annotated-types>=0.6.0 (from pydantic>=2.7.0->fastapi)
  Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.41.5 (from pydantic>=2.7.0->fastapi)
  Downloading pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting typing-inspection>=0.4.2 (from pydantic>=2.7.0->fastapi)
  Using cached typing_inspection-0.4.2-py3-none-any.whl.metadata (2.6 kB)
Downloading fastapi-0.127.0-py3-none-