# Config & Setup

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
import sys
import json
from math import log2
import hashlib
import random
import numpy as np
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,Dataset
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import math
from tqdm import tqdm
from collections import defaultdict
import torch.utils.data as data

# Single source of truth for paths, split sizes and evaluation settings.
CFG=dict(
    data_root="/content/drive/My Drive/JustiFlicks/data_processed",
    embedding_root="/content/drive/My Drive/JustiFlicks/embeddings",
    artifact_root="/content/drive/My Drive/JustiFlicks/artifacts",
    rating_threshold=4.5,
    k_values=[5,10,20],
    seed=34,
    n_test=5,
    n_val=5,
    min_pos=1,
    reproducibility=dict(
        seeds=[34,35,36],
        sample_frac=0.1,
        min_users=10000,
        max_users=50000,
        k=10,
    ),
)

# Seed every RNG used in the notebook (same order as before) and force
# deterministic cuDNN kernels.
for _seed_fn in (random.seed,np.random.seed,torch.manual_seed,torch.cuda.manual_seed_all):
    _seed_fn(CFG["seed"])
torch.backends.cudnn.deterministic=True
torch.backends.cudnn.benchmark=False

# Record the runtime environment next to the results.
_has_cuda=torch.cuda.is_available()
for _label,_value in (
    ("Python version:",sys.version),
    ("NumPy version:",np.__version__),
    ("Pandas version:",pd.__version__),
    ("PyTorch version:",torch.__version__),
    ("CUDA available:",_has_cuda),
):
    print(_label,_value)
if _has_cuda:
    print("GPU:",torch.cuda.get_device_name(0))

# One artifact sub-directory per stage/model.
for subdir in ["splits","mf","lightfm","neumf","lightgcn","comparision"]:
    os.makedirs(os.path.join(CFG["artifact_root"],subdir),exist_ok=True)

# Persist the configuration used for this run.
config_path=os.path.join(CFG["artifact_root"],"config.json")
with open(config_path,"w") as f:
    json.dump(CFG,f,indent=2)

print("Artifacts root:",CFG["artifact_root"])
print("Config saved to:",config_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Python version: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
NumPy version: 2.0.2
Pandas version: 2.2.2
PyTorch version: 2.9.0+cpu
CUDA available: False
Artifacts root: /content/drive/My Drive/JustiFlicks/artifacts
Config saved to: /content/drive/My Drive/JustiFlicks/artifacts/config.json


In [None]:
# Pin wandb to a known version. This replaces the wandb already present in the
# runtime, so run this cell before the first `import wandb`.
!pip install wandb==0.23.0

Collecting wandb==0.23.0
  Downloading wandb-0.23.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading wandb-0.23.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.2/20.2 MB[0m [31m84.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: wandb
  Attempting uninstall: wandb
    Found existing installation: wandb 0.23.1
    Uninstalling wandb-0.23.1:
      Successfully uninstalled wandb-0.23.1
Successfully installed wandb-0.23.0


In [None]:
# NOTE(review): this install is unpinned, unlike the wandb install above, and
# `implicit` is not imported by the cells visible here — confirm it is needed
# and pin a version for reproducibility.
!pip install implicit

# Splits & Eval

## Splits

In [None]:
import wandb

# Pull the split parameters out of the shared config.
k_values=CFG["k_values"]
rating_threshold=CFG["rating_threshold"]
artifact_root=CFG["artifact_root"]
n_test=CFG["n_test"]
n_val=CFG["n_val"]
min_pos=CFG["min_pos"]

splits_dir=os.path.join(artifact_root,"splits")
os.makedirs(splits_dir,exist_ok=True)

# Order each user's ratings chronologically, then rank them from the most
# recent one (rank 0) backwards and record the user's total rating count.
ratings=pd.read_parquet(os.path.join(CFG["data_root"],"movieRatings.parquet")).sort_values(["userId","datetime"])

g=ratings.groupby("userId",group_keys=False)
ratings["rank_from_end"]=g.cumcount(ascending=False)
ratings["user_count"]=g["userId"].transform("size")

# Leave-last-n split: the last n_test ratings go to test, the n_val before
# those to validation, everything older to train. Users with 10 or fewer
# ratings are excluded from all three splits.
_enough_history=ratings.user_count>10
_holdout=n_val+n_test
_helper_cols=["rank_from_end","user_count"]

train=ratings[_enough_history&(ratings.rank_from_end>=_holdout)].drop(columns=_helper_cols)
val=ratings[_enough_history&(ratings.rank_from_end<_holdout)&(ratings.rank_from_end>=n_test)].drop(columns=_helper_cols)
test=ratings[_enough_history&(ratings.rank_from_end<n_test)].drop(columns=_helper_cols)

train_path=os.path.join(splits_dir,"train.parquet")
val_path=os.path.join(splits_dir,"val.parquet")
test_path=os.path.join(splits_dir,"test.parquet")

for _frame,_path in ((train,train_path),(val,val_path),(test,test_path)):
    _frame.to_parquet(_path,index=False)

# Small CSV samples (at most 10000 rows each) for lightweight artifact logging.
train_sample=train.sample(n=min(10000,len(train)),random_state=CFG["seed"])
val_sample=val.sample(n=min(10000,len(val)),random_state=CFG["seed"])
test_sample=test.sample(n=min(10000,len(test)),random_state=CFG["seed"])

train_sample_path=os.path.join(splits_dir,"train_sample.csv")
val_sample_path=os.path.join(splits_dir,"val_sample.csv")
test_sample_path=os.path.join(splits_dir,"test_sample.csv")

for _frame,_path in ((train_sample,train_sample_path),(val_sample,val_sample_path),(test_sample,test_sample_path)):
    _frame.to_csv(_path,index=False)

# Item support = number of positive (rating >= threshold) train interactions,
# first per movieId and then summed per imdbId.
train_pos=train[train.rating>=rating_threshold]
support=train_pos.groupby("movieId").size().reset_index(name="support")
links=pd.read_parquet(os.path.join(CFG["data_root"],"movieLinks.parquet"))

support_imdb=(
    support
    .merge(links[["movieId","imdbId"]],on="movieId",how="left")
    .groupby("imdbId",dropna=False)["support"]
    .sum()
    .reset_index()
)

# One representative movieId per imdbId (the first one in the link table).
movie_ids_per_imdb=links.groupby("imdbId")["movieId"].first().reset_index()

movie_data=pd.read_parquet(os.path.join(CFG["data_root"],"movieData.parquet"))

# Catalogue table: one row per imdbId with metadata, movieId and train support
# (0 when the title has no positive train interaction).
item_support_table=(
    movie_data[["imdbId","release_year","original_language","num_votes_imdb"]]
    .drop_duplicates("imdbId")
    .merge(movie_ids_per_imdb,on="imdbId",how="left")
    .merge(support_imdb,on="imdbId",how="left")
    .fillna({"support":0})
)

item_support_table["support"]=item_support_table["support"].astype(int)

def support_bin(x):
    """Map a train-support count to its bucket label: "0", "1-4", "5-19" or "20+"."""
    if x==0:
        return "0"
    # Upper bounds are exclusive and checked in ascending order.
    for upper,label in ((5,"1-4"),(20,"5-19")):
        if x<upper:
            return label
    return "20+"

def vote_bin(x):
    """Map an IMDb vote count to a bucket label.

    Returns one of "0", "1-9", "10-99", "100-999", "1000+", or "unknown" when
    the count is missing. Without the explicit missing-value check a NaN fails
    every comparison below and would be reported as "1000+".
    """
    if pd.isna(x):return "unknown"
    if x<=0:return "0"
    if x<10:return "1-9"
    if x<100:return "10-99"
    if x<1000:return "100-999"
    return "1000+"

def era_bin(y):
    """Map a release year to a decade-style era label.

    Years before 1970 share the "1900-1969" bucket and everything from 2020 on
    falls into "2020-2029". A missing year returns "unknown"; without that
    check a NaN fails every comparison and would be reported as "2020-2029".
    """
    if pd.isna(y):return "unknown"
    if y<1970:return "1900-1969"
    if y<1980:return "1970-1979"
    if y<1990:return "1980-1989"
    if y<2000:return "1990-1999"
    if y<2010:return "2000-2009"
    if y<2020:return "2010-2019"
    return "2020-2029"

# Attach the three bucket labels used for sliced evaluation.
item_support_table["support_bin"]=item_support_table.support.apply(support_bin)
item_support_table["imdb_vote_bin"]=item_support_table.num_votes_imdb.apply(vote_bin)
item_support_table["era_bin"]=item_support_table.release_year.apply(era_bin)

item_support_path=os.path.join(splits_dir,"item_support_table.parquet")
item_support_table.to_parquet(item_support_path,index=False)

# CF eligibility: users with at least `min_pos` positive train interactions.
train_pos_initial=train[train.rating>=rating_threshold]
users_before=int(train.userId.nunique())
user_pos_counts=train_pos_initial.groupby("userId").size()
eligible_users=user_pos_counts[user_pos_counts>=min_pos].index
users_after=int(len(eligible_users))
users_removed=int(users_before-users_after)

splits_manifest={
    "train_rows":int(len(train)),
    "val_rows":int(len(val)),
    "test_rows":int(len(test)),
    "unique_items_train":int(train.movieId.nunique()),
    "rating_threshold":rating_threshold,
    "train_eligibility": {
      # Was written as `87,585`: the digit-grouping comma made the dict
      # literal a SyntaxError.
      "n_items": 87585,
      "full_catalog": 424552
    },
    "cf_eligibility":{
        "criterion":"min_positive_in_train",
        "min_pos":min_pos,
        "users_before":users_before,
        "users_after":users_after,
        "users_removed":users_removed
    }

}

manifest_path=os.path.join(splits_dir,"splits_manifest.json")
with open(manifest_path,"w") as f:
    json.dump(splits_manifest,f,indent=2)

# Log the samples, the item table and the manifest as one W&B dataset artifact.
wandb.login()
run=wandb.init(project="JustiFlicks",job_type="model",name="01_splits",reinit=True)

# The JSON round-trip guarantees the metadata holds only plain JSON types.
artifact=wandb.Artifact(name="dataSplits",type="dataset",metadata=json.loads(json.dumps(splits_manifest)))
artifact.add_file(train_sample_path)
artifact.add_file(val_sample_path)
artifact.add_file(test_sample_path)
artifact.add_file(item_support_path)
artifact.add_file(manifest_path)

run.log_artifact(artifact)
artifact.wait()
run.finish()

In [None]:
# Reload the full splits from disk so this cell does not depend on kernel state.
train,val,test=(
    pd.read_parquet(os.path.join(CFG["artifact_root"],_rel))
    for _rel in ("splits/train.parquet","splits/val.parquet","splits/test.parquet")
)

splits_dir=os.path.join(CFG["artifact_root"],"splits")
os.makedirs(splits_dir,exist_ok=True)

rng=np.random.RandomState(CFG["seed"])

# Draw one fifth of the train users (without replacement) for the small splits.
users=train.userId.unique()
n_small=len(users)//5
small_users=set(rng.choice(users,size=n_small,replace=False))

train_small=train[train.userId.isin(small_users)].copy()
val_small=val[val.userId.isin(small_users)].copy()
test_small=test[test.userId.isin(small_users)].copy()

# No (user, movie) pair may appear in more than one split.
for _left,_right in ((train_small,val_small),(train_small,test_small),(val_small,test_small)):
    assert _left.merge(_right,on=["userId","movieId"],how="inner").empty

_order=["userId","datetime"]
train_small=train_small.sort_values(_order)
val_small=val_small.sort_values(_order)
test_small=test_small.sort_values(_order)

for _frame,_name in ((train_small,"train_small.parquet"),(val_small,"val_small.parquet"),(test_small,"test_small.parquet")):
    _frame.to_parquet(os.path.join(splits_dir,_name),index=False)

print(len(train_small),len(val_small),len(test_small))
print("unique users:",train_small.userId.nunique())

5990665 200945 200945
unique users: 40189


## Eval

In [None]:
# Load the persisted splits and the item table used by the evaluation helpers.
train,val,test,item_support_table=(
    pd.read_parquet(os.path.join(CFG["artifact_root"],_rel))
    for _rel in (
        "splits/train.parquet",
        "splits/val.parquet",
        "splits/test.parquet",
        "splits/item_support_table.parquet",
    )
)

_manifest_file=os.path.join(CFG["artifact_root"],"splits/splits_manifest.json")
with open(_manifest_file) as f:
    splits_manifest=json.load(f)

def _relevance_from_rating(r):
    if r is None:
        return 0.0
    try:
        r=float(r)
    except Exception:
        return 0.0
    if r>=5.0:
        return 1
    if r>=4.5:
        return 1
    if r>=4.0:
        return 0
    return 0.0

def _item_relevance(item,gt):
    if isinstance(gt,dict):
        return _relevance_from_rating(gt.get(item,None))
    if isinstance(gt,set):
        return 1.0 if item in gt else 0.0
    return 0.0

def ndcg_at_k_graded(pred,gt,k):
    """NDCG@k of a ranked list against one user's ground truth.

    Args:
        pred: ranked list of item ids.
        gt: dict (item -> rating) or a collection of relevant items.
        k: cut-off rank.

    Returns:
        ``None`` when ``gt`` is an empty set or dict, otherwise DCG/IDCG
        (0.0 when the ideal DCG is zero).
    """
    if isinstance(gt,(set,dict)) and len(gt)==0:
        return None

    # Discounted gain of the top-k predictions, accumulated in rank order.
    dcg=0.0
    rank=0
    for item in pred[:k]:
        rank+=1
        gain=_item_relevance(item,gt)
        if gain>0:
            dcg+=gain/log2(rank+1)

    # Ideal ordering: every ground-truth gain sorted from best to worst.
    if isinstance(gt,dict):
        ideal=[_relevance_from_rating(r) for r in gt.values()]
    else:
        ideal=[1.0 for _ in gt]
    ideal.sort(reverse=True)
    idcg=sum(v/log2(i+1) for i,v in enumerate(ideal[:k],start=1))

    if idcg>0:
        return dcg/idcg
    return 0.0

def recall_at_k_binary(pred,gt,k,threshold=5.0):
    """Recall@k with binary relevance.

    For a dict ground truth the relevant items are those rated at or above
    ``threshold``; for a set every member is relevant; any other type has no
    relevant items. Returns ``None`` when there are no relevant items,
    otherwise hits in the top-k divided by the number of relevant items.
    """
    if isinstance(gt,dict):
        relevant=set()
        for item,rating in gt.items():
            if rating>=threshold:
                relevant.add(item)
    elif isinstance(gt,set):
        relevant=set(gt)
    else:
        relevant=set()

    if not relevant:
        return None

    found=[i for i in pred[:k] if i in relevant]
    return len(found)/len(relevant)

def map_at_k_binary(pred,gt,k,threshold=5.0):
    """Average precision@k with binary relevance.

    Relevant items are chosen as in ``recall_at_k_binary`` (dict entries rated
    at or above ``threshold``, or every member of a set). Returns ``None``
    when there are no relevant items; otherwise the sum of precision values at
    each hit rank divided by ``min(number of relevant items, k)``.
    """
    if isinstance(gt,dict):
        relevant={item for item,rating in gt.items() if rating>=threshold}
    elif isinstance(gt,set):
        relevant=set(gt)
    else:
        relevant=set()

    if not relevant:
        return None

    n_hits=0
    precision_sum=0.0
    rank=0
    for item in pred[:k]:
        rank+=1
        if item not in relevant:
            continue
        n_hits+=1
        precision_sum+=n_hits/rank

    return precision_sum/min(len(relevant),k)

def _build_popularity_map(item_support_df):
    pop_map={}
    if "movieId" in item_support_df.columns and "num_votes_imdb" in item_support_df.columns:
        pop_map=dict(zip(item_support_df["movieId"].tolist(),item_support_df["num_votes_imdb"].fillna(0).astype(float).tolist()))
        return pop_map
    if "imdbId" in item_support_df.columns and "num_votes_imdb" in item_support_df.columns and "movieId" in item_support_df.columns:
        pop_map=dict(zip(item_support_df["movieId"].tolist(),item_support_df["num_votes_imdb"].fillna(0).astype(float).tolist()))
        return pop_map
    if "imdbId" in item_support_df.columns and "num_votes_imdb" in item_support_df.columns:
        try:
            links=pd.read_parquet(os.path.join(CFG["data_root"],"movieLinks.parquet"))
            merged=links.merge(item_support_df[["imdbId","num_votes_imdb"]],on="imdbId",how="left")
            pop_map=dict(zip(merged["movieId"].tolist(),merged["num_votes_imdb"].fillna(0).astype(float).tolist()))
            return pop_map
        except Exception:
            return {}
    return {}

def _bootstrap_ci(values,boot_iters=1000,alpha=0.05,random_state=CFG.get("seed",34)):
    """Mean of ``values`` with a percentile-bootstrap confidence interval.

    ``None`` entries are dropped first. Returns ``(mean, (low, high))`` where
    the bounds are the ``alpha/2`` and ``1-alpha/2`` percentiles of
    ``boot_iters`` resampled means, or ``(None, (None, None))`` when no values
    remain.
    """
    observed=np.array([v for v in values if v is not None])
    n_obs=len(observed)
    if n_obs==0:
        return None,(None,None)

    # One row of resampled indices per bootstrap replicate.
    sampler=np.random.RandomState(random_state)
    resample_idx=sampler.randint(0,n_obs,size=(boot_iters,n_obs))
    replicate_means=np.mean(observed[resample_idx],axis=1)

    lower_pct=100*(alpha/2)
    upper_pct=100*(1-alpha/2)
    point_estimate=float(np.mean(observed))
    lower=float(np.percentile(replicate_means,lower_pct))
    upper=float(np.percentile(replicate_means,upper_pct))
    return point_estimate,(lower,upper)
def _compute_slice_recall(predictions,ground_truth,item_support_df,bin_col,ks,min_users=50):
    bin_map=item_support_df.set_index("movieId")[bin_col].to_dict()
    rows=[]

    for b in item_support_df[bin_col].dropna().unique():
        for k in ks:
            vals=[]
            n_users=0
            for u,gt in ground_truth.items():
                if isinstance(gt,dict):
                    gt_items=set(gt.keys())
                else:
                    gt_items=set(gt)
                gt_b={i for i in gt_items if bin_map.get(i)==b}
                if not gt_b:
                    continue
                preds=predictions.get(u,[])
                vals.append(len([i for i in preds[:k] if i in gt_b])/len(gt_b))
                n_users+=1
            rows.append({
                "bin":b,
                "k":k,
                "recall":float(np.mean(vals)) if len(vals)>=min_users else None,
                "n_users_with_gt":n_users
            })
    return pd.DataFrame(rows)


def evaluate_model(predictions,ground_truth,item_support_df,ks=None,boot_iters=1000,catalog_size=None):
    """Compute ranking, popularity and coverage metrics plus sliced recall.

    Args:
        predictions: dict user -> ranked list of item ids.
        ground_truth: dict user -> dict (item -> rating) or collection of items.
        item_support_df: item table used for popularity and the slice bins.
        ks: cut-off ranks; defaults to ``CFG["k_values"]``.
        boot_iters: bootstrap replicates for the NDCG confidence interval.
        catalog_size: number of items the model could recommend, used as the
            denominator of catalogue coverage. When omitted, the length of a
            module-level ``item2idx`` is used if one exists (the previous,
            implicit behaviour); otherwise the number of distinct ``movieId``
            values in ``item_support_df``.

    Returns:
        ``(metrics, slices)``: ``metrics`` is a flat dict and ``slices`` maps
        ``by_support`` / ``by_imdb_votes`` / ``by_era`` to DataFrames produced
        by ``_compute_slice_recall``.

    Only users with at least one positive are scored: for a dict ground truth
    that means a rating >= 5.0, for other containers a non-empty collection.
    """
    if ks is None:
        ks=CFG["k_values"]

    # The function used to read the global `item2idx` unconditionally, which
    # raises NameError unless a training cell has already run. Resolve the
    # catalogue size explicitly instead.
    if catalog_size is None:
        legacy_index=globals().get("item2idx")
        if legacy_index is not None:
            catalog_size=len(legacy_index)
        elif "movieId" in item_support_df.columns:
            catalog_size=int(item_support_df["movieId"].nunique())
        else:
            catalog_size=0

    users=list(ground_truth.keys())
    eligible_users=[]
    skipped_users=[]

    for u in users:
        gt=ground_truth[u]
        has_pos=any(r>=5.0 for r in gt.values()) if isinstance(gt,dict) else len(gt)>0
        if has_pos:
            eligible_users.append(u)
        else:
            skipped_users.append(u)

    metrics={}
    slices={}

    # Popularity = IMDb vote count; pop_90 is the 90th-percentile vote count.
    pop_map=_build_popularity_map(item_support_df)
    pop_vals=[v for v in pop_map.values() if v is not None]
    pop_90=np.percentile(pop_vals,90) if pop_vals else None

    for k in ks:
        ndcgs=[]
        recall5s=[]
        map5s=[]
        mean_pops=[]
        prop_top=[]

        for u in eligible_users:
            preds=predictions.get(u,[])
            gt=ground_truth[u]

            n=ndcg_at_k_graded(preds,gt,k)
            r5=recall_at_k_binary(preds,gt,k,threshold=5.0)
            m5=map_at_k_binary(preds,gt,k,threshold=5.0)

            if n is not None:ndcgs.append(n)
            if r5 is not None:recall5s.append(r5)
            if m5 is not None:map5s.append(m5)

            # Items missing from the popularity map count as 0 votes.
            pops=[pop_map.get(i,0.0) for i in preds[:k]]
            mean_pops.append(np.mean(pops) if pops else 0.0)

            # The denominator is k even when fewer than k items were predicted.
            if pop_90 is not None and preds:
                prop_top.append(sum(1 for i in preds[:k] if pop_map.get(i,0.0)>=pop_90)/k)

        mean_ndcg,ci=_bootstrap_ci(ndcgs,boot_iters=boot_iters,random_state=CFG["seed"])

        metrics[f"ndcg@{k}"]=mean_ndcg
        metrics[f"ndcg@{k}_ci_low"]=ci[0]
        metrics[f"ndcg@{k}_ci_high"]=ci[1]
        metrics[f"recall5@{k}"]=float(np.mean(recall5s)) if recall5s else None
        metrics[f"map5@{k}"]=float(np.mean(map5s)) if map5s else None
        metrics[f"mean_popularity_num_votes@{k}"]=float(np.mean(mean_pops)) if mean_pops else None
        metrics[f"prop_top10pct_popularity@{k}"]=float(np.mean(prop_top)) if prop_top else None

    # Coverage: distinct items recommended to evaluated users at the largest k.
    all_pred_items=set()
    for u in eligible_users:
        all_pred_items.update(predictions.get(u,[])[:max(ks)])

    metrics["users_total"]=len(users)
    metrics["users_evaluated"]=len(eligible_users)
    metrics["users_skipped"]=len(skipped_users)
    metrics["items_covered"]=len(all_pred_items)
    metrics["evaluation_catalog_items"]=catalog_size
    # An empty catalogue yields None rather than a ZeroDivisionError.
    metrics["catalog_coverage_fraction"]=len(all_pred_items)/catalog_size if catalog_size else None

    slices["by_support"]=_compute_slice_recall(predictions,ground_truth,item_support_df,"support_bin",ks)
    slices["by_imdb_votes"]=_compute_slice_recall(predictions,ground_truth,item_support_df,"imdb_vote_bin",ks)
    slices["by_era"]=_compute_slice_recall(predictions,ground_truth,item_support_df,"era_bin",ks)

    return metrics,slices

# Matrix Factorization (MF)

## Training Method 1

In [None]:
device="cuda" if torch.cuda.is_available() else "cpu"
torch.manual_seed(CFG["seed"])
np.random.seed(CFG["seed"])

rating_threshold=CFG["rating_threshold"]
max_epochs=30
patience=3
min_delta=1e-4

links=pd.read_parquet(os.path.join(CFG["data_root"],"movieLinks.parquet"))
cf_movie_set=set(links.movieId.astype(int))

# Positive interactions only (rating at or above the threshold).
train_cf=train[train.rating>=rating_threshold][["userId","movieId"]].copy()

# Drop interactions whose movie is absent from the link table BEFORE indexing.
# Previously only `item_ids` was filtered, so such rows would get NaN in
# train_cf["i"] and then be cast to a long tensor by PosDataset.
train_cf=train_cf[train_cf.movieId.isin(cf_movie_set)].copy()

user_ids=train_cf.userId.unique()
item_ids=list(train_cf.movieId.unique())

# Contiguous indices in order of first appearance.
item2idx={m:i for i,m in enumerate(item_ids)}
idx2item={i:m for m,i in item2idx.items()}

user2idx={u:i for i,u in enumerate(user_ids)}

train_cf["u"]=train_cf.userId.map(user2idx)
train_cf["i"]=train_cf.movieId.map(item2idx)

# Negative-sampling distribution: item frequency raised to `alpha`, normalised.
item_counts=pd.Series(train_cf.i).value_counts().reindex(range(len(item2idx))).fillna(0).astype(float).values
alpha=0.75
probs=(item_counts**alpha)
probs=probs/probs.sum()

# Fixed random subset of at most 10000 validation users for per-epoch evaluation.
val_users_all=val.userId.unique()
rng=np.random.RandomState(CFG["seed"])
val_users_small=set(rng.choice(val_users_all,size=min(10000,len(val_users_all)),replace=False))

def predict_topk(MFmodel,user_id,k=20):
    """Return the ``k`` highest-scoring movieIds for ``user_id``.

    Every item in ``item2idx`` is scored with ``MFmodel``; items are returned
    in descending score order. Users absent from ``user2idx`` get an empty list.
    """
    if user_id not in user2idx:
        return []
    n_items=len(item2idx)
    all_items=torch.arange(n_items,device=device)
    # Repeat the user index so it pairs element-wise with every item index.
    user_col=torch.tensor([user2idx[user_id]],device=device).repeat(n_items)
    with torch.no_grad():
        scores=MFmodel(user_col,all_items).cpu().numpy()
    ranked=np.argsort(-scores)
    return [idx2item[j] for j in ranked[:k]]

class PosDataset(Dataset):
    """Dataset of positive (user index, item index) pairs.

    Reads the ``u`` and ``i`` columns of ``df`` into long tensors.
    """
    def __init__(self,df):
        def as_long(column):
            return torch.tensor(df[column].values,dtype=torch.long)
        self.u=as_long("u")
        self.i=as_long("i")

    def __len__(self):
        return self.u.shape[0]

    def __getitem__(self,idx):
        return (self.u[idx],self.i[idx])

# Shuffled mini-batches of positive pairs; drop_last=True discards the final
# incomplete batch so every batch holds exactly 4096 pairs.
ds=PosDataset(train_cf)
loader=DataLoader(ds,batch_size=4096,shuffle=True,drop_last=True)

class MF_BPR(nn.Module):
    """Matrix-factorisation scorer with user and item bias terms.

    The score of a (user, item) pair is the dot product of their ``k``-dim
    embeddings plus the user bias plus the item bias.
    """
    def __init__(self,n_users,n_items,k):
        super().__init__()
        # Creation and init order is kept fixed so the RNG stream is unchanged.
        self.user_emb=nn.Embedding(n_users,k)
        self.item_emb=nn.Embedding(n_items,k)
        self.user_bias=nn.Embedding(n_users,1)
        self.item_bias=nn.Embedding(n_items,1)
        for table in (self.user_emb,self.item_emb):
            nn.init.normal_(table.weight,std=0.01)
        for table in (self.user_bias,self.item_bias):
            nn.init.constant_(table.weight,0.0)

    def score(self,u,i):
        """Score index tensors ``u`` and ``i`` element-wise."""
        interaction=torch.sum(self.user_emb(u)*self.item_emb(i),dim=1)
        user_offset=self.user_bias(u).squeeze(-1)
        item_offset=self.item_bias(i).squeeze(-1)
        return interaction+user_offset+item_offset

    def forward(self,u,i):
        return self.score(u,i)

MFmodel=MF_BPR(len(user2idx),len(item2idx),k=32).to(device)
opt=torch.optim.Adam(MFmodel.parameters(),lr=1e-3)

lambda_item=1e-4
lambda_bias=1e-6

best_metric=-np.inf
epochs_no_improve=0
best_state=None

rng=np.random.RandomState(CFG["seed"])

# Validation ground truth depends only on `val` and `val_users_small`, not on
# the model, so it is built once here instead of being rebuilt every epoch.
val_ground_truth={}
for uid,g in val.groupby("userId"):
    if uid in val_users_small:
        val_ground_truth[int(uid)]={int(m):float(r) for m,r in zip(g.movieId,g.rating)}
# Users we can actually score: sampled for validation AND seen in training.
val_pred_users=[uid for uid in val_ground_truth if uid in user2idx]

for epoch in range(max_epochs):
    MFmodel.train()
    losses=[]
    for u_batch,i_batch in loader:
        bs=len(u_batch)
        # One popularity-weighted negative item per positive pair.
        neg_idx=rng.choice(len(item2idx),size=bs,p=probs)
        u=u_batch.to(device)
        pos=i_batch.to(device)
        neg=torch.tensor(neg_idx,dtype=torch.long,device=device)

        pos_score=MFmodel.score(u,pos)
        neg_score=MFmodel.score(u,neg)

        # BPR: push each positive score above its sampled negative score.
        loss_bpr=-F.logsigmoid(pos_score-neg_score).mean()

        # L2 penalty on the item embeddings and item biases used in this batch.
        pos_ie=MFmodel.item_emb(pos)
        neg_ie=MFmodel.item_emb(neg)
        reg_item=(pos_ie.pow(2).sum()+neg_ie.pow(2).sum())/bs

        pos_b=MFmodel.item_bias(pos).squeeze(-1)
        neg_b=MFmodel.item_bias(neg).squeeze(-1)
        reg_bias=(pos_b.pow(2).sum()+neg_b.pow(2).sum())/bs

        loss=loss_bpr+lambda_item*reg_item+lambda_bias*reg_bias

        opt.zero_grad()
        loss.backward()
        opt.step()
        losses.append(loss.item())

    MFmodel.eval()
    with torch.no_grad():
        val_predictions={uid:predict_topk(MFmodel,uid,k=10) for uid in val_pred_users}

        val_metrics,_=evaluate_model(val_predictions,val_ground_truth,item_support_table,ks=[10])
        current_metric=val_metrics["ndcg@10"]
        ci_low=val_metrics.get("ndcg@10_ci_low")
        ci_high=val_metrics.get("ndcg@10_ci_high")

    print(f"epoch={epoch} loss={np.mean(losses):.4f} val_ndcg@10={current_metric:.6f} ci=({ci_low:.6f},{ci_high:.6f})")

    # Early stopping on validation NDCG@10; keep a copy of the best weights.
    if current_metric>best_metric+min_delta:
        best_metric=current_metric
        best_state={k:v.clone() for k,v in MFmodel.state_dict().items()}
        epochs_no_improve=0
    else:
        epochs_no_improve+=1

    if epochs_no_improve>=patience:
        print("early stopping triggered")
        break

# Restore the best checkpoint before the model is used downstream.
if best_state is not None:
    MFmodel.load_state_dict(best_state)

MFmodel.eval()

epoch=0 loss=0.5706 val_ndcg@10=0.038849 ci=(0.035603,0.042153)
epoch=1 loss=0.4107 val_ndcg@10=0.037460 ci=(0.034208,0.040568)
epoch=2 loss=0.3595 val_ndcg@10=0.038819 ci=(0.035480,0.041864)
epoch=3 loss=0.3298 val_ndcg@10=0.037878 ci=(0.034785,0.041096)
early stopping triggered


MF_BPR(
  (user_emb): Embedding(195767, 32)
  (item_emb): Embedding(41823, 32)
  (user_bias): Embedding(195767, 1)
  (item_bias): Embedding(41823, 1)
)

## Training Method 2

In [None]:
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset,DataLoader

device="cuda" if torch.cuda.is_available() else "cpu"
torch.manual_seed(CFG["seed"])
np.random.seed(CFG["seed"])

# Read the threshold from the shared config (it was hard-coded to 4.5 here) so
# this cell cannot drift from the splits, Training Method 1 and the saved
# model metadata, all of which use CFG["rating_threshold"].
rating_threshold=CFG["rating_threshold"]
max_epochs=30
patience=3
min_delta=1e-4
embedding_dim=32
batch_size=512

# Positive interactions only (rating at or above the threshold).
train_cf=train[train.rating>=rating_threshold][["userId","movieId"]].copy()

user_ids=train_cf.userId.unique()
item_ids=train_cf.movieId.unique()

# Contiguous indices in order of first appearance.
user2idx={u:i for i,u in enumerate(user_ids)}
item2idx={m:i for i,m in enumerate(item_ids)}
idx2item={i:m for m,i in item2idx.items()}

train_cf["u"]=train_cf.userId.map(user2idx)
train_cf["i"]=train_cf.movieId.map(item2idx)

# Negative-sampling distribution: item frequency raised to `alpha`, normalised;
# reindexing by item_ids aligns the counts with item2idx order.
item_counts=train_cf.movieId.value_counts().reindex(item_ids).fillna(0).astype(float).values
alpha=0.5
probs=(item_counts**alpha)
probs=probs/probs.sum()

class PosDataset(Dataset):
    """Dataset of positive (user index, item index) pairs.

    Reads the ``u`` and ``i`` columns of ``df`` into long tensors.
    """
    def __init__(self,df):
        def as_long(column):
            return torch.tensor(df[column].values,dtype=torch.long)
        self.u=as_long("u")
        self.i=as_long("i")

    def __len__(self):
        return self.u.shape[0]

    def __getitem__(self,idx):
        return (self.u[idx],self.i[idx])

# Shuffled mini-batches of positive pairs; drop_last=True keeps every batch at
# exactly `batch_size` rows.
ds=PosDataset(train_cf)
loader=DataLoader(ds,batch_size=batch_size,shuffle=True,drop_last=True)

class MF_BPR(nn.Module):
    """Matrix-factorisation scorer with user and item bias terms.

    The score of a (user, item) pair is the dot product of their ``k``-dim
    embeddings plus the user bias plus the item bias.
    """
    def __init__(self,n_users,n_items,k):
        super().__init__()
        # Creation and init order is kept fixed so the RNG stream is unchanged.
        self.user_emb=nn.Embedding(n_users,k)
        self.item_emb=nn.Embedding(n_items,k)
        self.user_bias=nn.Embedding(n_users,1)
        self.item_bias=nn.Embedding(n_items,1)
        for table in (self.user_emb,self.item_emb):
            nn.init.normal_(table.weight,std=0.01)
        for table in (self.user_bias,self.item_bias):
            nn.init.constant_(table.weight,0.0)

    def score(self,u,i):
        """Score index tensors ``u`` and ``i`` element-wise."""
        interaction=torch.sum(self.user_emb(u)*self.item_emb(i),dim=1)
        user_offset=self.user_bias(u).squeeze(-1)
        item_offset=self.item_bias(i).squeeze(-1)
        return interaction+user_offset+item_offset

    def forward(self,u,i):
        return self.score(u,i)

MFmodel=MF_BPR(len(user2idx),len(item2idx),embedding_dim).to(device)
opt=torch.optim.Adam(MFmodel.parameters(),lr=1e-3)

lambda_item=1e-4
lambda_bias=1e-6

best_metric=-np.inf
epochs_no_improve=0
best_state=None

rng=np.random.RandomState(CFG["seed"])

# Validation ground truth and the "all items" index tensor do not depend on
# the model, so build them once instead of once per epoch.
val_ground_truth={
    int(uid):{int(m):float(r) for m,r in zip(g.movieId,g.rating)}
    for uid,g in val.groupby("userId")
}
val_pred_users=[uid for uid in val_ground_truth if uid in user2idx]
all_items_t=torch.arange(len(item2idx),device=device)

for epoch in range(max_epochs):
    MFmodel.train()
    losses=[]
    for u_batch,i_batch in loader:
        bs=len(u_batch)
        u=u_batch.to(device)
        pos=i_batch.to(device)

        # Hard in-batch negative: for each user, the other row's positive item
        # with the highest dot-product score. The mining result is only used
        # as indices, so it runs without building an autograd graph.
        # NOTE(review): the mined item can equal the row's own positive when
        # another row in the batch shares that item — confirm this is acceptable.
        with torch.no_grad():
            user_e=MFmodel.user_emb(u)
            pos_e=MFmodel.item_emb(pos)
            score_mat=torch.matmul(user_e,pos_e.t())
            score_mat.fill_diagonal_(-1e9)
            inbatch_idx=score_mat.argmax(dim=1)
        neg_inbatch=pos[inbatch_idx]

        # Popularity-weighted random negatives.
        rand_idx=rng.choice(len(item2idx),size=bs,p=probs)
        neg_rand=torch.tensor(rand_idx,dtype=torch.long,device=device)

        # Rows where the mask is True (probability 0.2) use the random
        # negative; the others use the hard in-batch negative.
        mix_mask=torch.rand(bs,device=device)<0.2
        neg=torch.where(mix_mask,neg_rand,neg_inbatch)

        pos_score=MFmodel.score(u,pos)
        neg_score=MFmodel.score(u,neg)

        loss_bpr=-F.logsigmoid(pos_score-neg_score).mean()

        # L2 penalty on the item embeddings and item biases used in this batch.
        reg_item=(MFmodel.item_emb(pos).pow(2).sum()+MFmodel.item_emb(neg).pow(2).sum())/bs
        reg_bias=(MFmodel.item_bias(pos).pow(2).sum()+MFmodel.item_bias(neg).pow(2).sum())/bs

        loss=loss_bpr+lambda_item*reg_item+lambda_bias*reg_bias

        opt.zero_grad()
        loss.backward()
        opt.step()
        losses.append(loss.item())

    MFmodel.eval()
    with torch.no_grad():
        val_predictions={}
        for uid in val_pred_users:
            u_t=torch.tensor([user2idx[uid]],device=device)
            scores=MFmodel(u_t.repeat(len(all_items_t)),all_items_t).cpu().numpy()
            topk=np.argsort(-scores)[:10]
            val_predictions[uid]=[idx2item[j] for j in topk]

        val_metrics,_=evaluate_model(val_predictions,val_ground_truth,item_support_table,ks=[10])
        current_metric=val_metrics["ndcg@10"]
        ci_low=val_metrics.get("ndcg@10_ci_low")
        ci_high=val_metrics.get("ndcg@10_ci_high")

    print(f"epoch={epoch} loss={np.mean(losses):.4f} val_ndcg@10={current_metric:.6f} ci=({ci_low:.6f},{ci_high:.6f})")

    # NOTE(review): improvement requires the CI *lower bound* to beat the best
    # *mean* so far, which is stricter than the mean-vs-mean test in Training
    # Method 1 — confirm this is intended.
    if ci_low is not None and ci_low>best_metric+min_delta:
        best_metric=current_metric
        best_state={k:v.clone() for k,v in MFmodel.state_dict().items()}
        epochs_no_improve=0
    else:
        epochs_no_improve+=1

    if epochs_no_improve>=patience:
        print("early stopping triggered")
        break

# Restore the best checkpoint before the model is used downstream.
if best_state is not None:
    MFmodel.load_state_dict(best_state)

MFmodel.eval()

epoch=0 loss=0.6089 val_ndcg@10=0.037832 ci=(0.037065,0.038581)
epoch=1 loss=0.5624 val_ndcg@10=0.036924 ci=(0.036180,0.037690)
epoch=2 loss=0.5332 val_ndcg@10=0.036674 ci=(0.035945,0.037442)
epoch=3 loss=0.5119 val_ndcg@10=0.036681 ci=(0.035930,0.037440)
early stopping triggered


MF_BPR(
  (user_emb): Embedding(195767, 32)
  (item_emb): Embedding(41823, 32)
  (user_bias): Embedding(195767, 1)
  (item_bias): Embedding(41823, 1)
)

## Save Model

In [None]:
import wandb

wandb.login()

mf_dir=os.path.join(CFG["artifact_root"],"MF")
os.makedirs(mf_dir,exist_ok=True)

# Pull the learned parameters off the device as NumPy arrays; biases are
# flattened to 1-D.
user_emb=MFmodel.user_emb.weight.detach().cpu().numpy()
item_emb=MFmodel.item_emb.weight.detach().cpu().numpy()
user_bias=MFmodel.user_bias.weight.detach().cpu().numpy().reshape(-1)
item_bias=MFmodel.item_bias.weight.detach().cpu().numpy().reshape(-1)

# Index tables tying raw ids to embedding rows. The bias columns line up
# because the id->index dicts were built with consecutive indices in insertion order.
user_index=pd.DataFrame({
    "userId":list(user2idx.keys()),
    "user_idx":list(user2idx.values()),
    "user_bias":user_bias
})

item_index=pd.DataFrame({
    "movieId":list(item2idx.keys()),
    "item_idx":list(item2idx.values()),
    "item_bias":item_bias
})

mf_metadata={
    "model_type":"MF_BPR",
    "embedding_dim":int(user_emb.shape[1]),
    "n_users":int(user_emb.shape[0]),
    "n_items":int(item_emb.shape[0]),
    "rating_threshold":CFG["rating_threshold"],
    "loss":"BPR",
    "negative_sampling":"popularity_aware",
    "regularization":{"item":1e-4,"bias":1e-6},
    "early_stopping":{"patience":3,"min_delta":1e-4},
    "seed":CFG["seed"]
}

def _write_mf_bundle(out_dir):
    """Write the MF arrays, index tables, weights and metadata into ``out_dir``.

    Returns the path of the saved state-dict file.
    """
    for _filename,_array in (
        ("user_embeddings.npy",user_emb),
        ("item_embeddings.npy",item_emb),
        ("user_bias.npy",user_bias),
        ("item_bias.npy",item_bias),
    ):
        np.save(os.path.join(out_dir,_filename),_array)

    user_index.to_csv(os.path.join(out_dir,"user_index.csv"),index=False)
    item_index.to_csv(os.path.join(out_dir,"item_index.csv"),index=False)

    weights_path=os.path.join(out_dir,"mf_bpr_model.pt")
    torch.save(MFmodel.state_dict(),weights_path)

    with open(os.path.join(out_dir,"mf_metadata.json"),"w") as f:
        json.dump(mf_metadata,f,indent=2)
    return weights_path

# 1) Persistent copy under the artifact root.
model_path=_write_mf_bundle(mf_dir)

# 2) Copy inside the W&B run directory, logged as a model artifact.
run=wandb.init(project="JustiFlicks",job_type="model",name="02_MF_BPR_baseline",reinit=True)

artifact_dir=os.path.join(run.dir,"MF")
os.makedirs(artifact_dir,exist_ok=True)
_write_mf_bundle(artifact_dir)

artifact=wandb.Artifact(
    name="MF_BPR_baseline",
    type="model",
    metadata=mf_metadata
)

artifact.add_dir(artifact_dir)
run.log_artifact(artifact)

run.finish()

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20260103_233838-6fn7wm59/files/MF)... Done. 0.2s


## Eval

In [None]:
mf_dir=os.path.join(CFG["artifact_root"],"MF")

# Saved item index table (embedding row <-> movieId) written by the save cell.
_item_index_path=os.path.join(CFG["artifact_root"],"MF","item_index.csv")
item_index=pd.read_csv(_item_index_path)
idx2movie={row_idx:movie_id for row_idx,movie_id in zip(item_index.item_idx,item_index.movieId)}

# Item table with the support / vote / era bins used for sliced recall.
splits_item_support_path=os.path.join(CFG["artifact_root"],"splits","item_support_table.parquet")
item_support_table=pd.read_parquet(splits_item_support_path)

test_users=test.userId.unique().tolist()

def predict_topk_movieids(model,user_id,k=20):
    """Best-effort top-k recommendation for one user.

    Delegates to ``predict_topk``; if that call raises, an empty list is
    returned so a single failing user does not abort a full evaluation pass.

    Args:
        model: model object forwarded unchanged to ``predict_topk``.
        user_id: raw user identifier forwarded unchanged to ``predict_topk``.
        k: number of recommendations requested.

    Returns:
        Whatever ``predict_topk`` returns, or ``[]`` when it raises.
    """
    try:
        return predict_topk(model,user_id,k=k)
    except Exception as exc:
        # Keep the best-effort contract, but surface the failure instead of
        # hiding it: a systematic error would otherwise just look like poor
        # metrics. The message omits the user id so Python's default warning
        # filter reports each exception type once per call site.
        import warnings
        warnings.warn(
            f"predict_topk failed with {type(exc).__name__}; returning no recommendations",
            RuntimeWarning,
            stacklevel=2,
        )
        return []


# Relevance labels: userId -> {movieId: rating} for every held-out test row.
ground_truth={
    int(uid):{int(movie):float(rating) for movie,rating in zip(g["movieId"],g["rating"])}
    for uid,g in test.groupby("userId")
}

# Users without a training index get an empty recommendation list so they
# are still present in the predictions handed to the evaluator.
top_k=max(CFG["k_values"])
predictions={
    u:(predict_topk_movieids(MFmodel,u,k=top_k) if u in user2idx else [])
    for u in ground_truth
}

metrics,slices=evaluate_model(predictions,ground_truth,item_support_table,ks=CFG["k_values"])

print("METRICS")
print(json.dumps(metrics,indent=2))

METRICS
{
  "ndcg@5": 0.022471047626556325,
  "ndcg@5_ci_low": 0.02187858406977611,
  "ndcg@5_ci_high": 0.023090926884242812,
  "recall5@5": 0.03091017241941267,
  "map5@5": 0.014779095096856903,
  "mean_popularity_num_votes@5": 1427231.0641048204,
  "prop_top10pct_popularity@5": 1.0,
  "ndcg@10": 0.03340271862547721,
  "ndcg@10_ci_low": 0.03278057277749771,
  "ndcg@10_ci_high": 0.03411341629272837,
  "recall5@10": 0.05868126853753137,
  "map5@10": 0.01881639191154445,
  "mean_popularity_num_votes@10": 1315800.106584531,
  "prop_top10pct_popularity@10": 0.9999990071288151,
  "ndcg@20": 0.04698636839831592,
  "ndcg@20_ci_low": 0.046285619789340586,
  "ndcg@20_ci_high": 0.047724457883555144,
  "recall5@20": 0.10206349206349206,
  "map5@20": 0.022287570427294148,
  "mean_popularity_num_votes@20": 1171725.9244499854,
  "prop_top10pct_popularity@20": 0.9999965249508528,
  "users_total": 200948,
  "users_evaluated": 102270,
  "users_skipped": 98678,
  "items_covered": 790,
  "evaluation_cata

In [None]:
# Show each slice table under its heading.
for heading,slice_key in (
    ("\nSLICES BY SUPPORT","by_support"),
    ("\nSLICES BY IMDb VOTES","by_imdb_votes"),
    ("\nSLICES BY ERA","by_era"),
):
    print(heading)
    display(slices[slice_key])

# Persist the aggregate metrics and a long-format copy of all slice tables.
mf_dir=os.path.join(CFG["artifact_root"],"MF")
os.makedirs(mf_dir,exist_ok=True)
metrics_path=os.path.join(mf_dir,"metrics.json")
slices_path=os.path.join(mf_dir,"slices_long.csv")

with open(metrics_path,"w") as f:
    json.dump(metrics,f,indent=2)

# Tag every table with the name of the slice it came from before stacking.
slices_long=pd.concat(
    [table.assign(slice_type=slice_name) for slice_name,table in slices.items()],
    ignore_index=True
)
slices_long.to_csv(slices_path,index=False)




SLICES BY SUPPORT


Unnamed: 0,bin,k,recall,n_users_with_gt
0,1-4,5,0.0,3284
1,1-4,10,0.0,3284
2,1-4,20,0.0,3284
3,0,5,0.0,1432
4,0,10,0.0,1432
5,0,20,0.0,1432
6,5-19,5,0.0,7233
7,5-19,10,0.0,7233
8,5-19,20,0.0,7233
9,20+,5,0.014472,200734



SLICES BY IMDb VOTES


Unnamed: 0,bin,k,recall,n_users_with_gt
0,1000+,5,0.014412,200923
1,1000+,10,0.028519,200923
2,1000+,20,0.053767,200923
3,100-999,5,0.0,3483
4,100-999,10,0.0,3483
5,100-999,20,0.0,3483
6,10-99,5,0.0,408
7,10-99,10,0.0,408
8,10-99,20,0.0,408
9,1-9,5,,1



SLICES BY ERA


Unnamed: 0,bin,k,recall,n_users_with_gt
0,1900-1969,5,0.009612,36193
1,1900-1969,10,0.019609,36193
2,1900-1969,20,0.037152,36193
3,2020-2029,5,0.0,9059
4,2020-2029,10,0.0,9059
5,2020-2029,20,0.00011,9059
6,1990-1999,5,0.025077,137335
7,1990-1999,10,0.044835,137335
8,1990-1999,20,0.077842,137335
9,2010-2019,5,0.011147,63228


In [None]:
# Sanity check: how many distinct movies survive each filtering stage.
links=pd.read_parquet(os.path.join(CFG["data_root"],"movieLinks.parquet"))
ratings_all=pd.read_parquet(os.path.join(CFG["data_root"],"movieRatings.parquet"))

print("unique in movieLinks:",links.movieId.nunique())
print("unique in ratings (all):",ratings_all.movieId.nunique())
print("unique in train (full):",train.movieId.nunique())
print("unique in train_cf (rating>=threshold):",train_cf.movieId.nunique())
print("intersection train_cf ∩ links:",len(set(train_cf.movieId.unique()) & set(links.movieId.astype(int))))
# Read the threshold straight from CFG so this cell does not depend on a
# `rating_threshold` global assigned by some other cell.
ratings_all2 = ratings_all[ratings_all.rating >= CFG["rating_threshold"]][["userId", "movieId"]].copy()
# Label fixed: this count is for the thresholded ratings, not all ratings.
print("unique in ratings (rating>=threshold):",ratings_all2.movieId.nunique())

unique in movieLinks: 87585
unique in ratings (all): 84432
unique in train (full): 83621
unique in train_cf (rating>=threshold): 41823
intersection train_cf ∩ links: 41823
unique in ratings (all): 42721


## Save Metrics

In [None]:
# Reload the MF metrics written by the evaluation cell and publish them.
mf_dir=os.path.join(CFG["artifact_root"],"MF")
metrics_path=os.path.join(mf_dir,"metrics.json")
slices_path=os.path.join(mf_dir,"slices_long.csv")

with open(metrics_path) as f:
    metrics=json.load(f)

run=wandb.init(project="JustiFlicks",job_type="eval",name="02_MF_BPR_evaluation",reinit=True)
run.log(metrics)

# More than 50000 users is recorded as a full (non-subsampled) evaluation.
if metrics["users_total"]>50000:
    evaluation_scope="full"
else:
    evaluation_scope="subsample"

eval_artifact=wandb.Artifact(
    name="MF_BPR_evaluation",
    type="evaluation",
    metadata={
        "model":"MF_BPR_baseline",
        "catalog_restricted":True,
        "evaluation_users":evaluation_scope,
        "primary_metric":"ndcg@10"
    }
)

for artifact_file in (metrics_path,slices_path):
    eval_artifact.add_file(artifact_file)

run.log_artifact(eval_artifact)
run.finish()

0,1
catalog_coverage_fraction,▁
evaluation_catalog_items,▁
items_covered,▁
map5@10,▁
map5@20,▁
map5@5,▁
mean_popularity_num_votes@10,▁
mean_popularity_num_votes@20,▁
mean_popularity_num_votes@5,▁
ndcg@10,▁

0,1
catalog_coverage_fraction,0.01889
evaluation_catalog_items,41823
items_covered,790
map5@10,0.01882
map5@20,0.02229
map5@5,0.01478
mean_popularity_num_votes@10,1315800.10658
mean_popularity_num_votes@20,1171725.92445
mean_popularity_num_votes@5,1427231.0641
ndcg@10,0.0334


# Implicit BPR

## Model Training

In [None]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from implicit.bpr import BayesianPersonalizedRanking

np.random.seed(CFG["seed"])

# Training hyper-parameters for the implicit BPR baseline.
rating_threshold=CFG["rating_threshold"]
max_epochs=30
patience=3
min_delta=1e-4

# Positive interactions: ratings at or above the threshold.
train_cf=train[train.rating>=rating_threshold][["userId","movieId"]].copy()

# Keep only movies that have an entry in the links table.
links=pd.read_parquet(os.path.join(CFG["data_root"],"movieLinks.parquet"))
cf_movie_set=set(links.movieId.astype(int))

# .copy() so the column assignments below write to an independent frame
# rather than to a filtered view of the previous one.
train_cf=train_cf[train_cf.movieId.isin(cf_movie_set)].copy()

user_ids=train_cf.userId.unique()
item_ids=train_cf.movieId.unique()

# Contiguous indices in order of first appearance.
user2idx={u:i for i,u in enumerate(user_ids)}
item2idx={m:i for i,m in enumerate(item_ids)}
idx2item={i:m for m,i in item2idx.items()}

train_cf["u"]=train_cf.userId.map(user2idx)
train_cf["i"]=train_cf.movieId.map(item2idx)

n_users=len(user2idx)
n_items=len(item2idx)

rows=train_cf.u.values
cols=train_cf.i.values
# Deliberately NOT named `data`: the notebook imports `torch.utils.data as data`
# and later cells use `data.Dataset` / `data.DataLoader`; rebinding that name
# to an array breaks them on a top-to-bottom run.
interaction_values=np.ones(len(train_cf),dtype=np.float32)

# Binary user x item interaction matrix in CSR form.
user_item=sp.coo_matrix(
    (interaction_values,(rows,cols)),
    shape=(n_users,n_items)
).tocsr()

# Smoothed item popularity distribution (count**alpha, normalised to sum 1).
item_counts=np.asarray(user_item.sum(axis=0)).ravel()
alpha=0.75
pop_weights=(item_counts**alpha)
pop_weights=pop_weights/pop_weights.sum()

# iterations=1 so the loop in the next cell can call fit() once per epoch.
model=BayesianPersonalizedRanking(
    factors=64,
    learning_rate=0.01,
    regularization=1e-4,
    iterations=1,
    random_state=CFG["seed"],
    verify_negative_samples=True
)

In [None]:
# Seeded generator so the validation subsample is reproducible.
rng=np.random.RandomState(CFG["seed"])

# Draw at most 10000 distinct validation users without replacement.
val_users_all=val.userId.unique()
n_val_sample=min(10000,len(val_users_all))
val_users_small=set(rng.choice(val_users_all,size=n_val_sample,replace=False))

def predict_topk_implicit(model,user_id,k=10):
    """Return the model's top-k movieIds for a raw user id.

    Users missing from ``user2idx`` get an empty list. Items the user already
    interacted with are not filtered out of the result.
    """
    if user_id in user2idx:
        recommended,_=model.recommend(
            user2idx[user_id],
            user_item,
            N=k,
            filter_already_liked_items=False
        )
        # Translate internal item indices back to movieIds.
        return [idx2item[item_idx] for item_idx in recommended]
    return []

import copy

best_metric=-np.inf
epochs_no_improve=0
best_state=None

# Validation labels do not change between epochs, so build them once instead
# of re-grouping `val` on every iteration.
# NOTE(review): assumes evaluate_model does not mutate its inputs — confirm.
val_ground_truth={
    int(uid):{int(movie):float(rating) for movie,rating in zip(g["movieId"],g["rating"])}
    for uid,g in val.groupby("userId")
    if uid in val_users_small
}
# Only users known to the model receive predictions (same rule as before).
val_eval_users=[uid for uid in val_ground_truth if uid in user2idx]

for epoch in range(max_epochs):
    # The model was built with iterations=1, so each fit() call is one epoch.
    model.fit(
        user_item,
        show_progress=False,
    )

    val_predictions={
        uid:predict_topk_implicit(model,uid,k=10)
        for uid in val_eval_users
    }

    val_metrics,_=evaluate_model(
        val_predictions,
        val_ground_truth,
        item_support_table,
        ks=[10]
    )

    current_metric=val_metrics["ndcg@10"]
    # Default to NaN so the formatted print below cannot fail on a missing CI.
    ci_low=val_metrics.get("ndcg@10_ci_low",float("nan"))
    ci_high=val_metrics.get("ndcg@10_ci_high",float("nan"))

    print(
        f"epoch={epoch} "
        f"val_ndcg@10={current_metric:.6f} "
        f"ci=({ci_low:.6f},{ci_high:.6f})"
    )

    if current_metric>best_metric+min_delta:
        best_metric=current_metric
        # Snapshot the best model. The save cell guards to_cpu() with hasattr,
        # i.e. the method may be absent; fall back to a deep copy in that case,
        # and also when to_cpu() hands back the very same object (which would
        # alias the model that keeps training).
        snapshot=model.to_cpu() if hasattr(model,"to_cpu") else None
        if snapshot is None or snapshot is model:
            snapshot=copy.deepcopy(model)
        best_state=snapshot
        epochs_no_improve=0
    else:
        epochs_no_improve+=1


    if epochs_no_improve>=patience:
        print("early stopping triggered")
        break

# Continue with the best validation snapshot rather than the last epoch.
if best_state is not None:
    model=best_state

epoch=0 val_ndcg@10=0.033343 ci=(0.030052,0.036725)
epoch=1 val_ndcg@10=0.034816 ci=(0.031633,0.038245)
epoch=2 val_ndcg@10=0.034571 ci=(0.031211,0.038113)
epoch=3 val_ndcg@10=0.035184 ci=(0.032198,0.038312)
epoch=4 val_ndcg@10=0.036771 ci=(0.033562,0.040096)
epoch=5 val_ndcg@10=0.037676 ci=(0.034539,0.041033)
epoch=6 val_ndcg@10=0.037212 ci=(0.033996,0.040393)
epoch=7 val_ndcg@10=0.038035 ci=(0.034929,0.041329)
epoch=8 val_ndcg@10=0.038089 ci=(0.034845,0.041213)
epoch=9 val_ndcg@10=0.037850 ci=(0.034628,0.040965)
epoch=10 val_ndcg@10=0.037088 ci=(0.033998,0.040161)
early stopping triggered


## Eval

In [None]:
# Output folder for the implicit BPR baseline (created on first use).
implicit_dir=os.path.join(CFG["artifact_root"],"implicitBPR")
os.makedirs(implicit_dir,exist_ok=True)

# Distinct users that appear in the held-out test split.
test_users=test["userId"].unique().tolist()

def predict_topk_movieids_implicit(model,user_id,k=20):
    """Top-k movieIds from the implicit model for one raw user id.

    Returns an empty list for users absent from ``user2idx``. Already-liked
    items are not filtered from the recommendations.
    """
    if user_id in user2idx:
        recommended,_=model.recommend(
            user2idx[user_id],
            user_item,
            N=k,
            filter_already_liked_items=False
        )
        # Map internal item indices back to movieIds.
        return [idx2item[item_idx] for item_idx in recommended]
    return []

# Relevance labels: userId -> {movieId: rating} for every held-out test row.
ground_truth={
    int(uid):{int(movie):float(rating) for movie,rating in zip(g["movieId"],g["rating"])}
    for uid,g in test.groupby("userId")
}

# Users without a training index get an empty recommendation list.
top_k=max(CFG["k_values"])
predictions={
    u:(predict_topk_movieids_implicit(model,u,k=top_k) if u in user2idx else [])
    for u in ground_truth
}

metrics,slices=evaluate_model(predictions,ground_truth,item_support_table,ks=CFG["k_values"])

print("METRICS")
print(json.dumps(metrics,indent=2))

METRICS
{
  "ndcg@5": 0.021189998159728433,
  "ndcg@5_ci_low": 0.02063573573966044,
  "ndcg@5_ci_high": 0.02179820769806488,
  "recall5@5": 0.030079528046673836,
  "map5@5": 0.014090254011711918,
  "mean_popularity_num_votes@5": 1467336.1648244844,
  "prop_top10pct_popularity@5": 0.9978236263627157,
  "ndcg@10": 0.031405298833961245,
  "ndcg@10_ci_low": 0.03079619287881412,
  "ndcg@10_ci_high": 0.032063606823307694,
  "recall5@10": 0.056125126299664284,
  "map5@10": 0.017870355043682996,
  "mean_popularity_num_votes@10": 1270632.6061190967,
  "prop_top10pct_popularity@10": 0.9966043805476678,
  "ndcg@20": 0.04422105581099476,
  "ndcg@20_ci_low": 0.04358989499883233,
  "ndcg@20_ci_high": 0.04494983998587486,
  "recall5@20": 0.09678351422704605,
  "map5@20": 0.021046977535515706,
  "mean_popularity_num_votes@20": 1070915.1977525179,
  "prop_top10pct_popularity@20": 0.9944602752238925,
  "users_total": 200948,
  "users_evaluated": 102270,
  "users_skipped": 98678,
  "items_covered": 1756,

In [None]:
# Show each slice table under its heading.
for heading,slice_key in (
    ("\nSLICES BY SUPPORT","by_support"),
    ("\nSLICES BY IMDb VOTES","by_imdb_votes"),
    ("\nSLICES BY ERA","by_era"),
):
    print(heading)
    display(slices[slice_key])

# Persist aggregate metrics and the stacked slice tables next to the model.
metrics_path=os.path.join(implicit_dir,"metrics.json")
slices_path=os.path.join(implicit_dir,"slices_long.csv")

with open(metrics_path,"w") as f:
    json.dump(metrics,f,indent=2)

# Tag every table with the name of the slice it came from before stacking.
slices_long=pd.concat(
    [table.assign(slice_type=slice_name) for slice_name,table in slices.items()],
    ignore_index=True
)
slices_long.to_csv(slices_path,index=False)


SLICES BY SUPPORT


Unnamed: 0,bin,k,recall,n_users_with_gt
0,1-4,5,0.0,3284
1,1-4,10,0.0,3284
2,1-4,20,0.0,3284
3,0,5,0.0,1432
4,0,10,0.0,1432
5,0,20,0.0,1432
6,5-19,5,0.0,7233
7,5-19,10,0.0,7233
8,5-19,20,0.0,7233
9,20+,5,0.013379,200734



SLICES BY IMDb VOTES


Unnamed: 0,bin,k,recall,n_users_with_gt
0,1000+,5,0.01333,200923
1,1000+,10,0.026558,200923
2,1000+,20,0.050306,200923
3,100-999,5,0.0,3483
4,100-999,10,0.0,3483
5,100-999,20,0.0,3483
6,10-99,5,0.0,408
7,10-99,10,0.0,408
8,10-99,20,0.0,408
9,1-9,5,,1



SLICES BY ERA


Unnamed: 0,bin,k,recall,n_users_with_gt
0,1900-1969,5,0.006579,36193
1,1900-1969,10,0.014518,36193
2,1900-1969,20,0.031405,36193
3,2020-2029,5,0.0,9059
4,2020-2029,10,0.0,9059
5,2020-2029,20,0.000221,9059
6,1990-1999,5,0.023959,137335
7,1990-1999,10,0.043656,137335
8,1990-1999,20,0.075536,137335
9,2010-2019,5,0.007024,63228


## Save

In [None]:
import wandb

wandb.login()

implicit_dir=os.path.join(CFG["artifact_root"],"implicitBPR")
os.makedirs(implicit_dir,exist_ok=True)

# Convert to a CPU model when the object offers that conversion, so the
# factor matrices below can be written with np.save.
if hasattr(model,"to_cpu"):
    model=model.to_cpu()

user_emb=model.user_factors
item_emb=model.item_factors

# Index tables: raw id <-> embedding row.
user_index=pd.DataFrame({
    "userId":list(user2idx.keys()),
    "user_idx":list(user2idx.values())
})

item_index=pd.DataFrame({
    "movieId":list(item2idx.keys()),
    "item_idx":list(item2idx.values())
})

np.save(os.path.join(implicit_dir,"user_embeddings.npy"),user_emb)
np.save(os.path.join(implicit_dir,"item_embeddings.npy"),item_emb)

user_index.to_csv(os.path.join(implicit_dir,"user_index.csv"),index=False)
item_index.to_csv(os.path.join(implicit_dir,"item_index.csv"),index=False)

model_path=os.path.join(implicit_dir,"implicit_bpr_model.npz")
model.save(model_path)

# Record the regularization the model actually carries: it was constructed
# with an explicit value, so the old "implicit_default" label was inaccurate.
# The string is kept only as a fallback when the attribute is unavailable.
regularization_value=getattr(model,"regularization",None)

implicit_metadata={
    "model_type":"implicit_BPR",
    "embedding_dim":int(user_emb.shape[1]),
    "n_users":int(user_emb.shape[0]),
    "n_items":int(item_emb.shape[0]),
    "rating_threshold":CFG["rating_threshold"],
    "loss":"BPR",
    "negative_sampling":"implicit_internal",
    "regularization":float(regularization_value) if regularization_value is not None else "implicit_default",
    "early_stopping":{"patience":3,"min_delta":1e-4},
    "seed":CFG["seed"]
}

with open(os.path.join(implicit_dir,"implicit_metadata.json"),"w") as f:
    json.dump(implicit_metadata,f,indent=2)

run=wandb.init(
    project="JustiFlicks",
    job_type="model",
    name="03_implicit_BPR_baseline",
    reinit=True
)

# Stage a second copy inside the run directory so the folder can be attached
# to the artifact with one add_dir call.
artifact_dir=os.path.join(run.dir,"implicit_bpr")
os.makedirs(artifact_dir,exist_ok=True)

np.save(os.path.join(artifact_dir,"user_embeddings.npy"),user_emb)
np.save(os.path.join(artifact_dir,"item_embeddings.npy"),item_emb)

user_index.to_csv(os.path.join(artifact_dir,"user_index.csv"),index=False)
item_index.to_csv(os.path.join(artifact_dir,"item_index.csv"),index=False)

model.save(os.path.join(artifact_dir,"implicit_bpr_model.npz"))

with open(os.path.join(artifact_dir,"implicit_metadata.json"),"w") as f:
    json.dump(implicit_metadata,f,indent=2)

artifact=wandb.Artifact(
    name="implicit_BPR_baseline",
    type="model",
    metadata=implicit_metadata
)

artifact.add_dir(artifact_dir)
run.log_artifact(artifact)
run.finish()



wandb: Adding directory to artifact (/content/wandb/run-20260102_133145-jbk1a0ei/files/implicit_bpr)... Done. 0.5s


In [None]:
# Reload the implicit BPR metrics written by the evaluation cell and publish them.
implicit_dir=os.path.join(CFG["artifact_root"],"implicitBPR")
metrics_path=os.path.join(implicit_dir,"metrics.json")
slices_path=os.path.join(implicit_dir,"slices_long.csv")

with open(metrics_path) as f:
    metrics=json.load(f)

run=wandb.init(project="JustiFlicks",job_type="eval",name="03_implicit_BPR_evaluation",reinit=True)
run.log(metrics)

# More than 50000 users is recorded as a full (non-subsampled) evaluation.
if metrics["users_total"]>50000:
    evaluation_scope="full"
else:
    evaluation_scope="subsample"

eval_artifact=wandb.Artifact(
    name="implicit_BPR_evaluation",
    type="evaluation",
    metadata={
        "model":"implicit_BPR_baseline",
        "catalog_restricted":True,
        "evaluation_users":evaluation_scope,
        "primary_metric":"ndcg@10"
    }
)

for artifact_file in (metrics_path,slices_path):
    eval_artifact.add_file(artifact_file)

run.log_artifact(eval_artifact)
run.finish()

0,1
catalog_coverage_fraction,▁
evaluation_catalog_items,▁
items_covered,▁
map5@10,▁
map5@20,▁
map5@5,▁
mean_popularity_num_votes@10,▁
mean_popularity_num_votes@20,▁
mean_popularity_num_votes@5,▁
ndcg@10,▁

0,1
catalog_coverage_fraction,0.04199
evaluation_catalog_items,41823
items_covered,1756
map5@10,0.01787
map5@20,0.02105
map5@5,0.01409
mean_popularity_num_votes@10,1270632.60612
mean_popularity_num_votes@20,1070915.19775
mean_popularity_num_votes@5,1467336.16482
ndcg@10,0.03141


# Content Only (Two Tower)

## Model Training

In [None]:
import pandas as pd


# Raw tables used to assemble per-item content features.
movieRatings=pd.read_parquet(os.path.join(CFG["data_root"],"movieRatings.parquet"))
movieData=pd.read_parquet(os.path.join(CFG["data_root"],"movieData.parquet"))
movieLinks=pd.read_parquet(os.path.join(CFG["data_root"],"movieLinks.parquet"))

# Right join: one row per movieData entry, with movieId attached where a link exists.
content_cols=movieData[["imdbId","original_language","genres"]]
df=movieLinks.merge(content_cols,on="imdbId",how="right")

df["original_language"]=df["original_language"].fillna("unknown").str.lower()

# Multi-hot genres: explode to one row per (movie, genre), one-hot encode,
# then sum back to one row per original index label.
# NOTE(review): presumes `genres` holds a list-like of strings per movie — confirm.
genre_long=df["genres"].explode().str.lower()
genre_dummies=pd.get_dummies(genre_long).groupby(level=0).sum()

lang_dummies=pd.get_dummies(df["original_language"],prefix="lang")

# Column-wise stack: ids, language one-hots, genre multi-hots.
feature_parts=[df[["movieId","imdbId"]],lang_dummies,genre_dummies]
item_features=pd.concat(
    [part.reset_index(drop=True) for part in feature_parts],
    axis=1
)

item_features_path=os.path.join(CFG["embedding_root"],"item_features.parquet")
item_features.to_parquet(item_features_path,index=False)

In [None]:
# Seed numpy and torch for this section and pick the compute device.
SEED=CFG.get("seed",42)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    device="cuda"
else:
    device="cpu"

# Reload split artifacts and the item feature table from disk.
item_support_table=pd.read_parquet(os.path.join(CFG["artifact_root"],"splits/item_support_table.parquet"))
item_features=pd.read_parquet(os.path.join(CFG["embedding_root"],"item_features.parquet"))
train=pd.read_parquet(os.path.join(CFG["artifact_root"],"splits/train.parquet"))
val=pd.read_parquet(os.path.join(CFG["artifact_root"],"splits/val.parquet"))

rating_threshold=CFG["rating_threshold"]

# Positive interactions: ratings at or above the threshold.
train_cf=train.loc[train["rating"]>=rating_threshold,["userId","movieId"]].copy()
user_ids=train_cf.userId.unique()

links=pd.read_parquet(os.path.join(CFG["data_root"],"movieLinks.parquet"))
cf_movie_set=set(links.movieId.astype(int))

# Catalog: positively rated training movies that also appear in the links
# table, indexed in order of first appearance.
item_ids=[int(m) for m in train_cf.movieId.unique() if int(m) in cf_movie_set]

item2idx={movie:pos for pos,movie in enumerate(item_ids)}
idx2item=dict(enumerate(item_ids))

# Content rows for catalog items, with integer movieId for the join below.
items_df=item_features[item_features.movieId.isin(item_ids)].copy()
items_df["movieId"]=items_df.movieId.astype(int)

# Attach support / popularity / release-year columns.
support_cols=item_support_table[["movieId","support","num_votes_imdb","release_year"]]
items_df=items_df.merge(support_cols,on="movieId",how="left")

# Numeric side features: fill missing with 0, log-compress the two count
# columns, then min-max scale each column to [0, 1] when it is not constant.
# NOTE(review): release_year is also filled with 0 before scaling, so any
# missing year drags the minimum to 0 and squeezes real years into a narrow
# band near 1 — confirm this is intended.
numeric_cols=["support","num_votes_imdb","release_year"]
for c in numeric_cols:
    items_df[c]=items_df[c].fillna(0).astype(np.float32)
    if c in ("support","num_votes_imdb"):
        items_df[c]=np.log1p(items_df[c])
    denom=items_df[c].max()-items_df[c].min()
    if denom>0:
        items_df[c]=(items_df[c]-items_df[c].min())/denom

# Every remaining column is treated as a one-hot / multi-hot indicator.
exclude={"movieId","imdbId","support","num_votes_imdb","release_year"}
onehot_cols=[c for c in items_df.columns if c not in exclude]

feat_dim=len(onehot_cols)+len(numeric_cols)
X_item=np.zeros((len(item2idx),feat_dim),dtype=np.float32)

# Vectorised fill (replaces a Python-level iterrows loop). The loop let the
# last row win when a movieId appeared more than once; NumPy does not
# guarantee the outcome of fancy-index assignment with repeated indices, so
# the duplicates are dropped explicitly with keep="last" first.
items_unique=items_df.drop_duplicates(subset="movieId",keep="last")
row_idx=items_unique["movieId"].map(item2idx).to_numpy()
X_item[row_idx,:len(onehot_cols)]=items_unique[onehot_cols].to_numpy(dtype=np.float32)
X_item[row_idx,len(onehot_cols):]=items_unique[numeric_cols].to_numpy(dtype=np.float32)

# Matrix shape and its size in MiB.
print(X_item.shape)
print(X_item.nbytes/1024/1024)

(41823, 87)
13.88016128540039


In [None]:

rating_threshold=CFG.get("rating_threshold",4.5)
train_pos=train[train.rating>=rating_threshold][["userId","movieId","datetime"]].copy()

# Restrict positives to catalog items and attach their matrix row index.
train_pos=train_pos[train_pos.movieId.isin(item2idx.keys())].copy()
train_pos["item_idx"]=train_pos.movieId.map(item2idx)

# User indices follow order of first appearance in train_pos.
user_ids=train_pos.userId.unique().tolist()
user2idx={u:i for i,u in enumerate(user_ids)}
idx2user={i:u for u,i in user2idx.items()}

train_pos["u"]=train_pos.userId.map(user2idx)

# A user's profile is the mean feature vector of their positive items.
user_profiles=np.zeros((len(user2idx),X_item.shape[1]),dtype=np.float32)
user_counts=np.zeros(len(user2idx),dtype=np.int32)
for u,g in train_pos.groupby("u"):
    idxs=g.item_idx.values
    if len(idxs)>0:
        user_profiles[u]=X_item[idxs].mean(axis=0)
        user_counts[u]=len(idxs)

# Fallback for users without positives: the mean profile of users that have
# some. (A previous unconditional `global_mean=...sum(axis=0)` assignment was
# removed: it was always overwritten by one of the two branches below.)
nonzero_mask=user_counts>0
if nonzero_mask.any():
    global_mean=user_profiles[nonzero_mask].mean(axis=0)
else:
    global_mean=np.zeros(X_item.shape[1],dtype=np.float32)
user_profiles[~nonzero_mask]=global_mean

print("users:",user_profiles.shape)

users: (195767, 87)


In [None]:
# user index -> list of positive item indices, in original row order.
# Built with one groupby instead of a Python-level iterrows pass over every
# positive interaction.
user_pos_list=defaultdict(list)
items_by_user=train_pos.groupby("u",sort=True)["item_idx"].agg(list)
for u,items in zip(items_by_user.index.tolist(),items_by_user.tolist()):
    user_pos_list[int(u)]=[int(it) for it in items]

# Flat (user, item) array grouped by user. User indices are assigned in order
# of first appearance in train_pos, so a stable sort on "u" reproduces the
# order the per-row loop produced (users by first appearance, rows in
# original order within a user).
pairs=train_pos.sort_values("u",kind="stable")[["u","item_idx"]].to_numpy(dtype=int)

class PosPairDataset(Dataset):
    """Map-style dataset over an array of (user index, item index) pairs.

    Subclasses the directly imported ``Dataset`` rather than going through the
    ``data`` module alias: an earlier cell rebinds the name ``data`` to a NumPy
    array, which makes ``data.Dataset`` fail on a top-to-bottom run.
    """
    def __init__(self,pairs):
        # Indexed as pairs[row, 0] (user) and pairs[row, 1] (item).
        self.pairs=pairs
    def __len__(self):
        return len(self.pairs)
    def __getitem__(self,idx):
        # Plain Python ints so the default collate builds integer tensors.
        return int(self.pairs[idx,0]),int(self.pairs[idx,1])

# Shuffled mini-batches of positive pairs; the incomplete last batch is dropped.
batch_size=4096
dataset=PosPairDataset(pairs)
# Use the directly imported DataLoader: the `data` alias is rebound to a NumPy
# array by an earlier cell, so `data.DataLoader` fails on a top-to-bottom run.
loader=DataLoader(dataset,batch_size=batch_size,shuffle=True,drop_last=True,num_workers=2,pin_memory=True)

In [None]:
class TwoTower(nn.Module):
    """Two independent MLP towers that embed user and item feature vectors.

    Both towers take ``input_dim`` features, pass them through hidden layers
    of the given widths (Linear -> ReLU -> Dropout each), project to
    ``emb_dim`` and return L2-normalised rows.
    """
    def __init__(self,input_dim,hidden_dims=[256,128],emb_dim=64,drop=0.2):
        super().__init__()

        def make_mlp(in_dim, layers):
            # Consecutive width pairs describe each hidden Linear layer.
            widths=[in_dim,*layers]
            modules=[]
            for fan_in,fan_out in zip(widths[:-1],widths[1:]):
                modules+=[nn.Linear(fan_in,fan_out),nn.ReLU(),nn.Dropout(drop)]
            # Final projection to the embedding size, without activation.
            modules.append(nn.Linear(widths[-1],emb_dim))
            return nn.Sequential(*modules)

        # Item tower is built first, then the user tower.
        self.item_net=make_mlp(input_dim,hidden_dims)
        self.user_net=make_mlp(input_dim,hidden_dims)

    def forward_user(self,user_feat):
        """Embed user features and L2-normalise along dim 1."""
        return F.normalize(self.user_net(user_feat),p=2,dim=1)

    def forward_item(self,item_feat):
        """Embed item features and L2-normalise along dim 1."""
        return F.normalize(self.item_net(item_feat),p=2,dim=1)

# Both towers consume vectors with the width of the item feature matrix.
input_dim=X_item.shape[1]

model=TwoTower(input_dim=input_dim,hidden_dims=[256,128],emb_dim=64,drop=0.2)
model=model.to(device)

# Adam with weight decay over the parameters of both towers.
opt=torch.optim.Adam(model.parameters(),lr=1e-3,weight_decay=1e-5)


In [None]:
def info_nce_loss(u_emb, pos_i_emb, temp=0.07):
    """In-batch InfoNCE: row ``r`` of ``u_emb`` must pick row ``r`` of ``pos_i_emb``.

    All pairwise dot products are divided by ``temp`` and used as logits for a
    cross-entropy whose target for each row is its own position, so the other
    rows of the batch act as negatives.
    """
    similarity = u_emb @ pos_i_emb.T
    targets = torch.arange(similarity.size(0),device=similarity.device)
    return F.cross_entropy(similarity / temp, targets)

# Item feature matrix as a tensor on the training device.
item_feat_tensor = torch.tensor(X_item,device=device)

# Reproducible subsample of at most 10000 validation users.
rng=np.random.RandomState(SEED)
val_users_all=val.userId.unique()
n_val_sample=min(10000,len(val_users_all))
val_users_small=set(rng.choice(val_users_all,size=n_val_sample,replace=False))

# Early-stopping configuration.
max_epochs,patience,min_delta=100,30,1e-4

# Early-stopping state.
best_metric=-np.inf
epochs_no_improve=0
best_state=None

# Validation inputs do not change between epochs; build them once.
# NOTE(review): assumes evaluate_model does not mutate its inputs — confirm.
val_user_list=[u for u in val_users_small if u in user2idx]
val_user_idx=[user2idx[u] for u in val_user_list]
val_user_feats=torch.tensor(user_profiles[val_user_idx],dtype=torch.float32,device=device)

val_ground_truth={
    int(uid):{int(movie):float(rating) for movie,rating in zip(g["movieId"],g["rating"])}
    for uid,g in val.groupby("userId")
    if uid in val_users_small
}

# The model now stays on `device` for the whole loop (snapshots are cloned,
# not produced by moving the live model to CPU).
model=model.to(device)

for epoch in range(max_epochs):
    model.train()
    losses=[]

    for u_batch,i_batch in loader:
        # Index the NumPy profile matrix with the CPU batch directly instead
        # of moving the indices to the device and back.
        batch_user_feats=torch.from_numpy(user_profiles[u_batch.numpy()]).to(device)
        batch_item_feats=item_feat_tensor[i_batch.to(device)]

        user_embs=model.forward_user(batch_user_feats)
        pos_item_embs=model.forward_item(batch_item_feats)

        loss=info_nce_loss(user_embs,pos_item_embs,temp=0.07)

        opt.zero_grad()
        loss.backward()
        opt.step()
        losses.append(loss.item())

    model.eval()
    with torch.no_grad():
        item_embs = model.forward_item(item_feat_tensor).cpu().numpy()
        val_user_embs = model.forward_user(val_user_feats).cpu().numpy()

        # Score every catalog item per sampled user and keep the top ranks.
        val_predictions={}
        for idx,uid in enumerate(val_user_list):
            scores = val_user_embs[idx] @ item_embs.T
            topk = np.argsort(-scores)[:max(CFG["k_values"])]
            val_predictions[int(uid)]=[idx2item[int(t)] for t in topk]

        val_metrics,_=evaluate_model(val_predictions,val_ground_truth,item_support_table,ks=[10])
        current_metric=val_metrics["ndcg@10"]
        # Default to NaN so the formatted print cannot fail on a missing CI.
        ci_low=val_metrics.get("ndcg@10_ci_low",float("nan"))
        ci_high=val_metrics.get("ndcg@10_ci_high",float("nan"))
    print(f"epoch={epoch} loss={np.mean(losses):.4f} val_ndcg@10={current_metric:.6f} ci=({ci_low:.6f},{ci_high:.6f})")
    if current_metric>best_metric+min_delta:
        best_metric=current_metric
        # Clone every tensor: state_dict() returns references to the live
        # parameters, so without a copy the "best" snapshot would keep
        # changing with later optimizer steps when training on CPU.
        best_state={
            "model":{name:tensor.detach().cpu().clone() for name,tensor in model.state_dict().items()}
        }
        epochs_no_improve=0
    else:
        epochs_no_improve+=1
    if epochs_no_improve>=patience:
        print("early stopping triggered"); break

if best_state is not None:
    model.load_state_dict(best_state["model"])
    # `item_embs` currently holds the LAST epoch's embeddings; refresh it from
    # the restored weights so downstream cells pair the best user tower with
    # the matching item tower.
    model.eval()
    with torch.no_grad():
        item_embs = model.forward_item(item_feat_tensor).cpu().numpy()


epoch=0 loss=7.4007 val_ndcg@10=0.020008 ci=(0.017591,0.022362)
epoch=1 loss=7.3896 val_ndcg@10=0.020923 ci=(0.018559,0.023308)
epoch=2 loss=7.3822 val_ndcg@10=0.020238 ci=(0.017957,0.022451)
epoch=3 loss=7.3754 val_ndcg@10=0.020643 ci=(0.018338,0.022781)
epoch=4 loss=7.3695 val_ndcg@10=0.021066 ci=(0.018705,0.023362)
epoch=5 loss=7.3640 val_ndcg@10=0.021680 ci=(0.019258,0.024014)
epoch=6 loss=7.3595 val_ndcg@10=0.020806 ci=(0.018618,0.023130)
epoch=7 loss=7.3547 val_ndcg@10=0.021217 ci=(0.018756,0.023615)
epoch=8 loss=7.3505 val_ndcg@10=0.019414 ci=(0.017228,0.021655)
epoch=9 loss=7.3471 val_ndcg@10=0.020519 ci=(0.018079,0.022977)
epoch=10 loss=7.3448 val_ndcg@10=0.020217 ci=(0.017770,0.022520)
epoch=11 loss=7.3409 val_ndcg@10=0.019675 ci=(0.017409,0.021902)
epoch=12 loss=7.3371 val_ndcg@10=0.020202 ci=(0.017949,0.022504)
epoch=13 loss=7.3344 val_ndcg@10=0.021594 ci=(0.019138,0.023888)
epoch=14 loss=7.3322 val_ndcg@10=0.021853 ci=(0.019306,0.024156)
epoch=15 loss=7.3303 val_ndcg@10=0.

## Eval

In [None]:
# Place the model on the evaluation device and switch it to eval mode.
model=model.to(device)
model.eval()

def predict_topk_two_tower(user_id,k):
    """Rank all catalog items for one user with the two-tower model.

    Returns the movieIds of the ``k`` highest-scoring items, or an empty list
    when the user has no entry in ``user2idx``. Reads the module-level
    ``item_embs_tensor`` at call time.
    """
    if user_id in user2idx:
        row=user2idx[user_id]
        with torch.no_grad():
            # Single-row slice keeps the batch dimension the tower expects.
            profile=torch.from_numpy(user_profiles[row:row+1]).to(device)
            user_vec=model.forward_user(profile)
            scores=torch.matmul(user_vec,item_embs_tensor.T).squeeze(0).cpu().numpy()
        ranked=np.argsort(-scores)
        return [idx2item[int(i)] for i in ranked[:k]]
    return []

# Recompute item embeddings from the model as it is NOW. The `item_embs`
# array left behind by the training loop was produced by the last epoch's
# weights, whereas the user tower has been reset to the best checkpoint;
# scoring with mismatched towers would distort the test metrics.
model.eval()
with torch.no_grad():
    item_embs_tensor=model.forward_item(torch.tensor(X_item,device=device))
item_embs=item_embs_tensor.cpu().numpy()

# Relevance labels: userId -> {movieId: rating} for every held-out test row.
ground_truth={
    int(uid):{int(movie):float(rating) for movie,rating in zip(g["movieId"],g["rating"])}
    for uid,g in test.groupby("userId")
}

# Users without a training profile get an empty recommendation list.
predictions={}
for u in ground_truth.keys():
    predictions[u]=predict_topk_two_tower(u,k=max(CFG["k_values"])) if u in user2idx else []

metrics,slices=evaluate_model(
    predictions,
    ground_truth,
    item_support_table,
    ks=CFG["k_values"]
)

print("METRICS")
print(json.dumps(metrics,indent=2))

METRICS
{
  "ndcg@5": 0.009200379300145648,
  "ndcg@5_ci_low": 0.00883252523996201,
  "ndcg@5_ci_high": 0.00958378275127427,
  "recall5@5": 0.013321925621720283,
  "map5@5": 0.005614098845105006,
  "mean_popularity_num_votes@5": 369426.15194289625,
  "prop_top10pct_popularity@5": 0.8537063881332037,
  "ndcg@10": 0.015038177742063698,
  "ndcg@10_ci_low": 0.014594762256961977,
  "ndcg@10_ci_high": 0.015483492662809456,
  "recall5@10": 0.028100290081809583,
  "map5@10": 0.007647302981577451,
  "mean_popularity_num_votes@10": 360082.43380952376,
  "prop_top10pct_popularity@10": 0.840054409340932,
  "ndcg@20": 0.023354272351723583,
  "ndcg@20_ci_low": 0.022854834463304524,
  "ndcg@20_ci_high": 0.02383615479546472,
  "recall5@20": 0.055095009941005836,
  "map5@20": 0.009646035473892403,
  "mean_popularity_num_votes@20": 343316.83084775595,
  "prop_top10pct_popularity@20": 0.8234948072837032,
  "users_total": 200948,
  "users_evaluated": 102270,
  "users_skipped": 98678,
  "items_covered": 26

In [None]:
# Output folder for the two-tower baseline (created on first use).
two_dir=os.path.join(CFG["artifact_root"],"twoTower")
os.makedirs(two_dir,exist_ok=True)

with open(os.path.join(two_dir,"metrics.json"),"w") as f:
    json.dump(metrics,f,indent=2)

# Tag every table with the name of the slice it came from before stacking.
slices_long=pd.concat(
    [table.assign(slice_type=slice_name) for slice_name,table in slices.items()],
    ignore_index=True
)
slices_long.to_csv(os.path.join(two_dir,"slices_long.csv"),index=False)

# Show each slice table under its heading.
for heading,slice_key in (
    ("\nSLICES BY SUPPORT","by_support"),
    ("\nSLICES BY IMDb VOTES","by_imdb_votes"),
    ("\nSLICES BY ERA","by_era"),
):
    print(heading)
    display(slices[slice_key])


SLICES BY SUPPORT


Unnamed: 0,bin,k,recall,n_users_with_gt
0,1-4,5,0.000152,3284
1,1-4,10,0.000609,3284
2,1-4,20,0.003299,3284
3,0,5,0.0,1432
4,0,10,0.0,1432
5,0,20,0.0,1432
6,5-19,5,0.00136,7233
7,5-19,10,0.002788,7233
8,5-19,20,0.005715,7233
9,20+,5,0.007414,200734



SLICES BY IMDb VOTES


Unnamed: 0,bin,k,recall,n_users_with_gt
0,1000+,5,0.007369,200923
1,1000+,10,0.01587,200923
2,1000+,20,0.032585,200923
3,100-999,5,0.001723,3483
4,100-999,10,0.004881,3483
5,100-999,20,0.00957,3483
6,10-99,5,0.004902,408
7,10-99,10,0.017157,408
8,10-99,20,0.025735,408
9,1-9,5,,1



SLICES BY ERA


Unnamed: 0,bin,k,recall,n_users_with_gt
0,1900-1969,5,0.00594,36193
1,1900-1969,10,0.01165,36193
2,1900-1969,20,0.02303,36193
3,2020-2029,5,0.002524,9059
4,2020-2029,10,0.005317,9059
5,2020-2029,20,0.011889,9059
6,1990-1999,5,0.006388,137335
7,1990-1999,10,0.014433,137335
8,1990-1999,20,0.031609,137335
9,2010-2019,5,0.010095,63228


## Save

In [None]:
import wandb

wandb.login()

two_dir=os.path.join(CFG["artifact_root"],"twoTower")
os.makedirs(two_dir,exist_ok=True)

# Module.to() moves the module in place and returns it, so `model_cpu` is the
# same object as `model`, now living on the CPU.
if next(model.parameters()).device.type!="cpu":
    model_cpu=model.to("cpu")
else:
    model_cpu=model

# Export embeddings deterministically: force eval mode so dropout is inactive
# regardless of the mode earlier cells left the model in, and skip autograd
# bookkeeping for these full-matrix forward passes.
model_cpu.eval()
with torch.no_grad():
    user_embs=model_cpu.forward_user(torch.tensor(user_profiles,dtype=torch.float32)).cpu().numpy()
    item_embs=model_cpu.forward_item(torch.tensor(X_item,dtype=torch.float32)).cpu().numpy()

# Index tables: raw id <-> embedding row.
user_index=pd.DataFrame({
    "userId":list(user2idx.keys()),
    "user_idx":list(user2idx.values())
})

item_index=pd.DataFrame({
    "movieId":list(item2idx.keys()),
    "item_idx":list(item2idx.values())
})

np.save(os.path.join(two_dir,"user_embeddings.npy"),user_embs)
np.save(os.path.join(two_dir,"item_embeddings.npy"),item_embs)

user_index.to_csv(os.path.join(two_dir,"user_index.csv"),index=False)
item_index.to_csv(os.path.join(two_dir,"item_index.csv"),index=False)

model_path=os.path.join(two_dir,"two_tower_model.pt")
torch.save(model_cpu.state_dict(),model_path)

# Run metadata; early-stopping values come from the training cell's globals.
two_tower_metadata={
    "model_type":"two_tower_content",
    "embedding_dim":int(user_embs.shape[1]),
    "n_users":int(user_embs.shape[0]),
    "n_items":int(item_embs.shape[0]),
    "rating_threshold":CFG["rating_threshold"],
    "loss":"InfoNCE",
    "negative_sampling":"in_batch",
    "regularization":"adam_weight_decay",
    "early_stopping":{"patience":patience,"min_delta":min_delta},
    "seed":CFG["seed"]
}

with open(os.path.join(two_dir,"two_tower_metadata.json"),"w") as f:
    json.dump(two_tower_metadata,f,indent=2)

run=wandb.init(
    project="JustiFlicks",
    job_type="model",
    name="04_two_tower_baseline",
    reinit=True
)

# Stage a second copy inside the run directory so the folder can be attached
# to the artifact with one add_dir call.
artifact_dir=os.path.join(run.dir,"two_tower")
os.makedirs(artifact_dir,exist_ok=True)

np.save(os.path.join(artifact_dir,"user_embeddings.npy"),user_embs)
np.save(os.path.join(artifact_dir,"item_embeddings.npy"),item_embs)

user_index.to_csv(os.path.join(artifact_dir,"user_index.csv"),index=False)
item_index.to_csv(os.path.join(artifact_dir,"item_index.csv"),index=False)

torch.save(model_cpu.state_dict(), os.path.join(artifact_dir,"two_tower_model.pt"))

with open(os.path.join(artifact_dir,"two_tower_metadata.json"),"w") as f:
    json.dump(two_tower_metadata,f,indent=2)

artifact=wandb.Artifact(
    name="two_tower_baseline",
    type="model",
    metadata=two_tower_metadata
)

artifact.add_dir(artifact_dir)
run.log_artifact(artifact)
run.finish()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33marhaan[0m ([33marhaan-dhirubhai-ambani-institute-of-information-and-com[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20260103_213621-nvlij2xg/files/two_tower)... Done. 0.2s


In [None]:
# Reload the two-tower metrics written by the evaluation cell and publish them.
metrics_path=os.path.join(two_dir,"metrics.json")
slices_path=os.path.join(two_dir,"slices_long.csv")

with open(metrics_path) as f:
    metrics=json.load(f)

run=wandb.init(project="JustiFlicks",job_type="eval",name="04_two_tower_evaluation",reinit=True)
run.log(metrics)

# More than 50000 users is recorded as a full evaluation; a missing count
# is treated as 0.
if metrics.get("users_total",0)>50000:
    evaluation_scope="full"
else:
    evaluation_scope="subsample"

eval_artifact=wandb.Artifact(
    name="two_tower_evaluation",
    type="evaluation",
    metadata={
        "model":"two_tower_content",
        "catalog_restricted":True,
        "evaluation_users":evaluation_scope,
        "primary_metric":"ndcg@10"
    }
)

for artifact_file in (metrics_path,slices_path):
    eval_artifact.add_file(artifact_file)

run.log_artifact(eval_artifact)
run.finish()

0,1
catalog_coverage_fraction,▁
evaluation_catalog_items,▁
items_covered,▁
map5@10,▁
map5@20,▁
map5@5,▁
mean_popularity_num_votes@10,▁
mean_popularity_num_votes@20,▁
mean_popularity_num_votes@5,▁
ndcg@10,▁

0,1
catalog_coverage_fraction,0.64149
evaluation_catalog_items,41823
items_covered,26829
map5@10,0.00765
map5@20,0.00965
map5@5,0.00561
mean_popularity_num_votes@10,360082.43381
mean_popularity_num_votes@20,343316.83085
mean_popularity_num_votes@5,369426.15194
ndcg@10,0.01504


# Compare Baselines

In [None]:
# Set up the baseline comparison: authenticate with W&B and unpack the
# reproducibility settings from CFG into notebook-level names that
# run_repro_eval reads as globals.
import wandb
wandb.login()

device="cuda" if torch.cuda.is_available() else "cpu"

REPRO_CFG=CFG["reproducibility"]

seeds=REPRO_CFG["seeds"]
sample_frac=REPRO_CFG["sample_frac"]
min_users=REPRO_CFG["min_users"]
max_users=REPRO_CFG["max_users"]
# NOTE(review): `k` is not read by run_repro_eval, which ranks to
# max(CFG["k_values"]) and reports the literal "ndcg@10" key - confirm it is still needed.
k=REPRO_CFG["k"]

In [None]:
# Load the inputs for the comparison. `test` must already exist in the kernel.
# NOTE(review): run_repro_eval reads the global `test` directly, not `test_df` -
# confirm whether this copy is used anywhere.
test_df=test.copy()
item_support_table=pd.read_parquet(
    os.path.join(CFG["artifact_root"],"splits","item_support_table.parquet")
)

In [None]:
def predict_topk_from_embeddings(user_emb,item_emb,user2idx,idx2item,user_id,k):
    """Return the k item ids with the highest dot-product score for one user.

    Args:
        user_emb: Array of user vectors, indexed by the values of ``user2idx``.
        item_emb: Array of item vectors, one row per item index.
        user2idx: Mapping from user id to row index in ``user_emb``.
        idx2item: Mapping from item row index to item id.
        user_id: The user to score.
        k: Number of items to return.

    Returns:
        A list of item ids ordered from highest to lowest score, or an empty
        list when ``user_id`` has no entry in ``user2idx``.
    """
    if user_id in user2idx:
        user_vec=user_emb[user2idx[user_id]]
        item_scores=user_vec@item_emb.T
        # Negate so that an ascending argsort yields a descending ranking.
        ranking=np.argsort(-item_scores)
        return [idx2item[int(pos)] for pos in ranking[:k]]
    return []


In [None]:
def run_repro_eval(model_name,artifact_root):
    """Re-score one model's saved embeddings on seeded user subsamples and log to W&B.

    Loads ``user_embeddings.npy``, ``item_embeddings.npy``, ``user_index.csv`` and
    ``item_index.csv`` from ``<artifact_root>/<model_name>``. For every seed in the
    notebook-level ``seeds`` list it draws a user subsample, ranks items by
    dot-product score, evaluates the rankings against the ``test`` split with
    ``evaluate_model`` and records the metrics in a separate W&B run.

    Args:
        model_name: Sub-directory of ``artifact_root`` holding the model
            artifacts; also stored in the W&B run config.
        artifact_root: Root directory containing one folder per model.

    Returns:
        None. Results are printed and logged to W&B.

    Notes:
        Reads the notebook globals ``seeds``, ``sample_frac``, ``min_users``,
        ``max_users``, ``test``, ``item_support_table``, ``CFG`` and
        ``evaluate_model``, and rebinds the global ``item2idx``.
    """
    model_dir=os.path.join(artifact_root,model_name)

    user_emb=np.load(os.path.join(model_dir,"user_embeddings.npy"))
    item_emb=np.load(os.path.join(model_dir,"item_embeddings.npy"))

    user_index=pd.read_csv(os.path.join(model_dir,"user_index.csv"))
    item_index=pd.read_csv(os.path.join(model_dir,"item_index.csv"))

    user2idx=dict(zip(user_index.userId,user_index.user_idx))
    idx2item=dict(zip(item_index.item_idx,item_index.movieId))

    # NOTE(review): this rebinds a notebook-level name on every call; presumably
    # evaluate_model or a later cell reads it - confirm before removing.
    global item2idx
    item2idx=dict(zip(item_index.movieId,item_index.item_idx))

    all_users=np.array(list(user2idx.keys()))

    # The sample size does not depend on the seed, so compute it once.
    n_sample=min(max_users,max(min_users,int(len(all_users)*sample_frac)))
    # Cap at the population size: rng.choice(..., replace=False) raises
    # ValueError when asked for more users than the model actually has.
    n_sample=min(n_sample,len(all_users))
    top_k=max(CFG["k_values"])

    for seed in seeds:
        rng=np.random.RandomState(seed)
        sampled_users=rng.choice(all_users,size=n_sample,replace=False)

        test_sub=test[test.userId.isin(sampled_users)]

        # Per-user {movieId: rating}; zipping the column arrays avoids the
        # per-row Series construction of DataFrame.iterrows().
        gt={
            int(uid):{
                int(movie_id):float(rating)
                for movie_id,rating in zip(g.movieId.values,g.rating.values)
            }
            for uid,g in test_sub.groupby("userId")
        }

        preds={
            uid:predict_topk_from_embeddings(
                user_emb,item_emb,user2idx,idx2item,uid,top_k
            )
            for uid in gt
        }

        metrics,_=evaluate_model(
            preds,gt,item_support_table,ks=CFG["k_values"]
        )

        ndcg10=metrics["ndcg@10"]
        print(f"[REPRO] model={model_name} seed={seed} ndcg@10={ndcg10:.6f}")

        run=wandb.init(
            project="JustiFlicks",
            job_type="compare",
            name="01_baseline_ndcg",
            reinit=True,
            settings=wandb.Settings(silent=True),
            config={
                "model":model_name,
                "seed":seed,
                "evaluation_type":"reproducibility",
                "sample_frac":sample_frac
            }
        )

        # NOTE(review): "ndcg@10" is logged on its own and again as part of
        # `metrics`; kept as-is so existing W&B history keeps its shape.
        run.log({"ndcg@10":ndcg10})
        run.log(metrics)

        run.finish()

In [None]:
# Run the reproducibility evaluation for each model folder under the artifact
# root; every call prints one [REPRO] line and logs one W&B run per seed.
for model_name in ["MF","implicitBPR","twoTower"]:
    run_repro_eval(model_name,CFG["artifact_root"])

[REPRO] model=MF seed=34 ndcg@10=0.023773
[REPRO] model=MF seed=35 ndcg@10=0.023246
[REPRO] model=MF seed=36 ndcg@10=0.022872
[REPRO] model=implicitBPR seed=34 ndcg@10=0.031074
[REPRO] model=implicitBPR seed=35 ndcg@10=0.033296
[REPRO] model=implicitBPR seed=36 ndcg@10=0.031438
[REPRO] model=twoTower seed=34 ndcg@10=0.015156
[REPRO] model=twoTower seed=35 ndcg@10=0.014536
[REPRO] model=twoTower seed=36 ndcg@10=0.014145


# Archived Experiments (Do Not Run)

## LightFM
- Fails to build on Colab due to Python 3.12 / NumPy 2.0 incompatibility
- No maintained wheels; requires downgrading Python
- Replaced by two-tower content model

In [None]:
# Archived and disabled via `if False`: the captured output of this cell shows
# the wheel build for lightfm-1.17 failing on this runtime.
if False:
  %pip install lightfm

Collecting lightfm
  Using cached lightfm-1.17.tar.gz (316 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for lightfm (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for lightfm[0m[31m
[0m[?25h  Running setup.py clean for lightfm
Failed to build lightfm
[31mERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (lightfm)[0m[31m
[0m

In [None]:
if False:
  # Archived LightFM BPR baseline, disabled via `if False` (lightfm could not be
  # installed on this runtime). Trains one epoch at a time and early-stops on
  # validation NDCG@10. Relies on user2idx, item2idx, idx2item, train_cf, val,
  # item_support_table and evaluate_model already existing in the kernel.
  import copy
  import numpy as np
  from scipy.sparse import coo_matrix
  from lightfm import LightFM

  np.random.seed(CFG["seed"])
  rng=np.random.RandomState(CFG["seed"])

  n_users=len(user2idx)
  n_items=len(item2idx)

  rows=train_cf.u.values
  cols=train_cf.i.values
  # Named `interaction_values` rather than `data` so the notebook-level alias
  # `import torch.utils.data as data` is not overwritten.
  interaction_values=np.ones(len(rows),dtype=np.int32)

  interactions=coo_matrix(
      (interaction_values,(rows,cols)),
      shape=(n_users,n_items)
  ).tocsr()

  model=LightFM(
      no_components=32,
      loss="bpr",
      learning_rate=0.01,
      item_alpha=1e-4,
      user_alpha=1e-6,
      random_state=CFG["seed"]
  )

  max_epochs=30
  patience=3
  min_delta=1e-4

  best_metric=-np.inf
  epochs_no_improve=0
  best_state=None

  items_all=np.arange(n_items,dtype=np.int32)

  def predict_topk_lf(model,user_id,k=20):
      """Return the k highest-scoring movie ids for user_id ([] if the user is unknown)."""
      if user_id not in user2idx:
          return []
      uidx=user2idx[user_id]
      scores=model.predict(
          np.repeat(uidx,n_items),
          items_all,
          num_threads=4
      )
      topk_idx=np.argsort(-scores)[:k]
      return [idx2item[int(j)] for j in topk_idx]

  val_users_all=val.userId.unique()
  val_users_small=set(
      rng.choice(
          val_users_all,
          size=min(10000,len(val_users_all)),
          replace=False
      )
  )

  # The validation ground truth does not change between epochs: build it once.
  val_ground_truth={
      int(uid):{
          int(movie_id):float(rating)
          for movie_id,rating in zip(g.movieId.values,g.rating.values)
      }
      for uid,g in val.groupby("userId")
      if uid in val_users_small
  }

  for epoch in range(max_epochs):
      # fit_partial continues from the current weights; LightFM.fit() resets the
      # model first, which would restart training from scratch on every epoch.
      model.fit_partial(
          interactions,
          epochs=1,
          num_threads=4,
          verbose=False
      )

      val_predictions={
          uid:predict_topk_lf(model,uid,k=10)
          for uid in val_ground_truth
          if uid in user2idx
      }

      val_metrics,_=evaluate_model(
          val_predictions,
          val_ground_truth,
          item_support_table,
          ks=[10]
      )

      current_metric=val_metrics["ndcg@10"]
      ci_low=val_metrics.get("ndcg@10_ci_low")
      ci_high=val_metrics.get("ndcg@10_ci_high")

      # The CI keys are read with .get(), so they may be absent; formatting
      # None with :.6f would raise TypeError.
      if ci_low is not None and ci_high is not None:
          ci_text=f"ci=({ci_low:.6f},{ci_high:.6f})"
      else:
          ci_text="ci=(n/a)"

      print(
          f"epoch={epoch} "
          f"val_ndcg@10={current_metric:.6f} "
          f"{ci_text}"
      )

      if current_metric>best_metric+min_delta:
          best_metric=current_metric
          # deepcopy handles every attribute type; calling .copy() on each value
          # fails for the str/float hyper-parameters stored on the model.
          best_state=copy.deepcopy(model.__dict__)
          epochs_no_improve=0
      else:
          epochs_no_improve+=1

      if epochs_no_improve>=patience:
          print("early stopping triggered")
          break

  if best_state is not None:
      model.__dict__.update(best_state)

ModuleNotFoundError: No module named 'lightfm'

## LightGCN
- Bipartite graph construction for ML-32M exceeded Colab CPU/GPU RAM limits (OOM during adjacency materialization).
- High sparsity (≈99.8%) amplified memory overhead with limited gains over MF baselines at this scale.
- Replaced by implicit BPR for CF recall.

In [None]:
if False:
  # Archived hand-rolled LightGCN experiment, disabled via `if False`.
  # Builds a symmetric user-item graph from positive ratings, trains with BPR and
  # early-stops on validation NDCG@10.
  # Caution: rebinds the notebook-level names train, val, test and
  # item_support_table to the *_small splits. Relies on evaluate_model already
  # existing in the kernel.
  train=pd.read_parquet(os.path.join(CFG["artifact_root"],"splits/train_small.parquet"))
  val=pd.read_parquet(os.path.join(CFG["artifact_root"],"splits/val_small.parquet"))
  test=pd.read_parquet(os.path.join(CFG["artifact_root"],"splits/test_small.parquet"))
  item_support_table=pd.read_parquet(os.path.join(CFG["artifact_root"],"splits/item_support_table.parquet"))

  device="cuda" if torch.cuda.is_available() else "cpu"
  torch.manual_seed(CFG["seed"])
  np.random.seed(CFG["seed"])
  rng=np.random.RandomState(CFG["seed"])

  rating_threshold=CFG["rating_threshold"]
  max_epochs=30
  patience=3
  min_delta=1e-4

  train_cf=train[train.rating>=rating_threshold][["userId","movieId"]].copy()

  # Keep only positives whose movie appears in movieLinks. The rows themselves
  # are filtered (not just the id list) so every movieId maps to an index;
  # otherwise Series.map yields NaN for dropped movies and corrupts the edge list.
  links=pd.read_parquet(os.path.join(CFG["data_root"],"movieLinks.parquet"))
  cf_movie_set=set(links.movieId.astype(int))
  train_cf=train_cf[train_cf.movieId.isin(list(cf_movie_set))].copy()

  user_ids=train_cf.userId.unique()
  item_ids=train_cf.movieId.unique()

  user2idx={u:i for i,u in enumerate(user_ids)}
  item2idx={m:i for i,m in enumerate(item_ids)}
  idx2item={i:m for m,i in item2idx.items()}

  train_cf["u"]=train_cf.userId.map(user2idx)
  train_cf["i"]=train_cf.movieId.map(item2idx)

  n_users=len(user2idx)
  n_items=len(item2idx)
  n_nodes=n_users+n_items

  # Item nodes are offset by n_users so users and items share one index space.
  u_nodes=train_cf.u.values
  i_nodes=train_cf.i.values+n_users

  # Each interaction contributes both directions (user->item and item->user).
  edge_u=np.concatenate([u_nodes,i_nodes])
  edge_v=np.concatenate([i_nodes,u_nodes])
  edge_index=torch.tensor(
      np.vstack([edge_u,edge_v]),
      dtype=torch.long,
      device=device
  )

  # Symmetric normalisation weight per edge: 1/sqrt(deg(src)*deg(dst)).
  deg=torch.zeros(n_nodes,device=device)
  deg.scatter_add_(0,edge_index[0],torch.ones(edge_index.size(1),device=device))
  deg_inv_sqrt=deg.pow(-0.5)
  deg_inv_sqrt[deg_inv_sqrt==float("inf")]=0.0  # isolated nodes contribute nothing
  # Index edge_index directly; cloning it first only duplicated the edge list in memory.
  norm_weight=deg_inv_sqrt[edge_index[0]]*deg_inv_sqrt[edge_index[1]]

  class LightGCN(nn.Module):
      """Single embedding table propagated K times over the normalised graph."""

      def __init__(self,n_users,n_items,dim,K):
          super().__init__()
          self.n_users=n_users
          self.n_items=n_items
          self.K=K
          self.emb=nn.Embedding(n_users+n_items,dim)
          nn.init.normal_(self.emb.weight,std=0.01)

      def propagate(self,edge_index,edge_weight):
          """Return the mean of the layer-0..K node embeddings."""
          all_emb=[self.emb.weight]
          x=self.emb.weight
          for _ in range(self.K):
              x=torch.zeros_like(x).index_add_(
                  0,
                  edge_index[0],
                  x[edge_index[1]]*edge_weight.unsqueeze(1)
              )
              all_emb.append(x)
          all_emb=torch.stack(all_emb,dim=0)
          return all_emb.mean(dim=0)

      def forward(self,edge_index,edge_weight):
          """Return (user_embeddings, item_embeddings) after propagation."""
          out=self.propagate(edge_index,edge_weight)
          return out[:self.n_users],out[self.n_users:]

  def bpr_loss(u_emb,pos_emb,neg_emb,lambda_reg):
      """BPR ranking loss plus an L2 penalty on the batch embeddings."""
      pos_scores=(u_emb*pos_emb).sum(dim=1)
      neg_scores=(u_emb*neg_emb).sum(dim=1)
      loss=-F.logsigmoid(pos_scores-neg_scores).mean()
      reg=(u_emb.norm(2).pow(2)+pos_emb.norm(2).pow(2)+neg_emb.norm(2).pow(2))/u_emb.size(0)
      return loss+lambda_reg*reg

  user_pos=defaultdict(list)
  for u,i in zip(train_cf.u.values,train_cf.i.values):
      user_pos[u].append(i)
  # Built once: set membership for negative rejection and a reusable user array,
  # instead of rebuilding the user list and scanning lists on every batch.
  user_pos_set={u:set(items) for u,items in user_pos.items()}
  user_pool=np.array(list(user_pos.keys()))

  # Popularity-smoothed negative sampling distribution (count**alpha).
  item_counts=pd.Series(train_cf.i).value_counts().reindex(range(n_items)).fillna(0).values
  alpha=0.75
  probs=(item_counts**alpha)
  probs=probs/probs.sum()

  def sample_batch(bs):
      """Sample bs (user, positive item, negative item) index triples as tensors."""
      users=rng.choice(user_pool,size=bs)
      pos=[]
      neg=[]
      for u in users:
          p=rng.choice(user_pos[u])
          n=rng.choice(n_items,p=probs)
          # Reject negatives the user has actually interacted with.
          while n in user_pos_set[u]:
              n=rng.choice(n_items,p=probs)
          pos.append(p)
          neg.append(n)
      return torch.tensor(users,device=device),torch.tensor(pos,device=device),torch.tensor(neg,device=device)

  val_users_all=val.userId.unique()
  val_users_small=set(rng.choice(val_users_all,size=min(10000,len(val_users_all)),replace=False))

  # The validation ground truth does not change between epochs: build it once.
  val_ground_truth={
      int(uid):{
          int(movie_id):float(rating)
          for movie_id,rating in zip(g.movieId.values,g.rating.values)
      }
      for uid,g in val.groupby("userId")
      if uid in val_users_small
  }

  def predict_topk_lgcn(user_emb,item_emb,user_id,k=10):
      """Return the k highest-scoring movie ids for user_id ([] if the user is unknown)."""
      if user_id not in user2idx:
          return []
      u=user2idx[user_id]
      scores=torch.matmul(item_emb,user_emb[u]).cpu().numpy()
      topk=np.argsort(-scores)[:k]
      return [idx2item[int(i)] for i in topk]

  best_metric=-np.inf
  epochs_no_improve=0
  best_state=None

  K=3
  dim=64
  lambda_reg=1e-4

  model=LightGCN(n_users,n_items,dim,K).to(device)
  opt=torch.optim.Adam(model.parameters(),lr=1e-3)

  for epoch in range(max_epochs):
      model.train()
      losses=[]
      for _ in range(len(train_cf)//4096):
          u,p,n=sample_batch(4096)
          user_emb,item_emb=model(edge_index,norm_weight)
          loss=bpr_loss(
              user_emb[u],
              item_emb[p],
              item_emb[n],
              lambda_reg
          )
          opt.zero_grad()
          loss.backward()
          opt.step()
          losses.append(loss.item())

      model.eval()
      with torch.no_grad():
          user_emb,item_emb=model(edge_index,norm_weight)

          val_predictions={
              uid:predict_topk_lgcn(user_emb,item_emb,uid,k=10)
              for uid in val_ground_truth
              if uid in user2idx
          }

          val_metrics,_=evaluate_model(
              val_predictions,
              val_ground_truth,
              item_support_table,
              ks=[10]
          )

          current_metric=val_metrics["ndcg@10"]
          ci_low=val_metrics.get("ndcg@10_ci_low")
          ci_high=val_metrics.get("ndcg@10_ci_high")

      # The CI keys are read with .get(), so they may be absent; formatting
      # None with :.6f would raise TypeError.
      if ci_low is not None and ci_high is not None:
          ci_text=f"ci=({ci_low:.6f},{ci_high:.6f})"
      else:
          ci_text="ci=(n/a)"

      print(
          f"epoch={epoch} "
          f"loss={np.mean(losses):.4f} "
          f"val_ndcg@10={current_metric:.6f} "
          f"{ci_text}"
      )

      if current_metric>best_metric+min_delta:
          best_metric=current_metric
          best_state={
              # state_dict() returns references to the live parameters, which
              # the optimizer keeps updating in place; clone to take a snapshot.
              "model":{name:t.detach().clone() for name,t in model.state_dict().items()},
              "user_emb":user_emb.cpu(),
              "item_emb":item_emb.cpu()
          }
          epochs_no_improve=0
      else:
          epochs_no_improve+=1

      if epochs_no_improve>=patience:
          print("early stopping triggered")
          break

  if best_state is not None:
      model.load_state_dict(best_state["model"])

## RecBole LightGCN
- Difficulty integrating native MovieLens support with required temporal per-user splits and custom evaluation.
- Multiple dependency conflicts on Colab (Ray, NumPy 2.0, PyTorch distributed barriers).
- Internal RecBole dependency error.
- Native RecBole datasets aren't up to date with MovieLens (only 20M is available, not 32M).

In [None]:
if False:
  # Archived RecBole LightGCN attempt on the project's own splits, disabled via
  # `if False`. Writes train/val/test as one tab-separated .inter file with a
  # "split" column and passes a config dict to run_recbole.
  import os
  import pandas as pd
  import numpy as np
  import torch
  from recbole.quick_start import run_recbole

  SEED=CFG["seed"]
  np.random.seed(SEED)
  torch.manual_seed(SEED)

  # NOTE(review): the setup cell creates "<artifact_root>/lightgcn" (lower case);
  # this uses "lightGCN" - confirm which directory is intended.
  BASE_DIR=os.path.join(CFG["artifact_root"],"lightGCN")
  DATASET="ml32m_small"
  DATA_DIR=os.path.join(BASE_DIR,DATASET)
  os.makedirs(DATA_DIR,exist_ok=True)

  train_df=pd.read_parquet(os.path.join(CFG["artifact_root"],"splits/train_small.parquet"))
  val_df=pd.read_parquet(os.path.join(CFG["artifact_root"],"splits/val_small.parquet"))
  test_df=pd.read_parquet(os.path.join(CFG["artifact_root"],"splits/test_small.parquet"))

  cols=["userId","movieId","rating","datetime"]

  train_df=train_df[cols].copy()
  val_df=val_df[cols].copy()
  test_df=test_df[cols].copy()

  # Tag every row with the split it came from before concatenating.
  train_df["split"]="train"
  val_df["split"]="valid"
  test_df["split"]="test"

  full_df=pd.concat([train_df,val_df,test_df],ignore_index=True)

  full_df=full_df.rename(columns={
      "userId":"user_id",
      "movieId":"item_id",
      "datetime":"timestamp"
  })

  # Convert datetimes to integer seconds (int64 nanoseconds floor-divided by 1e9).
  full_df["timestamp"]=pd.to_datetime(full_df["timestamp"]).astype("int64")//10**9

  # NOTE(review): the header is written as plain column names; RecBole atomic
  # files are believed to expect typed headers (e.g. "user_id:token") - confirm.
  inter_path=os.path.join(DATA_DIR,f"{DATASET}.inter")
  full_df.to_csv(inter_path,sep="\t",index=False)

  config={
      "data_path":BASE_DIR,
      "dataset":DATASET,

      "USER_ID_FIELD":"user_id",
      "ITEM_ID_FIELD":"item_id",
      "RATING_FIELD":"rating",
      "TIME_FIELD":"timestamp",

      # NOTE(review): verify the expected load_col key name against the RecBole docs.
      "load_col":{
          "interactions":["user_id","item_id","rating","timestamp","split"]
      },

      # NOTE(review): intended to reuse the precomputed "split" column; confirm
      # that RecBole supports this form of eval_args["split"].
      "eval_args":{
          "split":{"LS":"split"},
          "order":"TO",
          "group_by":"user"
      },

      "neg_sampling":None,
      "loss_type":"BPR",

      "embedding_size":64,
      "n_layers":3,
      "reg_weight":1e-4,
      "learning_rate":1e-3,
      "epochs":30,

      "train_batch_size":2048,
      "eval_batch_size":4096,

      "valid_metric":"NDCG@10",
      "stopping_step":3,
      "topk":[10],

      "seed":SEED,
      "device":"cuda" if torch.cuda.is_available() else "cpu",
      "save_model":True,
      # NOTE(review): hard-coded under "/content/drive/MyDrive" while the CFG
      # paths use "/content/drive/My Drive" - confirm the intended location.
      "checkpoint_dir":"/content/drive/MyDrive/recbole_ckpt"
  }

  run_recbole(
      model="LightGCN",
      dataset=DATASET,
      config_dict=config
  )

In [None]:
if False:
  !pip install recbole



In [None]:
if False:
  !pip install ray[tune]



In [None]:
if False:
  # Archived: build a RecBole-style interaction frame from the positive training
  # ratings. Disabled via `if False`.
  # Caution: rebinds the notebook-level names train, val and test to the
  # *_small splits.
  DATASET="ml32m_custom"
  BASE_DIR="/content/recbole"
  DATA_DIR=os.path.join(BASE_DIR,DATASET)
  os.makedirs(DATA_DIR,exist_ok=True)

  train=pd.read_parquet(os.path.join(CFG["artifact_root"],"splits/train_small.parquet"))
  val=pd.read_parquet(os.path.join(CFG["artifact_root"],"splits/val_small.parquet"))
  test=pd.read_parquet(os.path.join(CFG["artifact_root"],"splits/test_small.parquet"))
  rating_threshold=CFG["rating_threshold"]
  train_cf=train[train.rating>=rating_threshold].copy()
  train_df=pd.DataFrame({
      "user_id":train_cf.userId.astype(int).values,
      # The split files name the item column "movieId"; "itemId" does not exist
      # and raised AttributeError.
      "item_id":train_cf.movieId.astype(int).values,
      # Implicit feedback: every retained rating becomes a 1.0 interaction with
      # a zero placeholder timestamp.
      "rating":np.ones(len(train_cf),dtype=float),
      "timestamp":np.zeros(len(train_cf),dtype=int),
      "split":["train"]*len(train_cf)
  })


AttributeError: 'DataFrame' object has no attribute 'u'

In [None]:
if False:
  !pip install "numpy<2.0"

Collecting numpy<2.0
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m97.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jax 0.7.2 requires numpy>=2.0, but you have numpy 1.

In [None]:
if False:
  !pip install kmeans-pytorch

Collecting kmeans-pytorch
  Downloading kmeans_pytorch-0.3-py3-none-any.whl.metadata (1.6 kB)
Downloading kmeans_pytorch-0.3-py3-none-any.whl (4.4 kB)
Installing collected packages: kmeans-pytorch
Successfully installed kmeans-pytorch-0.3


In [None]:
if False:
  # Archived RecBole LightGCN run on RecBole's built-in "ml-20m" dataset,
  # disabled via `if False`. After training it tries to export the model weights
  # and the user/item embeddings as parquet files.
  from recbole.quick_start import run_recbole
  import torch
  import numpy as np
  import pandas as pd
  import os

  SEED=CFG["seed"]

  config={
      "dataset":"ml-20m",
      "distributed": False,

      "USER_ID_FIELD":"user_id",
      "ITEM_ID_FIELD":"item_id",
      "RATING_FIELD":"rating",
      "TIME_FIELD":"timestamp",

      "load_col":{
          "interactions":["user_id","item_id","rating","timestamp"]
      },

      "eval_args":{
          "split":{"LS":"by_time","num":[5,5]},
          "order":"TO",
          "group_by":"user"
      },

      # NOTE(review): the other RecBole cell spells this "loss_type":"BPR" -
      # confirm which key RecBole actually reads.
      "loss":"bpr",
      "embedding_size":64,
      "n_layers":3,
      "reg_weight":1e-4,
      "learning_rate":1e-3,
      "epochs":30,
      "train_batch_size":2048,
      "eval_batch_size":4096,

      "valid_metric":"NDCG@10",
      "stopping_step":3,
      "topk":[10],

      "seed":CFG["seed"],
      "device":"cuda" if torch.cuda.is_available() else "cpu",
      "save_model":True,
      "checkpoint_dir":os.path.join(CFG["artifact_root"],"lightGCN")
  }


  result=run_recbole(model="LightGCN",dataset="ml-20m",config_dict=config)

  # NOTE(review): this assumes run_recbole returns a sequence of
  # (model, dataset, dataloaders, config, ...). Confirm against the installed
  # RecBole version; if it returns a results dict, the slicing below fails.
  try:
      model,dataset,train_data,valid_data,test_data,rec_config,device=result[:7]
  except Exception:
      model,dataset,train_data,valid_data,test_data,rec_config=result[:6]

  # NOTE(review): `artifact_root` is not defined in this cell; presumably
  # CFG["artifact_root"] (as used for checkpoint_dir above) was intended - confirm.
  os.makedirs(os.path.join(artifact_root,"lightGCN"),exist_ok=True)
  torch.save(model.state_dict(),os.path.join(artifact_root,"lightGCN","lightgcn_final.pt"))

  user_emb=model.get_user_embedding().detach().cpu().numpy()
  item_emb=model.get_item_embedding().detach().cpu().numpy()

  # Invert RecBole's token->internal-id map to recover the original movie ids.
  item_token2id=dataset.field2token_id["item_id"]
  item_id2token={v:k for k,v in item_token2id.items()}

  # NOTE(review): int(...) assumes every item token is numeric - confirm that no
  # padding or placeholder token sits at one of these indices.
  item_idx=np.arange(item_emb.shape[0],dtype=int)
  movie_ids=[int(item_id2token[i]) for i in item_idx]

  # NOTE(review): `links` is not defined in this cell; it must already exist in the kernel.
  item_map=pd.DataFrame({"item_idx":item_idx,"movieId":movie_ids})
  item_map=item_map.merge(links[["movieId","imdbId"]],on="movieId",how="left")

  item_emb_df=pd.DataFrame(item_emb,columns=[f"emb_{i}" for i in range(item_emb.shape[1])])
  item_out=pd.concat([item_map.reset_index(drop=True),item_emb_df.reset_index(drop=True)],axis=1)
  item_out.to_parquet(os.path.join(artifact_root,"lightGCN","item_embeddings.parquet"),index=False)

  user_idx=np.arange(user_emb.shape[0],dtype=int)
  user_tokens=[int(dataset.id2token(dataset.uid_field,i)) for i in user_idx]
  user_map=pd.DataFrame({"user_idx":user_idx,"userId":user_tokens})
  user_emb_df=pd.DataFrame(user_emb,columns=[f"emb_{i}" for i in range(user_emb.shape[1])])
  user_out=pd.concat([user_map.reset_index(drop=True),user_emb_df.reset_index(drop=True)],axis=1)
  user_out.to_parquet(os.path.join(artifact_root,"lightGCN","user_embeddings.parquet"),index=False)

  # NOTE(review): the message says "ml32m" although this cell trains on "ml-20m".
  print("lightgcn_ml32m_done")



AttributeError: 'NoneType' object has no attribute 'index'