<a href="https://colab.research.google.com/github/Candice-24/Movie-Cold-Start-Item-Solution-with-NLP-model/blob/main/NLP_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cold Start Project Summary

The item cold-start problem — how to recommend a newly added item (e.g., a new movie) without any historical user interactions (ratings, views) — is a fundamental challenge in recommender systems. Traditional collaborative filtering methods fail because they depend on prior ratings or viewing histories. This project aims to solve this problem by building a content-based system to identify and rank the existing users most likely to be the "first" and most receptive audience for a new, unseen item.

# Import package

In [5]:
!pip install numpy pandas scikit-learn tmdbsimple
!pip install -q sentence-transformers faiss-cpu tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m94.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [31]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import faiss
import requests
import math # For metrics
import re
from scipy.sparse import vstack, csr_matrix

# MovieLens Latest dataset

In [7]:
!gdown 1I7UYj-NpIUKrcQ16dv6YXkFJd6h3lSLz # MovieLens link.csv
!gdown 1hhGbrgERPczXEpNSzS8wXSDRs4dkniiZ # MovieLens movies.csv
!gdown 1YOoWaas0sTTGYOZlNmd0mlH_uMWRoKyt # MovieLens ratings.csv
!gdown 1fXH9A4KWZk_m1AUyp_6t-1ewYtrjEEb9 # MovieLens tags.csv

Downloading...
From: https://drive.google.com/uc?id=1I7UYj-NpIUKrcQ16dv6YXkFJd6h3lSLz
To: /content/links.csv
100% 198k/198k [00:00<00:00, 4.23MB/s]
Downloading...
From: https://drive.google.com/uc?id=1hhGbrgERPczXEpNSzS8wXSDRs4dkniiZ
To: /content/movies.csv
100% 494k/494k [00:00<00:00, 5.83MB/s]
Downloading...
From: https://drive.google.com/uc?id=1YOoWaas0sTTGYOZlNmd0mlH_uMWRoKyt
To: /content/ratings.csv
100% 2.48M/2.48M [00:00<00:00, 18.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1fXH9A4KWZk_m1AUyp_6t-1ewYtrjEEb9
To: /content/tags.csv
100% 119k/119k [00:00<00:00, 3.55MB/s]


In [8]:
MovieLink = pd.read_csv('/content/links.csv') # movieId, imdbId, tmdbId
MovieID = pd.read_csv('/content/movies.csv') # movieId, title, genres
MovieRating = pd.read_csv('/content/ratings.csv') # userId, movieId, rating, timestamp
MovieTag = pd.read_csv('/content/tags.csv') # userId, movieId, tag, timestamp

In [10]:
print(MovieID.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [11]:
print(MovieRating.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [12]:
print(MovieTag.head())

   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferrell  1445714992
3       2    89774     Boxing story  1445715207
4       2    89774              MMA  1445715200


In [13]:
print(MovieLink.head())

   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0


# TMDB metadata acquisition

In [35]:
USE_TMDB = True # Define USE_TMDB flag
TMDB_API_KEY = "1958fdb6762bc7b4c44677d7b550aeee"
TMDB_BASE_URL = "https://api.themoviedb.org/3/movie/"

In [15]:
def fetch_tmdb_metadata_for_movie(movie_id, tmdb_id, api_key=TMDB_API_KEY):
  """
    Call TMDB API for a single movie and return a dictionary of metadata.
    Use append_to_response=credits,keywords to get everything in one call.
  """
  if pd.isna(tmdb_id):
        return None  # no TMDB id available

  try:
      url = f"{TMDB_BASE_URL}{int(tmdb_id)}"
      params = {"api_key": api_key,"append_to_response": "credits,keywords"}
      r = requests.get(url, params=params)
      if r.status_code != 200:
          return None
      data = r.json()

      # 1. Director (from credits.crew where job == "Director")
      director = None
      credits = data.get("credits", {})
      crew = credits.get("crew", [])
      for c in crew:
          if c.get("job") == "Director":
              director = c.get("name")
              break

      # 2. Top-5 cast names
      cast_list = credits.get("cast", [])
      cast_names = [c.get("name") for c in cast_list[:5] if c.get("name")]

      # 3. Genres (text names)
      genre_list = data.get("genres", [])
      genre_names = [g.get("name") for g in genre_list if g.get("name")]

      # 4. Keywords
      kw_obj = data.get("keywords", {})
      kw_list = kw_obj.get("keywords", [])  # TMDB returns {"keywords":[...]}
      keyword_names = [k.get("name") for k in kw_list if k.get("name")]

      # 5. Other useful fields
      overview = data.get("overview")
      release_date = data.get("release_date")
      runtime = data.get("runtime")
      vote_average = data.get("vote_average")

      return {
          "movieId": movie_id,
          "tmdbId": tmdb_id,
          "overview": overview,
          "director": director,
          "cast": ", ".join(cast_names),
          "genres_tmdb": "|".join(genre_names),
          "keywords_tmdb": " ".join(keyword_names),
          "release_date": release_date,
          "runtime": runtime,
          "vote_average": vote_average,
        }

  except Exception as e:
      # return None if it fails
      print(f"Error fetching TMDB for movieId={movie_id}, tmdbId={tmdb_id}: {e}")
      return None

In [16]:
def build_tmdb_metadata_table(links_df, max_movies=None):
    """
    Iterate over links.csv, call TMDB for each tmdbId, and build a metadata DataFrame.
    """
    rows = []
    links_iter = links_df[['movieId', 'tmdbId']].dropna()

    if max_movies is not None:
        links_iter = links_iter.head(max_movies)

    for _, row in tqdm(links_iter.iterrows(), total=len(links_iter), desc="Fetching TMDB"):
        movie_id = row['movieId']
        tmdb_id = row['tmdbId']
        meta = fetch_tmdb_metadata_for_movie(movie_id, tmdb_id)
        if meta is not None:
            rows.append(meta)

    if not rows:
        return pd.DataFrame()

    meta_df = pd.DataFrame(rows)
    return meta_df.set_index("movieId")


In [17]:
tmdb_meta = build_tmdb_metadata_table(MovieLink)
print("TMDB metadata shape:", tmdb_meta.shape)
tmdb_meta.head()

Fetching TMDB:   0%|          | 0/9734 [00:00<?, ?it/s]

TMDB metadata shape: (9622, 9)


Unnamed: 0_level_0,tmdbId,overview,director,cast,genres_tmdb,keywords_tmdb,release_date,runtime,vote_average
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1.0,862.0,"Led by Woody, Andy's toys live happily in his ...",John Lasseter,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...",Family|Comedy|Animation|Adventure,rescue friendship mission jealousy villain bul...,1995-11-22,81,8.0
2.0,8844.0,When siblings Judy and Peter discover an encha...,Joe Johnston,"Robin Williams, Kirsten Dunst, Bradley Pierce,...",Adventure|Fantasy|Family,giant insect board game disappearance jungle r...,1995-12-15,104,7.243
3.0,15602.0,A family wedding reignites the ancient feud be...,Howard Deutch,"Walter Matthau, Jack Lemmon, Ann-Margret, Soph...",Romance|Comedy,fishing sequel old man best friend wedding ita...,1995-12-22,101,6.483
4.0,31357.0,"Cheated on, mistreated and stepped on, the wom...",Forest Whitaker,"Whitney Houston, Angela Bassett, Loretta Devin...",Comedy|Drama|Romance,based on novel or book single mother divorce a...,1995-12-22,127,6.3
5.0,11862.0,Just when George Banks has recovered from his ...,Charles Shyer,"Steve Martin, Diane Keaton, Martin Short, Kimb...",Comedy|Family,daughter baby parent child relationship midlif...,1995-12-08,106,6.3


# Build unified movie-level content

Combine: MovieLens genres + tags + TMDB director/cast/keywords/overview

This is the "Movie Document" (for TF-IDF) and the semantic description (for SBERT).

In [18]:
# Aggregate tags by movieId into one string
tags_agg = (
    MovieTag
    .groupby("movieId")["tag"]
    .apply(lambda x: " ".join(str(t).replace(" ", "_") for t in x))
    .rename("all_tags")
    .reset_index()
)

In [45]:
# Merge movies.csv with tags and TMDB metadata
movies_full = (MovieID.merge(tags_agg, on="movieId", how="left")
.merge(tmdb_meta, on="movieId", how="left"))  # tmdb_meta is indexed on movieId

movies_full.head()

Unnamed: 0,movieId,title,genres,all_tags,tmdbId,overview,director,cast,genres_tmdb,keywords_tmdb,release_date,runtime,vote_average
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar pixar fun,862.0,"Led by Woody, Andy's toys live happily in his ...",John Lasseter,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...",Family|Comedy|Animation|Adventure,rescue friendship mission jealousy villain bul...,1995-11-22,81.0,8.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,fantasy magic_board_game Robin_Williams game,8844.0,When siblings Judy and Peter discover an encha...,Joe Johnston,"Robin Williams, Kirsten Dunst, Bradley Pierce,...",Adventure|Fantasy|Family,giant insect board game disappearance jungle r...,1995-12-15,104.0,7.243
2,3,Grumpier Old Men (1995),Comedy|Romance,moldy old,15602.0,A family wedding reignites the ancient feud be...,Howard Deutch,"Walter Matthau, Jack Lemmon, Ann-Margret, Soph...",Romance|Comedy,fishing sequel old man best friend wedding ita...,1995-12-22,101.0,6.483
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,,31357.0,"Cheated on, mistreated and stepped on, the wom...",Forest Whitaker,"Whitney Houston, Angela Bassett, Loretta Devin...",Comedy|Drama|Romance,based on novel or book single mother divorce a...,1995-12-22,127.0,6.3
4,5,Father of the Bride Part II (1995),Comedy,pregnancy remake,11862.0,Just when George Banks has recovered from his ...,Charles Shyer,"Steve Martin, Diane Keaton, Martin Short, Kimb...",Comedy|Family,daughter baby parent child relationship midlif...,1995-12-08,106.0,6.3


In [48]:
# Extract release_year for temporal split

def get_release_year(row):
    """
    Try TMDB release_date first; if not available, parse year from title "(1995)".
    """
    # From TMDB
    if USE_TMDB:
        rd = row.get("release_date", None)
        if pd.notna(rd):
            try:
                return int(str(rd)[:4])
            except:
                pass
    # Fallback: year in title, e.g., Toy Story (1995)
    title = str(row.get("title", ""))
    m = re.search(r"\((\d{4})\)\s*$", title)
    if m:
        return int(m.group(1))
    return None

In [22]:
def build_movie_document(row):
    """
    Build a "bag-of-words" style document for TF-IDF:
    - tag features like genre_, director_, cast_, kw_ for interpretability.
    - Then append the raw text (title + overview).
    """
    tokens = []

    # Genres from MovieLens
    genres_ml = str(row.get("genres", ""))
    if genres_ml:
        for g in genres_ml.split("|"):
            g = g.strip()
            if g and g != "(no genres listed)":
                tokens.append("genre_" + g.lower().replace(" ", ""))

    # TMDB genres (optional, often overlapping)
    genres_tmdb = str(row.get("genres_tmdb", ""))
    if genres_tmdb and genres_tmdb != "nan":
        for g in genres_tmdb.split("|"):
            g = g.strip()
            if g:
                tokens.append("genre_" + g.lower().replace(" ", ""))

    # Tags from tags.csv
    tags = str(row.get("all_tags", ""))
    if tags and tags != "nan":
        for t in tags.split():
            tokens.append("tag_" + t.lower())

    # Director
    director = str(row.get("director", ""))
    if director and director != "nan":
        name_clean = director.lower().replace(" ", "").replace("-", "")
        tokens.append("director_" + name_clean)

    # Top-5 cast names
    cast = str(row.get("cast", ""))
    if cast and cast != "nan":
        for name in cast.split(","):
            n = name.strip()
            if n:
                n_clean = n.lower().replace(" ", "").replace("-", "")
                tokens.append("cast_" + n_clean)

    # Keywords from TMDB
    keywords_tmdb = str(row.get("keywords_tmdb", ""))
    if keywords_tmdb and keywords_tmdb != "nan":
        for k in keywords_tmdb.split():
            tokens.append("kw_" + k.lower())

    # Title + overview as raw text (semantic content)
    title = str(row.get("title", ""))
    overview = str(row.get("overview", ""))
    text_block = f"title {title} . plot {overview}"

    # Final document is tagged tokens + raw text
    return " ".join(tokens + [text_block])

In [23]:
def build_semantic_description(row):
    """
    Build a natural-language sentence/paragraph for SBERT:
    e.g. "Inception is an action and sci-fi film directed by Christopher Nolan,
         starring Leonardo DiCaprio. It includes themes like dream, heist. Plot summary: ..."
    """
    title = str(row.get("title", "Unknown title"))

    # Combine genres from MovieLens and TMDB
    genres_ml = [g for g in str(row.get("genres", "")).split("|") if g and g != "(no genres listed)"]
    genres_tmdb = [g for g in str(row.get("genres_tmdb", "")).split("|") if g]
    all_genres = list(dict.fromkeys(genres_ml + genres_tmdb))  # remove duplicates, keep order
    genres_text = ", ".join(all_genres) if all_genres else "unspecified genre"

    director = row.get("director", None)
    cast = row.get("cast", None)
    keywords_tmdb = row.get("keywords_tmdb", None)
    overview = row.get("overview", None)

    # Start description
    desc_parts = []
    base = f"{title} is a {genres_text} film"
    if director and str(director) != "nan":
        base += f" directed by {director}"
    if cast and str(cast) != "nan":
        base += f", starring {cast}"
    base += "."
    desc_parts.append(base)

    if keywords_tmdb and str(keywords_tmdb) != "nan":
        desc_parts.append(f"It includes themes like {keywords_tmdb}.")

    if overview and str(overview) != "nan":
        desc_parts.append(f"Plot summary: {overview}")

    return " ".join(desc_parts)

In [49]:
movies_full["tfidf_doc"] = movies_full.apply(build_movie_document, axis=1)
movies_full["sbert_desc"] = movies_full.apply(build_semantic_description, axis=1)
movies_full["release_year"] = movies_full.apply(get_release_year, axis=1)
movies_full[["title", "release_year", "tfidf_doc", "sbert_desc"]].head()

Unnamed: 0,title,release_year,tfidf_doc,sbert_desc
0,Toy Story (1995),1995.0,genre_adventure genre_animation genre_children...,"Toy Story (1995) is a Adventure, Animation, Ch..."
1,Jumanji (1995),1995.0,genre_adventure genre_children genre_fantasy g...,"Jumanji (1995) is a Adventure, Children, Fanta..."
2,Grumpier Old Men (1995),1995.0,genre_comedy genre_romance genre_romance genre...,"Grumpier Old Men (1995) is a Comedy, Romance f..."
3,Waiting to Exhale (1995),1995.0,genre_comedy genre_drama genre_romance genre_c...,"Waiting to Exhale (1995) is a Comedy, Drama, R..."
4,Father of the Bride Part II (1995),1995.0,genre_comedy genre_comedy genre_family tag_pre...,Father of the Bride Part II (1995) is a Comedy...


# Train/Test split on RATINGS

Training Set: all ratings before 2017-01-01.

Test Set (Cold): all ratings for items with release_year >= 2017 from 2017-01-01 onward.

In [50]:
print("Min rating time:", MovieRating["timestamp"].min())
print("Max rating time:", MovieRating["timestamp"].max())

Min rating time: 828124615
Max rating time: 1537799250


In [52]:
for year in range(2005, 2021):
    start = pd.Timestamp(f"{year}-01-01").timestamp() # Convert to Unix timestamp
    end   = pd.Timestamp(f"{year+1}-01-01").timestamp() # Convert to Unix timestamp
    cnt = MovieRating[(MovieRating["timestamp"] >= start) &
                      (MovieRating["timestamp"] < end)].shape[0]
    print(year, cnt)

2005 5813
2006 4059
2007 7114
2008 4351
2009 4158
2010 2301
2011 1690
2012 4656
2013 1664
2014 1439
2015 6616
2016 6703
2017 8198
2018 6418
2019 0
2020 0


In [80]:
# Define cold items by release year
cold_item_mask = movies_full["release_year"].notna() & (movies_full["release_year"] >= 2014)
cold_item_ids  = set(movies_full.index[cold_item_mask])

print("Number of cold items (release_year >= 2014):", len(cold_item_ids))

# Training ratings: before cutoff_date (1996–2016). We can include all items
train_ratings = MovieRating[~MovieRating["movieId"].isin(cold_item_ids)].copy()

# Test ratings: for cold items, from cutoff_date onward
test_ratings = MovieRating[MovieRating["movieId"].isin(cold_item_ids)].copy()

print("Train ratings:", train_ratings.shape)
print("Test ratings :", test_ratings.shape)

# Only positive ratings are used for preference modeling / ground truth
positive_train = train_ratings[train_ratings["rating"] >= 4.0].copy()
positive_test  = test_ratings[test_ratings["rating"] >= 4.0].copy()

print("Positive train:", positive_train.shape)
print("Positive test :", positive_test.shape)

Number of cold items (release_year >= 2014): 975
Train ratings: (99496, 4)
Test ratings : (1340, 4)
Positive train: (47998, 4)
Positive test : (582, 4)


# TF-IDF path

## Build Item Profile Matrix

In [81]:
# Map each movieId to a row index in the TF-IDF matrix
movie_ids = movies_full.index.values
movieId_to_idx = {mid: idx for idx, mid in enumerate(movie_ids)}
idx_to_movieId = {idx: mid for mid, idx in movieId_to_idx.items()}

# TF-IDF corpus: one document string per movie
tfidf_corpus = movies_full["tfidf_doc"].fillna("").tolist()

tfidf_vectorizer = TfidfVectorizer(
    min_df=5, # ignore tokens that appear in fewer than 5 movies
    max_df=0.8, # ignore tokens that appear in more than 80% of movies
    ngram_range=(1, 2),
    norm='l2'
)

# item_tfidf: shape [num_movies, num_features]
item_tfidf = tfidf_vectorizer.fit_transform(tfidf_corpus)
print("TF-IDF item matrix shape:", item_tfidf.shape)

TF-IDF item matrix shape: (9742, 31176)


## Build User Profile Matrix

For each train user, I build a TF-IDF user vector as: weighted average of the item TF-IDF vectors for the movies they rated >= 4.

In [82]:
# Only use users who have at least one positive rating in train
train_user_ids = sorted(positive_train["userId"].unique())

# Map user index <-> userId
idx_to_userId = {idx: uid for idx, uid in enumerate(train_user_ids)}
userId_to_idx = {uid: idx for idx, uid in idx_to_userId.items()}

In [83]:
user_vectors_sparse = []

for uid in tqdm(train_user_ids, desc="Building TF-IDF user profiles"):
    ur = positive_train[positive_train["userId"] == uid]
    ur = ur[ur["movieId"].isin(movieId_to_idx.keys())]

    if ur.empty:
        zero_vec = csr_matrix((1, item_tfidf.shape[1]))
        user_vectors_sparse.append(zero_vec)
        continue

    # Convert movieIds to row indices in item_tfidf
    movie_indices = [movieId_to_idx[mid] for mid in ur["movieId"].values]
    ratings = ur["rating"].values.reshape(-1, 1) # shape [n_ratings, 1]

    # Sub-matrix for items the user liked
    item_sub = item_tfidf[movie_indices] # shape [n_ratings, n_features]

    # Weight each movie vector by its rating (5*vector more than 4*vector)
    weighted_item_sub = item_sub.multiply(ratings)

    # Weighted average = sum(weighted vectors) / sum(ratings)
    user_vec = weighted_item_sub.sum(axis=0) / ratings.sum()

    # Convert the numpy.matrix result back to a csr_matrix for vstack compatibility
    user_vectors_sparse.append(csr_matrix(user_vec))

# Stack all user vectors into one big sparse matrix
user_profile_tfidf = vstack(user_vectors_sparse)

# L2-normalize user vectors (so cosine similarity = dot product)
user_profile_tfidf = normalize(user_profile_tfidf, norm='l2', axis=1)

print("User TF-IDF profile matrix shape:", user_profile_tfidf.shape)

Building TF-IDF user profiles:   0%|          | 0/609 [00:00<?, ?it/s]

User TF-IDF profile matrix shape: (609, 31176)


# SBERT path via all-MiniLM-L6-v2 model

## Build Item Embedding Matrix

In [84]:
# Load all-MiniLM-L6-v2 SBERT model
sbert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# SBERT corpus: one semantic description per movie
sbert_corpus = movies_full["sbert_desc"].fillna("").tolist()

# item_sbert: dense array [num_movies, embedding_dim]
item_sbert = sbert_model.encode(sbert_corpus, show_progress_bar=True)
item_sbert = normalize(item_sbert, norm='l2', axis=1) # L2-normalize by row

Batches:   0%|          | 0/305 [00:00<?, ?it/s]

In [85]:
print("SBERT item embedding matrix shape:", item_sbert.shape)

SBERT item embedding matrix shape: (9742, 384)


## Build User Profile Matrix

In [86]:
emb_dim = item_sbert.shape[1]

# Initialize user_profile_sbert as zeros
# shape: [num_train_users, embedding_dim]
user_profile_sbert = np.zeros((len(train_user_ids), emb_dim), dtype=np.float32)

In [87]:
# For each user, again do a weighted average of item embeddings
for idx_u, uid in enumerate(tqdm(train_user_ids, desc="Building SBERT user profiles")):
    ur = positive_train[positive_train["userId"] == uid]
    ur = ur[ur["movieId"].isin(movieId_to_idx.keys())]

    if ur.empty:
        continue

    movie_indices = [movieId_to_idx[mid] for mid in ur["movieId"].values]
    ratings = ur["rating"].values.astype(np.float32)

    item_embeds = item_sbert[movie_indices]  # [n_ratings, emb_dim]

    # Multiply each embedding by the rating
    weighted = (item_embeds.T * ratings).T
    user_vec = weighted.sum(axis=0) / ratings.sum()

    user_profile_sbert[idx_u] = user_vec

# L2-normalize user SBERT profiles
user_profile_sbert = normalize(user_profile_sbert, norm='l2', axis=1)

Building SBERT user profiles:   0%|          | 0/609 [00:00<?, ?it/s]

In [88]:
print("SBERT user profile matrix shape:", user_profile_sbert.shape)

SBERT user profile matrix shape: (609, 384)


## Build FAISS index over SBERT user profiles

In [89]:
# FAISS index for inner product (works as cosine similarity if vectors are normalized)
d = emb_dim
index = faiss.IndexFlatIP(d)

# Add user vectors to FAISS index
index.add(user_profile_sbert.astype(np.float32))

print("FAISS index built with", index.ntotal, "user vectors.")

FAISS index built with 609 user vectors.


# Inference functions

Given a NEW cold movie, find top-K users

In [90]:
def find_target_users_for_new_item_tfidf(new_item_doc, k=20):
    """
    TF-IDF pipeline:
      1. Vectorize the new item document using the learned TfidfVectorizer.
      2. Compute similarity against all user TF-IDF profiles.
      3. Return top-K (userId, similarity_score).
    """
    # Step 1: get TF-IDF vector for this single document
    new_vec = tfidf_vectorizer.transform([new_item_doc])
    new_vec = normalize(new_vec, norm='l2', axis=1)

    # Step 2: cosine similarity = dot product (since everything is normalized)
    sims = new_vec @ user_profile_tfidf.T # shape [1, num_users]
    sims_array = np.asarray(sims.toarray()).ravel()

    # Step 3: sort users by similarity, descending, pick top-K
    topk_idx = sims_array.argsort()[::-1][:k]
    topk_scores = sims_array[topk_idx]
    topk_user_ids = [idx_to_userId[i] for i in topk_idx]

    return list(zip(topk_user_ids, topk_scores))

In [73]:
def find_target_users_for_new_item_sbert(new_item_desc, k=20):
    """
    SBERT pipeline:
      1. Encode the new item's natural-language description with SBERT.
      2. L2-normalize the embedding.
      3. Query FAISS to find top-K similar users.
    """
    # Step 1: encode description to SBERT embedding
    new_emb = sbert_model.encode([new_item_desc])
    new_emb = normalize(new_emb, norm='l2', axis=1).astype(np.float32)

    # Step 2: FAISS inner product search
    scores, indices = index.search(new_emb, k) # scores: [1,k], indices: [1,k]
    scores = scores[0]
    indices = indices[0]

    # Step 3: map row indices back to userIds
    topk_user_ids = [idx_to_userId[i] for i in indices]
    return list(zip(topk_user_ids, scores))

# Validation Framework for Cold-Start Performance

## Ranking metrics: Precision@K, Recall@K, NDCG@K

In [74]:
def precision_at_k(recommended, relevant, k):
    """
    Precision@K = (# of recommended users in top-K who are actually relevant)/ K
    recommended: ranked list of userIds from your model
    relevant: set of userIds who truly liked this movie (rating>=4 in test)
    """
    if k == 0:
        return 0.0
    rec_k = recommended[:k]
    hits = sum(1 for u in rec_k if u in relevant)
    return hits / k

In [75]:
def recall_at_k(recommended, relevant, k):
    """
    Recall@K = (# of recommended users in top-K who are actually relevant)
    / (# of all relevant users)
    """
    if len(relevant) == 0:
        return 0.0
    rec_k = recommended[:k]
    hits = sum(1 for u in rec_k if u in relevant)
    return hits / len(relevant)

In [76]:
def ndcg_at_k(recommended, relevant, k):
    """
    NDCG@K (Normalized Discounted Cumulative Gain) with binary relevance:
      - relevant user = gain 1
      - non-relevant user = gain 0
      - gains are discounted by log2(rank+1)
    """
    rec_k = recommended[:k]

    # DCG: sum over rank positions
    dcg = 0.0
    for i, u in enumerate(rec_k):
        rel_i = 1.0 if u in relevant else 0.0
        if rel_i > 0:
            dcg += rel_i / math.log2(i + 2)  # log2(rank+1)

    # IDCG: best possible DCG if we ranked all relevant users at the top
    ideal_hits = min(len(relevant), k)
    idcg = sum(1.0 / math.log2(i + 2) for i in range(ideal_hits))

    if idcg == 0:
        return 0.0
    return dcg / idcg

## Choose which cold movies to evaluate

Use only movies with at least "min_pos" positive test ratings so metrics are more stable.

In [91]:
min_pos = 2  # minimum number of positive test ratings per movie used in eval

# Count how many positive test ratings each cold movie has
movie_pos_counts = positive_test.groupby("movieId").size()

# Keep only movies with >= min_pos positives
good_cold_movies = movie_pos_counts[movie_pos_counts >= min_pos].index

# Filter test_positive_movies down to those "good" cold movies
test_positive_movies = positive_test[positive_test["movieId"].isin(good_cold_movies)]

# The list of movieIds we will evaluate on
movie_ids_eval = sorted(test_positive_movies["movieId"].unique())
print("Cold movies used for eval (>= min_pos positives):", len(movie_ids_eval))
print("Total positive test interactions used:", test_positive_movies.shape[0])

Cold movies used for eval (>= min_pos positives): 48
Total positive test interactions used: 551


## Main evaluation loop: TF-IDF vs SBERT on cold-start items

In [92]:
K = 20  # top-K users to retrieve for each cold movie

results = {
    "tfidf_prec": [],
    "tfidf_rec": [],
    "tfidf_ndcg": [],
    "sbert_prec": [],
    "sbert_rec": [],
    "sbert_ndcg": []
}

# Loop over each cold movie in our evaluation set
for mid in tqdm(movie_ids_eval, desc="Evaluating cold-start movies"):
    # "Relevant" users = those who gave this movie rating >= 4 in test set
    rel_users = set(
        test_positive_movies[test_positive_movies["movieId"] == mid]["userId"].values
    )
    if len(rel_users) == 0:
        # Safety check; should not happen if we enforced min_pos >= 1
        continue

    row = movies_full.loc[mid]
    new_doc = row["tfidf_doc"]
    new_desc = row["sbert_desc"]

    # Get top-K users from both models
    tfidf_topk = find_target_users_for_new_item_tfidf(new_doc, k=K)
    sbert_topk = find_target_users_for_new_item_sbert(new_desc, k=K)

    # Extract only the userIds in ranking order
    tfidf_users_ranked = [u for (u, s) in tfidf_topk]
    sbert_users_ranked = [u for (u, s) in sbert_topk]

    # Compute metrics for this movie
    p_t = precision_at_k(tfidf_users_ranked, rel_users, K)
    r_t = recall_at_k(tfidf_users_ranked, rel_users, K)
    n_t = ndcg_at_k(tfidf_users_ranked, rel_users, K)

    p_s = precision_at_k(sbert_users_ranked, rel_users, K)
    r_s = recall_at_k(sbert_users_ranked, rel_users, K)
    n_s = ndcg_at_k(sbert_users_ranked, rel_users, K)

    # Save to results list
    results["tfidf_prec"].append(p_t)
    results["tfidf_rec"].append(r_t)
    results["tfidf_ndcg"].append(n_t)

    results["sbert_prec"].append(p_s)
    results["sbert_rec"].append(r_s)
    results["sbert_ndcg"].append(n_s)

Evaluating cold-start movies:   0%|          | 0/48 [00:00<?, ?it/s]

In [93]:
def avg(x):
    """Helper to safely get the mean (0.0 if list empty)."""
    return float(np.mean(x)) if len(x) > 0 else 0.0

print(f"Evaluated cold-start movies: {len(results['tfidf_prec'])}")

print("=== TF-IDF approach (release_year cold-start) ===")
print(f"Precision@{K}: {avg(results['tfidf_prec']):.4f}")
print(f"Recall@{K}   : {avg(results['tfidf_rec']):.4f}")
print(f"NDCG@{K}     : {avg(results['tfidf_ndcg']):.4f}")

print("=== SBERT approach (release_year cold-start) ===")
print(f"Precision@{K}: {avg(results['sbert_prec']):.4f}")
print(f"Recall@{K}   : {avg(results['sbert_rec']):.4f}")
print(f"NDCG@{K}     : {avg(results['sbert_ndcg']):.4f}")

Evaluated cold-start movies: 48
=== TF-IDF approach (release_year cold-start) ===
Precision@20: 0.0833
Recall@20   : 0.2117
NDCG@20     : 0.1487
=== SBERT approach (release_year cold-start) ===
Precision@20: 0.0229
Recall@20   : 0.0265
NDCG@20     : 0.0301
