In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast

# 1. Data Exploration and Preprocessing


In [2]:
# Load the dataset
credits = pd.read_csv('data/credits.csv')
keywords = pd.read_csv('data/keywords.csv')
links = pd.read_csv('data/links.csv')
movies_metadata = pd.read_csv('data/movies_metadata.csv')
ratings = pd.read_csv('data/ratings.csv')
links_small = pd.read_csv('data/links_small.csv')

  movies_metadata = pd.read_csv('data/movies_metadata.csv')


## Analyzing Datasets

In [34]:
print("Movies Metadata:")
print(movies_metadata.info())
print(movies_metadata.head(), "\n")

Movies Metadata:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null

In [35]:
print("Credits:")
print(credits.info())
print(credits.head(), "\n")

Credits:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB
None
                                                cast  \
0  [{'cast_id': 14, 'character': 'Woody (voice)',...   
1  [{'cast_id': 1, 'character': 'Alan Parrish', '...   
2  [{'cast_id': 2, 'character': 'Max Goldman', 'c...   
3  [{'cast_id': 1, 'character': "Savannah 'Vannah...   
4  [{'cast_id': 1, 'character': 'George Banks', '...   

                                                crew     id  
0  [{'credit_id': '52fe4284c3a36847f8024f49', 'de...    862  
1  [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...   8844  
2  [{'credit_id': '52fe466a9251416c75077a89', 'de...  15602  
3  [{'credit_id': '52fe44779251416c91011acb', 'de...  31357  
4

In [36]:
print("Keywords:")
print(keywords.info())
print(keywords.head(), "\n")

Keywords:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int64 
 1   keywords  46419 non-null  object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB
None
      id                                           keywords
0    862  [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...
1   8844  [{'id': 10090, 'name': 'board game'}, {'id': 1...
2  15602  [{'id': 1495, 'name': 'fishing'}, {'id': 12392...
3  31357  [{'id': 818, 'name': 'based on novel'}, {'id':...
4  11862  [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n... 



In [37]:
print("Ratings:")
print(ratings.info())
print(ratings.head(), "\n")

Ratings:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 794.2 MB
None
   userId  movieId  rating   timestamp
0       1      110     1.0  1425941529
1       1      147     4.5  1425942435
2       1      858     5.0  1425941523
3       1     1221     5.0  1425941546
4       1     1246     5.0  1425941556 



## Data Preprocessing

### convert IDs to numeric values to ensure that they are consistent

In [38]:
# ensure that all IDs are numeric
credits['id']  = pd.to_numeric(credits['id'],  errors='coerce')
keywords['id'] = pd.to_numeric(keywords['id'], errors='coerce')
links['tmdbId'] = pd.to_numeric(links['tmdbId'], errors='coerce')
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'], errors='coerce')

In [39]:
# ensure that all IDs are numeric
ratings['userId']  = pd.to_numeric(ratings['userId'],  errors='coerce')
ratings['movieId'] = pd.to_numeric(ratings['movieId'], errors='coerce')
ratings['rating']  = pd.to_numeric(ratings['rating'],  errors='coerce')

### drop rows with missing IDs

In [40]:
# drop rows with missing IDs
movies_metadata.dropna(subset=['id'], inplace=True)
credits.dropna(subset=['id'], inplace=True)
keywords.dropna(subset=['id'], inplace=True)
links.dropna(subset=['tmdbId'], inplace=True)
ratings.dropna(subset=['movieId'], inplace=True)

### drop rows with duplicate IDs


In [41]:
movies_metadata.drop_duplicates(subset='id', keep='first', inplace=True)
credits.drop_duplicates(subset='id', keep='first', inplace=True)
keywords.drop_duplicates(subset='id', keep='first', inplace=True)
links.drop_duplicates(subset='tmdbId', keep='first', inplace=True)
#a user might rate movie multiple times but we keep the first rating
ratings.drop_duplicates(subset=['userId', 'movieId'], keep='first', inplace=True)


### preprocess movies metadata


In [42]:
def parse_jason(json_str):
    try:
        return ast.literal_eval(json_str) if pd.notnull(json_str) else None
    except ValueError:
        return None

##### drop columns with more than 50% missing values
since missing values are less than 1% we can drop them



In [43]:
#show missing percentage
missing_movies_metadata = movies_metadata.isnull().sum()/len(movies_metadata)*100
print(missing_movies_metadata)

adult                     0.000000
belongs_to_collection    90.121718
budget                    0.000000
genres                    0.000000
homepage                 82.889089
id                        0.000000
imdb_id                   0.037418
original_language         0.024211
original_title            0.000000
overview                  2.099795
popularity                0.006603
poster_path               0.849603
production_companies      0.006603
production_countries      0.006603
release_date              0.191491
revenue                   0.006603
runtime                   0.572271
spoken_languages          0.006603
status                    0.184888
tagline                  55.096516
title                     0.006603
video                     0.006603
vote_average              0.006603
vote_count                0.006603
dtype: float64


In [44]:
movies_metadata.drop(columns=['homepage', 'belongs_to_collection','tagline'], inplace=True)
movies_metadata.dropna(inplace=True)

#### handle json columns


In [45]:
json_columns = ["genres", "spoken_languages", "production_companies", "production_countries"]

for column in json_columns:
    movies_metadata[column] = movies_metadata[column].apply(
        lambda x: [g["name"] for g in parse_jason(x)] if isinstance(parse_jason(x), list) else []
    )

#### ensure correct data types

In [46]:
movies_metadata['budget'] = pd.to_numeric(movies_metadata['budget'], errors='coerce')
movies_metadata['revenue'] = pd.to_numeric(movies_metadata['revenue'], errors='coerce')
movies_metadata['popularity'] = pd.to_numeric(movies_metadata['popularity'], errors='coerce')
movies_metadata['vote_average'] = pd.to_numeric(movies_metadata['vote_average'], errors='coerce')
movies_metadata['vote_count'] = pd.to_numeric(movies_metadata['vote_count'], errors='coerce')
movies_metadata['release_date'] = pd.to_datetime(movies_metadata['release_date'], errors='coerce')
movies_metadata['runtime'] = pd.to_numeric(movies_metadata['runtime'], errors='coerce')

#### save the preprocessed data to csv

In [47]:
movies_metadata.to_csv('movies_metadata_preprocessed.csv', index=False)

### preprocess credits

#### there is no missing values in the credits dataset

In [48]:
missing_percentage = credits.isnull().sum() / len(credits) * 100
print(missing_percentage)

cast    0.0
crew    0.0
id      0.0
dtype: float64


#### handle json columns

In [49]:
json_columns = ["cast", "crew"]
for column in json_columns:
    credits[column] = credits[column].apply(
        lambda x: [g["name"] for g in parse_jason(x)] if isinstance(parse_jason(x), list) else []
    )

In [50]:
credits.to_csv("cleaned_movie_credits.csv", index=False)
credits.head()

Unnamed: 0,cast,crew,id
0,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Joss Whedon, Andrew Stanton, J...",862
1,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Larry J. Franco, Jonathan Hensleigh, James Ho...",8844
2,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[Howard Deutch, Mark Steven Johnson, Mark Stev...",15602
3,"[Whitney Houston, Angela Bassett, Loretta Devi...","[Forest Whitaker, Ronald Bass, Ronald Bass, Ez...",31357
4,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[Alan Silvestri, Elliot Davis, Nancy Meyers, N...",11862


### preprocess keywords


#### there is no missing values in the keywords dataset


In [51]:
missing_percentage = keywords.isnull().sum() / len(keywords) * 100
print(missing_percentage)

id          0.0
keywords    0.0
dtype: float64


#### handle json columns

In [52]:
json_columns = ["keywords"]
for column in json_columns:
    keywords[column] = keywords[column].apply(
        lambda x: [g["name"] for g in parse_jason(x)] if isinstance(parse_jason(x), list) else []
    )


In [53]:
keywords.to_csv("cleaned_movie_keywords.csv", index=False)
keywords.head()

Unnamed: 0,id,keywords
0,862,"[jealousy, toy, boy, friendship, friends, riva..."
1,8844,"[board game, disappearance, based on children'..."
2,15602,"[fishing, best friend, duringcreditsstinger, o..."
3,31357,"[based on novel, interracial relationship, sin..."
4,11862,"[baby, midlife crisis, confidence, aging, daug..."


### preprocess links
#### links are clean so no change is needed

In [56]:
missing_percentage = links.isnull().sum() / len(links) * 100
print(missing_percentage)

movieId    0.0
imdbId     0.0
tmdbId     0.0
dtype: float64


### preprocess ratings


In [57]:
missing_percentage = ratings.isnull().sum() / len(ratings) * 100
print(missing_percentage)

userId       0.0
movieId      0.0
rating       0.0
timestamp    0.0
dtype: float64


#### ensure correct data types

In [58]:
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
ratings['rating'] = pd.to_numeric(ratings['rating'], errors='coerce')
ratings['userId'] = pd.to_numeric(ratings['userId'], errors='coerce')
ratings['movieId'] = pd.to_numeric(ratings['movieId'], errors='coerce')

In [59]:
ratings.to_csv("cleaned_ratings.csv", index=False)

# 2. Implement Recommender System

In [2]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import torch
import torch.nn as nn
import torch.optim as optim
import random
import os
import torch_directml



In [3]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


<torch._C.Generator at 0x1709d522dd0>

## Load and prepare the preprocessed data

### Load the preprocessed data

In [4]:
ratings_df = pd.read_csv("data/ratings_small.csv")
movies_metadata_df = pd.read_csv("movies_metadata_preprocessed.csv")
keywords_df = pd.read_csv("cleaned_movie_keywords.csv")
credits_df = pd.read_csv("cleaned_movie_credits.csv")

### Prepare the data
#### for computational efficiency, we will consider ratings more than 2 = 1 and under that zero

In [5]:
ratings_df['implicit'] = (ratings_df['rating'] >= 3).astype(int)
ratings_df.sort_values(by=['userId', 'timestamp'], inplace=True)
ratings_df.reset_index(drop=True, inplace=True)

### merge dataframes
#### change id to movieId in datasets for merging

In [6]:
keywords_df.rename(columns={'id':'movieId'}, inplace=True)
credits_df.rename(columns={'id':'movieId'}, inplace=True)
movies_metadata_df.rename(columns={'id':'movieId'}, inplace=True)


#### merge the datasets

In [7]:
movie_content_df = movies_metadata_df.merge(keywords_df, on='movieId', how='left')
movie_content_df = movie_content_df.merge(credits_df, on='movieId', how='left')


## Build User-Sequence Documents

### build user index mappings

In [8]:
unique_users = ratings_df['userId'].unique()
user2index = {u: i for i, u in enumerate(unique_users)}
index2user = {i: u for u, i in user2index.items()}
num_users = len(unique_users)

unique_items = ratings_df['movieId'].unique()
item2index = {m: i for i, m in enumerate(unique_items)}
index2item = {i: m for m, i in item2index.items()}
num_items = len(unique_items)

### build user sequences

#### make user sequences

In [9]:
user_sequences = {}
for row in ratings_df.itertuples():
    u = getattr(row, 'userId')
    i = getattr(row, 'movieId')
    user_sequences.setdefault(u, []).append(str(i))

tagged_docs = []
for u, item_list in user_sequences.items():
        tagged_docs.append(TaggedDocument(words=item_list, tags=[str(u)]))  # for each user interaction


### Train Doc2Vec Model for user sequences

In [10]:
d2v_vector_size = 40
d2v_window = 10
d2v_model = Doc2Vec(
    documents=tagged_docs,
    vector_size=d2v_vector_size,
    window=d2v_window,
    min_count=1,
    dm=0,
    epochs=10,
    workers=4
)

### Build user docvec and item wordvec documents

In [11]:
user_docvec_map = {}
for u in unique_users:
    user_docvec_map[u] = d2v_model.dv[str(u)]

item_wordvec_map = {}
for m in unique_items:
    if str(m) in d2v_model.wv:
        item_wordvec_map[m] = d2v_model.wv[str(m)]
    else:
        item_wordvec_map[m] = np.zeros(d2v_vector_size)


## Build item-content documents

In [12]:
def list_to_str(lst):
    if not isinstance(lst, list):
        return ""
    return " ".join(str(x) for x in lst)
safe_str = lambda x: x if pd.notnull(x) else ""

### Preprocess and combine fields


In [13]:
item_content_docs = []
for row in movie_content_df.itertuples(index=False):
    mid = row.movieId
    overview_text = safe_str(row.overview)

    keywords_text = list_to_str(eval(row.keywords)) if isinstance(row.keywords, str) else list_to_str(row.keywords)
    cast_text = list_to_str(eval(row.cast)) if isinstance(row.cast, str) else list_to_str(row.cast)
    crew_text = list_to_str(eval(row.crew)) if isinstance(row.crew, str) else list_to_str(row.crew)
    # merge them all
    combined_content = (
        overview_text + " " +
        keywords_text + " " +
        cast_text + " " +
        crew_text
    )
    item_content_docs.append(TaggedDocument(
        words=combined_content.lower().split(),
        tags=[f"ITEM_{int(mid)}"]
    ))


### Train Doc2Vec model for item content

In [14]:
content_vector_size = 40
item_content_model = Doc2Vec(
    documents=item_content_docs,
    vector_size=content_vector_size,
    window=10,
    min_count=1,
    dm=0,
    epochs=10,
    workers=4
)

item_content_map = {}
for row in movie_content_df.itertuples(index=False):
    mid = row.movieId
    # if it didn't appear in doc2vec, default to zeros
    tag_id = f"ITEM_{int(mid)}"
    if tag_id in item_content_model.dv:
        item_content_map[mid] = item_content_model.dv[tag_id]
    else:
        item_content_map[mid] = np.zeros(content_vector_size)


## Build the DSER Model
### Define GMF

In [15]:
class GMF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=8):
        super(GMF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        nn.init.xavier_uniform_(self.user_emb.weight)
        nn.init.xavier_uniform_(self.item_emb.weight)

    def forward(self, user_idx, item_idx):
        u = self.user_emb(user_idx)
        i = self.item_emb(item_idx)
        return u * i

### Define MLP

In [16]:
class MLP(nn.Module):
    def __init__(self, input_dim=120, layers=[64,32,16]):
        super(MLP, self).__init__()
        seq = []
        prev_dim = input_dim
        for layer_size in layers:
            seq.append(nn.Linear(prev_dim, layer_size))
            seq.append(nn.ReLU())
            prev_dim = layer_size
        self.mlp = nn.Sequential(*seq)

    def forward(self, user_doc, item_word, item_content):
        x = torch.cat((user_doc, item_word, item_content), dim=1)
        out = self.mlp(x)
        return out

### Define DSER

In [17]:
class DSER(nn.Module):
    def __init__(self,
                 num_users,
                 num_items,
                 gmf_emb_size=8,
                 mlp_input_dim=120,
                 mlp_layers=[64, 32, 16]):
        super(DSER, self).__init__()
        self.gmf = GMF(num_users, num_items, emb_size=gmf_emb_size)
        self.mlp = MLP(input_dim=mlp_input_dim, layers=mlp_layers)

        final_dim = gmf_emb_size + mlp_layers[-1]
        self.out = nn.Linear(final_dim, 1)
        nn.init.xavier_uniform_(self.out.weight)

    def forward(self,
                user_idx, item_idx,
                user_docvec, item_wordvec, item_contentvec):
        # GMF part
        gmf_vec = self.gmf(user_idx, item_idx)
        # MLP part
        mlp_vec = self.mlp(user_docvec, item_wordvec, item_contentvec)

        # Fusion
        concat = torch.cat([gmf_vec, mlp_vec], dim=1)
        logit = self.out(concat).squeeze(-1)
        return torch.sigmoid(logit)


### Train/Test split with navigate sampeling

#### shuffle and split the datas 80/20

In [18]:
all_data = ratings_df[['userId','movieId','rating']].drop_duplicates()
all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)
split_idx = int(0.8 * len(all_data))
train_df = all_data.iloc[:split_idx]
test_df  = all_data.iloc[split_idx:]


In [19]:
all_ui_set = set(zip(all_data['userId'], all_data['movieId']))

len_ui_set = len(all_ui_set)
print("Size of all_ui_set:", len_ui_set)

Size of all_ui_set: 100004


#### negative sampling
#### store all user-item pairs in a set to avoid duplicating

In [20]:
all_ui_set = set(zip(all_data['userId'], all_data['movieId']))
train_pos = train_df[train_df['rating']==1]

#### create negative samples
#### sample 4 negative samples for each positive sample

In [21]:
train_instances = []
neg_ratio = 4
for row in train_pos.itertuples(index=False):
    u = row.userId
    m = row.movieId
    # positive samples
    train_instances.append((u, m, 1))
    # negative samples
    for _ in range(neg_ratio):
        neg_item = random.choice(unique_items)
        while (u, neg_item) in all_ui_set:
            neg_item = random.choice(unique_items)
        train_instances.append((u, neg_item, 0))

train_np = np.array(train_instances, dtype=np.int64)

### Set the gpu device
#### if your device has additional graphic, better switch to it

In [22]:
num_gpus = torch_directml.device_count()
for i in range(num_gpus):
    print(f"GPU {i}: {torch_directml.device_name(i)}")

GPU 0: AMD Radeon RX 6800S 
GPU 1: AMD Radeon(TM) Graphics 


In [23]:
device = torch_directml.device(0)

### train DSER Model
#### Hyperparameters

In [24]:
gmf_emb_size = 8
mlp_input_dim = 40 + 40 + 40
mlp_layers = [64, 32, 16]

model = DSER(num_users=num_users,
             num_items=num_items,
             gmf_emb_size=gmf_emb_size,
             mlp_input_dim=mlp_input_dim,
             mlp_layers=mlp_layers).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.BCELoss()



### Training Loop

In [25]:
EPOCHS = 10
BATCH_SIZE = 512

for ep in range(EPOCHS):
    np.random.shuffle(train_np)
    epoch_loss = 0.0
    model.train()

    for start in range(0, len(train_np), BATCH_SIZE):
        end = min(start + BATCH_SIZE, len(train_np))
        batch = train_np[start:end]

        user_list = batch[:, 0]
        item_list = batch[:, 1]
        label_list = batch[:, 2]

        # make torch Tensors
        user_idx = torch.LongTensor([user2index[u] for u in user_list]).to(device)
        item_idx = torch.LongTensor([item2index[i] for i in item_list]).to(device)
        y_batch  = torch.FloatTensor(label_list).to(device)

        # build doc2vec input
        user_vecs = [user_docvec_map[u] for u in user_list]
        item_seq_vecs = [item_wordvec_map[i] for i in item_list]
        default_content_vec = np.zeros(content_vector_size, dtype=np.float32)
        item_cnt_vecs = []
        for i in item_list:
            if i in item_content_map:
                item_cnt_vecs.append(item_content_map[i])
            else:
                item_cnt_vecs.append(default_content_vec)

        user_vec_t = torch.FloatTensor(user_vecs).to(device)
        item_seq_t = torch.FloatTensor(item_seq_vecs).to(device)
        item_cnt_t = torch.FloatTensor(item_cnt_vecs).to(device)

        optimizer.zero_grad()
        predictions = model(user_idx, item_idx, user_vec_t, item_seq_t, item_cnt_t)
        loss = criterion(predictions, y_batch)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {ep+1}/{EPOCHS}, Loss = {epoch_loss:.4f}")


  user_vec_t = torch.FloatTensor(user_vecs).to(device)
  return torch._C._nn.binary_cross_entropy(input, target, weight, reduction_enum)


Epoch 1/10, Loss = 13.6784
Epoch 2/10, Loss = 12.5869
Epoch 3/10, Loss = 10.9542
Epoch 4/10, Loss = 6.7213
Epoch 5/10, Loss = 2.9933
Epoch 6/10, Loss = 1.4208
Epoch 7/10, Loss = 0.7933
Epoch 8/10, Loss = 0.4954
Epoch 9/10, Loss = 0.3356
Epoch 10/10, Loss = 0.2405


### Recommend for user function

In [26]:
def recommend_for_user(user_id, topK=5):
    model.eval()
    if user_id not in user_docvec_map:
        print(f"No user embedding for {user_id}. Returning empty.")
        return []

    user_idx = user2index[user_id]
    user_doc = user_docvec_map[user_id]
    user_doc_t = torch.FloatTensor(user_doc.copy()).unsqueeze(0).to(device)

    default_content_vec = np.zeros(content_vector_size, dtype=np.float32)

    scores = []
    for it in unique_items:
        it_idx = item2index[it]

        item_seq_vec = item_wordvec_map[it]
        if it in item_content_map:
            item_cnt_vec = item_content_map[it]
        else:
            item_cnt_vec = default_content_vec

        u_batch = torch.LongTensor([user_idx]).to(device)
        i_batch = torch.LongTensor([it_idx]).to(device)

        item_seq_t = torch.FloatTensor(item_seq_vec.copy()).unsqueeze(0).to(device)
        item_cnt_t = torch.FloatTensor(item_cnt_vec.copy()).unsqueeze(0).to(device)

        with torch.no_grad():
            score = model(u_batch, i_batch, user_doc_t, item_seq_t, item_cnt_t).item()

        scores.append((it, score))

    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:topK]




In [28]:
id2title = dict(zip(movies_metadata_df['movieId'], movies_metadata_df['title']))
some_user_id = unique_users[5]
print(f"Top-5 items for user {some_user_id}:")
top_items = recommend_for_user(some_user_id, topK=5)
for movie_id, s in top_items:
    movie_title = id2title.get(movie_id, "Unknown Title")
    print(f"MovieID={movie_id} | Title={movie_title} | Score={s:.4f}")

Top-5 items for user 6:
MovieID=1562 | Title=28 Weeks Later | Score=0.9989
MovieID=3977 | Title=Unknown Title | Score=0.9969
MovieID=2710 | Title=Unknown Title | Score=0.9959
MovieID=1917 | Title=Who Killed Bambi? | Score=0.9950
MovieID=19 | Title=Metropolis | Score=0.9941


# Evaluate Recommender System

In [31]:
import math


def evaluate_recommender(model, device,
                  test_df, user2index, item2index,
                  user_docvec_map, item_wordvec_map, item_content_map,
                  K=10,
                  content_vector_size=40):
    model.eval()
    all_ui_set = set(zip(test_df['userId'], test_df['movieId']))

    hits_sum = 0.0
    ndcgs_sum= 0.0
    total_count = 0
    for row in test_df.itertuples(index=False):
        u, item, implicit_val = row.userId, row.movieId, row.implicit
        if implicit_val != 1:
            continue

        total_count += 1
        neg_candidates = []
        tries = 0
        while len(neg_candidates) < 19 and tries<1000:
            neg_item = random.choice(list(item2index.keys()))
            if neg_item == item:
                tries +=1
                continue
            if (u, neg_item) not in all_ui_set:
                neg_candidates.append(neg_item)
            tries +=1

        test_items = [item] + neg_candidates

        user_idx = torch.LongTensor([user2index[u]]).to(device)
        user_doc = user_docvec_map[u].copy()

        user_doc_t = torch.FloatTensor(user_doc).unsqueeze(0).to(device)

        scores_list = []
        for itm in test_items:
            i_idx = torch.LongTensor([item2index[itm]]).to(device)

            seq_vec = item_wordvec_map[itm].copy()
            item_seq_t = torch.FloatTensor(seq_vec).unsqueeze(0).to(device)

            if itm in item_content_map:
                cnt_vec = item_content_map[itm].copy()
            else:
                cnt_vec = np.zeros(content_vector_size, dtype=np.float32)
            item_cnt_t = torch.FloatTensor(cnt_vec).unsqueeze(0).to(device)

            with torch.no_grad():
                score = model(user_idx, i_idx,
                              user_doc_t, item_seq_t, item_cnt_t).item()

            scores_list.append((itm, score))

        scores_list.sort(key=lambda x:x[1], reverse=True)

        rank = 0
        for idx,(itm, sc) in enumerate(scores_list):
            if itm == item:
                rank = idx
                break


        if rank < K:
            hits_sum += 1.0
            ndcgs_sum += 1.0 / math.log2(rank+2)
    hr = hits_sum / total_count if total_count>0 else 0
    ndcg = ndcgs_sum / total_count if total_count>0 else 0
    return hr, ndcg

In [32]:
test_df = test_df.copy()
if 'rating' in test_df.columns and 'implicit' not in test_df.columns:
    test_df['implicit'] = (test_df['rating'] >= 3).astype(int)

hr10, ndcg10 = evaluate_recommender(
    model=model,
    device=device,
    test_df=test_df,
    user2index=user2index,
    item2index=item2index,
    user_docvec_map=user_docvec_map,
    item_wordvec_map=item_wordvec_map,
    item_content_map=item_content_map,
    K=10,  # Evaluate top-10
    content_vector_size=40
)

print(f"HR@10={hr10:.4f}, NDCG@10={ndcg10:.4f}")

HR@10=0.5816, NDCG@10=0.2934
