In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

np.random.seed(123)

In [8]:
movies = pd.read_csv('data/merge_data_updated_movies.csv')
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  62316 non-null  float64
 1   title    62423 non-null  object 
 2   genres   62423 non-null  object 
dtypes: float64(1), object(2)
memory usage: 1.4+ MB


In [9]:
# Check for missing values in 'movieId' column
missing_values = movies['movieId'].isna().sum()

# Check for infinite values in 'movieId' column
infinite_values = movies['movieId'].isin([np.inf, -np.inf]).sum()

print("Number of missing values in 'movieId' column:", missing_values)
print("Number of infinite values in 'movieId' column:", infinite_values)

Number of missing values in 'movieId' column: 107
Number of infinite values in 'movieId' column: 0


In [10]:
movies = movies.dropna(subset=['movieId'])  # Drop rows with missing 'movieId'
movies = movies.replace([np.inf, -np.inf], np.nan).dropna(subset=['movieId'])  # Drop rows with infinite 'movieId'

In [11]:
# Check for missing values in 'movieId' column
missing_values = movies['movieId'].isna().sum()

# Check for infinite values in 'movieId' column
infinite_values = movies['movieId'].isin([np.inf, -np.inf]).sum()

print("Number of missing values in 'movieId' column:", missing_values)
print("Number of infinite values in 'movieId' column:", infinite_values)

Number of missing values in 'movieId' column: 0
Number of infinite values in 'movieId' column: 0


In [3]:
ratings = pd.read_csv('data/ratings_7jt.csv', parse_dates=['timestamp'])
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7510219 entries, 0 to 7510218
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 229.2+ MB


  ratings = pd.read_csv('data/ratings_7jt.csv', parse_dates=['timestamp'])


In [4]:
# Check for missing values in 'movieId' column
missing_values = ratings['movieId'].isna().sum()

# Check for infinite values in 'movieId' column
infinite_values = ratings['movieId'].isin([np.inf, -np.inf]).sum()

print("Number of missing values in 'movieId' column:", missing_values)
print("Number of infinite values in 'movieId' column:", infinite_values)

Number of missing values in 'movieId' column: 0
Number of infinite values in 'movieId' column: 0


In [5]:
ratings['userId'] = ratings['userId'].astype(str)
a = ratings['userId'][0]
ratings['movieId'] = ratings['movieId'].astype(int)
b = ratings['movieId'][0]
print(type(a))
print(type(b))

<class 'str'>
<class 'numpy.int32'>


In [6]:
user_ids = ratings["userId"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}

print(user2user_encoded)
print(userencoded2user)

ratings["user"] = ratings["userId"].map(user2user_encoded)

{'1': 0, '3': 1, '7': 2, '14': 3, '24': 4, '26': 5, '27': 6, '29': 7, '32': 8, '33': 9, '41': 10, '45': 11, '46': 12, '48': 13, '49': 14, '50': 15, '55': 16, '58': 17, '59': 18, '65': 19, '67': 20, '73': 21, '74': 22, '77': 23, '80': 24, '81': 25, '82': 26, '86': 27, '88': 28, '93': 29, '100': 30, '101': 31, '104': 32, '105': 33, '106': 34, '110': 35, '114': 36, '118': 37, '119': 38, '120': 39, '124': 40, '125': 41, '129': 42, '131': 43, '137': 44, '140': 45, '142': 46, '143': 47, '144': 48, '147': 49, '150': 50, '152': 51, '158': 52, '162': 53, '164': 54, '168': 55, '170': 56, '177': 57, '178': 58, '182': 59, '188': 60, '189': 61, '193': 62, '196': 63, '203': 64, '210': 65, '214': 66, '219': 67, '225': 68, '230': 69, '237': 70, '239': 71, '242': 72, '243': 73, '249': 74, '251': 75, '252': 76, '258': 77, '262': 78, '263': 79, '264': 80, '269': 81, '270': 82, '271': 83, '273': 84, '276': 85, '278': 86, '281': 87, '282': 88, '288': 89, '290': 90, '291': 91, '292': 92, '298': 93, '301': 9

In [None]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from tqdm import tqdm


# Randomly select 30% of the users
# rand_userIds = np.random.choice(ratings['user'].unique(), 
#                                 size=int(len(ratings['user'].unique())*0.3), 
#                                 replace=False)
# ratings = ratings.loc[ratings['user'].isin(rand_userIds)]

# print('There are {} rows of data from {} users'.format(len(ratings), len(rand_userIds)))

ratings['rank_latest'] = ratings.groupby(['user'])['timestamp'] \
                                .rank(method='first', ascending=False)

train_ratings = ratings[ratings['rank_latest'] != 1]
test_ratings = ratings[ratings['rank_latest'] == 1]

# Drop columns that we no longer need
train_ratings = train_ratings[['user', 'movieId', 'rating']]
test_ratings = test_ratings[['user', 'movieId', 'rating']]

train_ratings.loc[:, 'rating'] = 1

# Get a list of all movie IDs
all_movieIds = ratings['movieId'].unique()

# Placeholders that will hold the training data
users, items, labels = [], [], []

# This is the set of items that each user has interaction with
user_item_set = set(zip(train_ratings['user'], train_ratings['movieId']))

# 4:1 ratio of negative to positive samples
num_negatives = 4

for (u, i) in tqdm(user_item_set):
    users.append(u)
    items.append(i)
    labels.append(1) # items that the user has interacted with are positive
    for _ in range(num_negatives):
        # randomly select an item
        negative_item = np.random.choice(all_movieIds) 
        # check that the user has not interacted with this item
        while (u, negative_item) in user_item_set:
            negative_item = np.random.choice(all_movieIds)
        users.append(u)
        items.append(negative_item)
        labels.append(0) # items not interacted with are negative

class MovieLensTrainDataset(Dataset):
    """MovieLens PyTorch Dataset for Training
    
    Args:
        ratings (pd.DataFrame): Dataframe containing the movie ratings
        all_movieIds (list): List containing all movieIds
    """

    def __init__(self, ratings, all_movieIds):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_movieIds)

    def __len__(self):
        return len(self.users)
  
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_movieIds):
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['user'], ratings['movieId']))

        num_negatives = 4
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                negative_item = np.random.choice(all_movieIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

class NCF(pl.LightningModule):
    """ Neural Collaborative Filtering (NCF)
    
        Args:
            num_users (int): Number of unique users
            num_items (int): Number of unique items
            ratings (pd.DataFrame): Dataframe containing the movie ratings for training
            all_movieIds (list): List containing all movieIds (train + test)
    """
    
    def __init__(self, num_users, num_items, ratings, all_movieIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings
        self.all_movieIds = all_movieIds
        
    def forward(self, user_input, item_input):
        
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layer
        vector = nn.ReLU()(self.fc1(vector))
        vector = nn.ReLU()(self.fc2(vector))

        # Output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred
    
    def training_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        return DataLoader(MovieLensTrainDataset(self.ratings, self.all_movieIds),
                          batch_size=512, num_workers=0)  # Set num_workers to 0

num_users = ratings['user'].max()+1
num_items = ratings['movieId'].max()+1

all_movieIds = ratings['movieId'].unique()

model = NCF(num_users, num_items, train_ratings, all_movieIds)

trainer = pl.Trainer(
    max_epochs=5, 
    devices=1, 
    reload_dataloaders_every_n_epochs=1,  # This replaces reload_dataloaders_every_epoch
    enable_progress_bar=True,  # Replaces progress_bar_refresh_rate
    logger=False, 
    enable_checkpointing=False  # Replaces checkpoint_callback
)

trainer.fit(model)

In [None]:
# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['user'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('user')['movieId'].apply(list).to_dict()

hits = []
for (u,i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_movieIds) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    predicted_labels = np.squeeze(model(torch.tensor([u]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)
        
print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

In [None]:
user_id = '7'

user_encoder = user2user_encoded.get(user_id)

print(user_encoder)

In [None]:
print(user_interacted_items[user_encoder])

In [None]:
user_id = '7'

user_encoder = user2user_encoded.get(user_id)

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('user')['movieId'].apply(list).to_dict()

interacted_items = user_interacted_items[user_encoder]
not_interacted_items = set(all_movieIds) - set(interacted_items)
selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
test_items = selected_not_interacted + [710]
    
predicted_labels = np.squeeze(model(torch.tensor([user_encoder]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    
top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
print(top10_items)

In [None]:
list_id = top10_items

for movieId in list_id:
    movie_title_series = movies.loc[movies['movieId'] == movieId, 'title']
    if not movie_title_series.empty:
        movie_title = movie_title_series.iloc[0]
        print(f"MovieID: {movieId}, Title: {movie_title}")
    else:
        print(f"MovieID: {movieId} not found in the movies DataFrame.")

In [None]:
list_id = user_interacted_items[user_encoder]

for movieId in list_id:
    movie_title_series = movies.loc[movies['movieId'] == movieId, 'title']
    if not movie_title_series.empty:
        movie_title = movie_title_series.iloc[0]
        print(f"MovieID: {movieId}, Title: {movie_title}")
    else:
        print(f"MovieID: {movieId} not found in the movies DataFrame.")

In [None]:
torch.save(model, "model/model_fix_7m.pth")
torch.save(model.state_dict(), 'model/model_fix_7m_state_dict.pth')

In [None]:
model_state_dict_7m = NCF(num_users, num_items, train_ratings, all_movieIds)
model_state_dict_7m.load_state_dict(torch.load('model/model_fix_7m_state_dict.pth'))
full_model_7m = torch.load('model/model_fix_7m.pth')

full_model_7m.load_state_dict(model_state_dict_7m.state_dict())

In [None]:
# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['user'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('user')['movieId'].apply(list).to_dict()

hits = []
for (u,i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_movieIds) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    predicted_labels = np.squeeze(full_model_7m(torch.tensor([u]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)
        
print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

In [None]:
user_id = '7'

user_encoder = user2user_encoded.get(user_id)

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('user')['movieId'].apply(list).to_dict()

interacted_items = user_interacted_items[user_encoder]
not_interacted_items = set(all_movieIds)- set(interacted_items)
selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
test_items = selected_not_interacted + [5]
    
predicted_labels = np.squeeze(full_model_7m(torch.tensor([user_encoder]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    
top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
print(top10_items)

In [None]:
print(user_interacted_items[user_encoder])

In [None]:
print(interacted_items)

In [None]:
list_id = top10_items

for movieId in list_id:
    movie_title_series = movies.loc[movies['movieId'] == movieId, 'title']
    if not movie_title_series.empty:
        movie_title = movie_title_series.iloc[0]
        print(f"MovieID: {movieId}, Title: {movie_title}")
    else:
        print(f"MovieID: {movieId} not found in the movies DataFrame.")

In [None]:
list_id = user_interacted_items[user_encoder]

for movieId in list_id:
    movie_title_series = movies.loc[movies['movieId'] == movieId, 'title']
    if not movie_title_series.empty:
        movie_title = movie_title_series.iloc[0]
        print(f"MovieID: {movieId}, Title: {movie_title}")
    else:
        print(f"MovieID: {movieId} not found in the movies DataFrame.")

In [None]:
test_user_item_set = set(zip(test_ratings['user'], test_ratings['movieId']))

print(test_user_item_set)

In [14]:
movie_ids = ratings["movieId"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
ratings["movie"] = ratings["movieId"].map(movie2movie_encoded)

In [15]:
print(movie2movie_encoded)

{680: 0, 110: 1, 108: 2, 11902: 3, 872: 4, 88: 5, 892: 6, 11645: 7, 490: 8, 826: 9, 832: 10, 782: 11, 165: 12, 196: 13, 5961: 14, 34584: 15, 19426: 16, 65749: 17, 8785: 18, 104: 19, 1075: 20, 801: 21, 452: 22, 641: 23, 843: 24, 824: 25, 41050: 26, 10238: 27, 41951: 28, 194: 29, 614: 30, 1791: 31, 21925: 32, 31407: 33, 64: 34, 22257: 35, 121: 36, 598: 37, 1555: 38, 12: 39, 22: 40, 153: 41, 11042: 42, 778: 43, 405: 44, 615: 45, 338: 46, 797: 47, 38: 48, 1720: 49, 11656: 50, 11506: 51, 29455: 52, 11602: 53, 29453: 54, 113: 55, 439: 56, 870: 57, 809: 58, 18333: 59, 62395: 60, 119623: 61, 46326: 62, 1653: 63, 140: 64, 2332: 65, 844: 66, 2841: 67, 4974: 68, 11399: 69, 862: 70, 902: 71, 63: 72, 629: 73, 103: 74, 9886: 75, 9482: 76, 19155: 77, 11: 78, 101: 79, 278: 80, 13: 81, 9739: 82, 329: 83, 424: 84, 78: 85, 280: 86, 274: 87, 9323: 88, 532: 89, 627: 90, 602: 91, 238: 92, 289: 93, 963: 94, 62: 95, 500: 96, 2756: 97, 531: 98, 1891: 99, 85: 100, 679: 101, 429: 102, 185: 103, 28: 104, 1892: 10

In [16]:
import os
import requests
import pickle

folder = './model'
if not os.path.exists(folder):
  os.makedirs(folder)

In [None]:
pickle.dump(ratings, open(folder + '/ratings_7m_df.pkl', 'wb'))
pickle.dump(user2user_encoded, open(folder + '/user_id_encoded.pkl', 'wb'))
pickle.dump(movie2movie_encoded, open(folder + '/movie_id_encoded.pkl', 'wb'))

In [12]:
class MovieLensTrainDataset(Dataset):
    """MovieLens PyTorch Dataset for Training
    
    Args:
        ratings (pd.DataFrame): Dataframe containing the movie ratings
        all_movieIds (list): List containing all movieIds
    """

    def __init__(self, ratings, all_movieIds):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_movieIds)

    def __len__(self):
        return len(self.users)
  
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_movieIds):
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['user'], ratings['movieId']))

        num_negatives = 4
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                negative_item = np.random.choice(all_movieIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

class NCF(pl.LightningModule):
    """ Neural Collaborative Filtering (NCF)
    
        Args:
            num_users (int): Number of unique users
            num_items (int): Number of unique items
            ratings (pd.DataFrame): Dataframe containing the movie ratings for training
            all_movieIds (list): List containing all movieIds (train + test)
    """
    
    def __init__(self, num_users, num_items, ratings, all_movieIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings
        self.all_movieIds = all_movieIds
        
    def forward(self, user_input, item_input):
        
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layer
        vector = nn.ReLU()(self.fc1(vector))
        vector = nn.ReLU()(self.fc2(vector))

        # Output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred
    
    def training_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        return DataLoader(MovieLensTrainDataset(self.ratings, self.all_movieIds),
                          batch_size=512, num_workers=0)  # Set num_workers to 0

In [20]:
folder = './model/'

ratings = pickle.load(open(folder + 'ratings_7m_df.pkl', 'rb'))
user_id_encoded = pickle.load(open(folder + 'user_id_encoded.pkl', 'rb'))
movie_id_encoded = pickle.load(open(folder + 'movie_id_encoded.pkl', 'rb'))

In [21]:
num_users = ratings['user'].max()+1
num_items = ratings['movieId'].max()+1

all_movieIds = ratings['movieId'].unique()

ratings['rank_latest'] = ratings.groupby(['user'])['timestamp'] \
                                .rank(method='first', ascending=False)

train_ratings = ratings[ratings['rank_latest'] != 1]

# Drop columns that we no longer need
train_ratings = train_ratings[['user', 'movieId', 'rating']]

train_ratings.loc[:, 'rating'] = 1

In [22]:
model_state_dict_7m = NCF(num_users, num_items, train_ratings, all_movieIds)
model_state_dict_7m.load_state_dict(torch.load('model/model_fix_7m_state_dict.pth'))
full_model_7m = torch.load('model/model_fix_7m.pth')

full_model_7m.load_state_dict(model_state_dict_7m.state_dict())

<All keys matched successfully>

In [23]:
test_ratings = ratings[ratings['rank_latest'] == 1]
test_ratings = test_ratings[['user', 'movieId', 'rating']]

In [24]:
# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['user'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('user')['movieId'].apply(list).to_dict()

hits = []
for (u,i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_movieIds) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    predicted_labels = np.squeeze(full_model_7m(torch.tensor([u]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)
        
print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

  0%|          | 0/48762 [00:00<?, ?it/s]

The Hit Ratio @ 10 is 0.95


In [25]:
user_id = '7'

user_encoder = user2user_encoded.get(user_id)

print(user_encoder)

2


In [26]:
print(user_interacted_items[user_encoder])

[710, 4584, 17015, 11010, 568, 414, 1572, 1642, 10451, 18183, 241, 680, 110, 108, 109, 193, 3049, 2758, 5503, 424, 812, 581, 268, 274, 10020]


In [28]:
user_id = '7'

user_encoder = user2user_encoded.get(user_id)

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('user')['movieId'].apply(list).to_dict()

interacted_items = user_interacted_items[user_encoder]
not_interacted_items = set(all_movieIds) - set(interacted_items)
selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
test_items = selected_not_interacted + [710]
    
predicted_labels = np.squeeze(full_model_7m(torch.tensor([user_encoder]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    
top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
print(top10_items)

[710, 9800, 11040, 692, 28628, 47500, 21801, 32020, 36561, 40096]


In [29]:
list_id = top10_items

for movieId in list_id:
    movie_title_series = movies.loc[movies['movieId'] == movieId, 'title']
    if not movie_title_series.empty:
        movie_title = movie_title_series.iloc[0]
        print(f"MovieID: {movieId}, Title: {movie_title}")
    else:
        print(f"MovieID: {movieId} not found in the movies DataFrame.")

MovieID: 710, Title: GoldenEye (1995)
MovieID: 9800, Title: Philadelphia (1993)
MovieID: 11040, Title: Little Big Man (1970)
MovieID: 692, Title: Pink Flamingos (1972)
MovieID: 28628, Title: Stonewall (1995)
MovieID: 47500, Title: Pipe Dream (2002)
MovieID: 21801, Title: Slaughter Rule, The (2002)
MovieID: 32020, Title: Foxes (1980)
MovieID: 36561, Title: I Am Dina (2002)
MovieID: 40096, Title: O Auto da Compadecida (Dog's Will, A) (2000)


In [30]:
list_id = user_interacted_items[user_encoder]

for movieId in list_id:
    movie_title_series = movies.loc[movies['movieId'] == movieId, 'title']
    if not movie_title_series.empty:
        movie_title = movie_title_series.iloc[0]
        print(f"MovieID: {movieId}, Title: {movie_title}")
    else:
        print(f"MovieID: {movieId} not found in the movies DataFrame.")

MovieID: 710, Title: GoldenEye (1995)
MovieID: 4584, Title: Sense and Sensibility (1995)
MovieID: 17015, Title: Persuasion (1995)
MovieID: 11010, Title: Postman, The (Postino, Il) (1994)
MovieID: 568, Title: Apollo 13 (1995)
MovieID: 414, Title: Batman Forever (1995)
MovieID: 1572, Title: Die Hard: With a Vengeance (1995)
MovieID: 1642, Title: Net, The (1995)
MovieID: 10451, Title: Eat Drink Man Woman (Yin shi nan nu) (1994)
MovieID: 18183, Title: Like Water for Chocolate (Como agua para chocolate) (1992)
MovieID: 241, Title: Natural Born Killers (1994)
MovieID: 680, Title: Pulp Fiction (1994)
MovieID: 110, Title: Three Colors: Red (Trois couleurs: Rouge) (1994)
MovieID: 108, Title: Three Colors: Blue (Trois couleurs: Bleu) (1993)
MovieID: 109, Title: Three Colors: White (Trzy kolory: Bialy) (1994)
MovieID: 193, Title: Star Trek: Generations (1994)
MovieID: 3049, Title: Ace Ventura: Pet Detective (1994)
MovieID: 2758, Title: Addams Family Values (1993)
MovieID: 5503, Title: Fugitive, T

In [32]:
model_state_dict_7m = NCF(num_users, num_items, train_ratings, all_movieIds)
model_state_dict_7m.load_state_dict(torch.load('model/model_fix_7m_state_dict.pth'))

<All keys matched successfully>

In [33]:
# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['user'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('user')['movieId'].apply(list).to_dict()

hits = []
for (u,i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_movieIds) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    predicted_labels = np.squeeze(model_state_dict_7m(torch.tensor([u]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)
        
print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

  0%|          | 0/48762 [00:00<?, ?it/s]

The Hit Ratio @ 10 is 0.95
