In [None]:
import pandas as pd

In [None]:
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')
tags.head()
del tags['timestamp']

row_0 = tags.iloc[0]
type(row_0)
print(row_0)

In [None]:
tags.isnull().any().any()
tags=tags.dropna()
tags.isnull().any().any()

In [None]:
t = movies.merge(tags, on='movieId', how='inner')
t.head()

In [None]:
avg_ratings= ratings.groupby('movieId', as_index=False).mean()
del avg_ratings['userId']
avg_ratings.head()

box_office = movies.merge(avg_ratings, on='movieId', how='inner')
box_office.tail()

is_highly_rated = box_office['rating'] >= 4.0
box_office[is_highly_rated][-5:]

In [None]:
movie_genres = movies['genres'].str.split('|', expand=True)
movie_genres['isComedy'] = movies['genres'].str.contains('Comedy')
movies['year'] = movies['title'].str.extract('.*\((.*)\).*', expand=True)

Train-test split
Along with the rating, there is also a timestamp column that shows the date and time the review was submitted. Using the timestamp column, we will implement our train-test split strategy using the leave-one-out methodology. For each user, the most recent review is used as the test set (i.e. leave one out), while the rest will be used as training data .

To illustrate this, the movies reviewed by user 39,849 is shown below. The last movie reviewed by the user is the 2014 hit movie Guardians of The Galaxy. We'll use this movie as the testing data for this user, and use the rest of the reviewed movies as training data.

In [None]:
ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'] \
                                .rank(method='first', ascending=False)

train_ratings = ratings[ratings['rank_latest'] != 1]
test_ratings = ratings[ratings['rank_latest'] == 1]

# drop columns that we no longer need
train_ratings = train_ratings[['userId', 'movieId', 'rating']]
test_ratings = test_ratings[['userId', 'movieId', 'rating']]

Converting the dataset into an implicit feedback dataset
As discussed earlier, we will train a recommender system using implicit feedback. However, the MovieLens dataset that we're using is based on explicit feedback. To convert this dataset into an implicit feedback dataset, we'll simply binarize the ratings such that they are are '1' (i.e. positive class). The value of '1' represents that the user has interacted with the item.

It is important to note that using implicit feedback reframes the problem that our recommender is trying to solve. Instead of trying to predict movie ratings (when using explicit feedback), we are trying to predict whether the user will interact (i.e. click/buy/watch) with each movie, with the aim of presenting to users the movies with the highest interaction likelihood.

We do have a problem now though. After binarizing our dataset, we see that every sample in the dataset now belongs to the positive class. However we also require negative samples to train our models, to indicate movies that the user has not interacted with. We assume that such movies are those that the user are not interested in - even though this is a sweeping assumption that may not be true, it usually works out rather well in practice.

The code below generates 4 negative samples for each row of data. In other words, the ratio of negative to positive samples is 4:1. This ratio is chosen arbitrarily but I found that it works rather well (feel free to find the best ratio yourself!)

In [None]:
import tqdm
train_ratings.loc[:, 'rating'] = 1

# Get a list of all movie IDs
all_movieIds = ratings['movieId'].unique()

# Placeholders that will hold the training data
users, items, labels = [], [], []

# This is the set of items that each user has interaction with
user_item_set = set(zip(train_ratings['userId'], train_ratings['movieId']))

# 4:1 ratio of negative to positive samples
num_negatives = 4

for (u, i) in tqdm(user_item_set):
    users.append(u)
    items.append(i)
    labels.append(1) # items that the user has interacted with are positive
    for _ in range(num_negatives):
        # randomly select an item
        negative_item = np.random.choice(all_movieIds) 
        # check that the user has not interacted with this item
        while (u, negative_item) in user_item_set:
            negative_item = np.random.choice(all_movieIds)
        users.append(u)
        items.append(negative_item)
        labels.append(0) # items not interacted with are negative

Neural Collaborative Filtering 

The key here is that we don't need the user to interact on every single item in the list of recommendations. Instead, we just need the user to interact with at least one item on the list - as long as the user does that, the recommendations have worked.

To simulate this, let's run the following evaluation protocol to generate a list of 10 recommended items for each user.

For each user, randomly select 99 items that the user has not interacted with
Combine these 99 items with the test item (the actual item that the user interacted with). We now have 100 items.
Run the model on these 100 items, and rank them according to their predicted probabilities
Select the top 10 items from the list of 100 items. If the test item is present within the top 10 items, then we say that this is a hit.
Repeat the process for all users. The Hit Ratio is then the average hits.
This evaluation protocol is known as Hit Ratio @ 10, and it is commonly used to evaluate recommender systems.

In [None]:
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

hits = []
for (u,i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_movieIds) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    predicted_labels = np.squeeze(model(torch.tensor([u]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)
        
print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

---

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


In [2]:

ratings = pd.read_csv('ml-latest-small/ratings.csv')
#ratings = pd.read_csv('ml-20m/ratings.csv')
ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'] \
                                .rank(method='first', ascending=False)

train_ratings = ratings[ratings['rank_latest'] != 1]
test_ratings = ratings[ratings['rank_latest'] == 1]

# drop columns that we no longer need
train_ratings = train_ratings[['userId', 'movieId', 'rating']]
test_ratings = test_ratings[['userId', 'movieId', 'rating']]

train_ratings.loc[:, 'rating'] = 1

In [3]:
# Get a list of all movie IDs
all_movieIds = torch.tensor(ratings['movieId'].unique(), device='cuda')

# Convert train_ratings to PyTorch tensors
user_item_pairs = torch.tensor(    list(zip(train_ratings['userId'], train_ratings['movieId'])), device='cuda')

# Placeholders for training data
users, items, labels = [], [], []

# Create a dictionary for user-item interactions
user_item_dict = train_ratings.groupby('userId')['movieId'].apply(set).to_dict()

# Convert user-item dictionary to GPU
user_item_dict = {k: torch.tensor(list(v), device='cuda') for k, v in user_item_dict.items()}

# Negative to positive ratio
num_negatives = 4

# Start sampling
for (u, i) in tqdm(user_item_pairs):
    users.append(u.item())
    items.append(i.item())
    labels.append(1)  # Positive label for interacted items

    # Get user's non-interacted items
    not_interacted_items = torch.tensor(
        list(set(all_movieIds.tolist()) - set(user_item_dict[u.item()].tolist())), device='cuda'
    )

    # Randomly sample negative items
    negative_samples = not_interacted_items[
        torch.randint(0, len(not_interacted_items), (num_negatives,))
    ]

    users.extend([u.item()] * num_negatives)
    items.extend(negative_samples.tolist())
    labels.extend([0] * num_negatives)

# Convert results to PyTorch tensors
users = torch.tensor(users, device='cuda')
items = torch.tensor(items, device='cuda')
labels = torch.tensor(labels, device='cuda')


  0%|          | 0/100226 [00:00<?, ?it/s]

In [9]:
# Get a list of all movie IDs
all_movieIds = ratings['movieId'].unique()

# Placeholders that will hold the training data
users, items, labels = [], [], []

# This is the set of items that each user has interaction with
user_item_set = set(zip(train_ratings['userId'], train_ratings['movieId']))

# 4:1 ratio of negative to positive samples
num_negatives = 4

for (u, i) in tqdm(user_item_set):
    users.append(u)
    items.append(i)
    labels.append(1) # items that the user has interacted with are positive
    for _ in range(num_negatives):
        # randomly select an item
        negative_item = np.random.choice(all_movieIds) 
        # check that the user has not interacted with this item
        while (u, negative_item) in user_item_set:
            negative_item = np.random.choice(all_movieIds)
        users.append(u)
        items.append(negative_item)
        labels.append(0) # items not interacted with are negative

  0%|          | 0/100226 [00:00<?, ?it/s]

In [10]:
all_movieIds.shape

(9724,)

In [11]:
import ast
semantic_embedding = pd.read_csv('output_50.csv')

semantic_embedding = pd.DataFrame(semantic_embedding)
semantic_embedding['embeddings'] = semantic_embedding['embeddings'].apply(ast.literal_eval)
embeddings_df = pd.DataFrame(semantic_embedding['embeddings'].tolist())

# Concatenate the embeddings with the original DataFrame, dropping the old embeddings column
semantic_embedding = pd.concat([semantic_embedding.drop(columns=['embeddings']), embeddings_df], axis=1)

# Drop the 'txt' column from the DataFrame
semantic_embedding.drop(columns=['txt'], inplace=True)
semantic_embedding_grouped = semantic_embedding.groupby('item_id').mean()
semantic_embedding.shape, semantic_embedding_grouped.shape


((50, 1025), (45, 1024))

In [12]:
#semantic_features_dict = semantic_embedding_grouped.set_index('item_id').T.to_dict('list')
semantic_features_dict=semantic_embedding_grouped.to_dict('index')
semantic_features_dict = {
    movie_id: list(features.values()) for movie_id, features in semantic_features_dict.items()
}

In [13]:
class MovieLensTrainDataset(Dataset):
    """MovieLens PyTorch Dataset for Training
    
    Args:
        ratings (pd.DataFrame): Dataframe containing the movie ratings
        all_movieIds (list): List containing all movieIds
    
    """

    def __init__(self, ratings, all_movieIds, sematic_features):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_movieIds)
        self.sematic_features = sematic_features
        self.sematic_length = len(next(iter(semantic_features_dict.values())))

    def __len__(self):
        return len(self.users)
  
    def __getitem__(self, idx):
        movie_id = self.items[idx].item()
        user_id = self.users[idx].item()
        if movie_id in self.sematic_features:
            semantic_feature = self.sematic_features.get(movie_id)
        else:
            semantic_feature = np.zeros(self.sematic_length)
        return {
        'user': torch.tensor(user_id, dtype=torch.long),
        'item': torch.tensor(movie_id, dtype=torch.long),
        'label': torch.tensor(self.labels[idx], dtype=torch.float),
        'semantic_feature': torch.tensor(semantic_feature, dtype=torch.float)
    }
    
    def get_dataset(self, ratings, all_movieIds):
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['userId'], ratings['movieId']))

        num_negatives = 4
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                negative_item = np.random.choice(all_movieIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)


In [14]:
class NCF(pl.LightningModule):
    """ Neural Collaborative Filtering (NCF)
    
        Args:
            num_users (int): Number of unique users
            num_items (int): Number of unique items
            ratings (pd.DataFrame): Dataframe containing the movie ratings for training
            all_movieIds (list): List containing all movieIds (train + test)
    """
    
    def __init__(self, num_users, num_items, semantic_embedding, ratings, 
                 all_movieIds, embedding_dim = 8, semantic_dim = 1024):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_dim)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_dim)
        
        self.semantic_transform = nn.Linear(semantic_dim, embedding_dim)

        self.fc1 = nn.Linear(in_features=embedding_dim * 3, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)

        self.ratings = ratings
        self.all_movieIds = all_movieIds
        self.semantic_embedding=semantic_embedding
        
    def forward(self, user_input, item_input, semantic_embedding_t):
        
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        semantic_embedding_t = self.semantic_transform(semantic_embedding_t)


        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded, semantic_embedding_t], dim=-1)

        # Pass through dense layer
        vector = nn.ReLU()(self.fc1(vector))
        vector = nn.ReLU()(self.fc2(vector))

        # Output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred
    
    def training_step(self, batch, batch_idx):
        user_input = batch["user"]
        item_input = batch["item"]
        semantic_input = batch["semantic_feature"]
        labels = batch["label"]
        
        predicted_labels = self(user_input, item_input, semantic_input)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        return DataLoader(MovieLensTrainDataset(self.ratings, self.all_movieIds
                                                ,self.semantic_embedding),
                          batch_size=512, num_workers=0)

In [15]:
num_users = ratings['userId'].max()+1
num_items = ratings['movieId'].max()+1

all_movieIds = ratings['movieId'].unique()
print(num_users, num_items, all_movieIds.shape)

611 193610 (9724,)


In [16]:
model = NCF(num_users, num_items, ratings=train_ratings, semantic_embedding=semantic_features_dict, all_movieIds=all_movieIds)
trainer = pl.Trainer(max_epochs=7, devices="auto", reload_dataloaders_every_n_epochs=True,
                     enable_progress_bar=True, logger=False, enable_checkpointing=True)

trainer.fit(model)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
c:\Users\D.Joker\anaconda3\envs\tf\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:630: Checkpoint directory d:\Learning\movieLen\dataset\checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type      | Params
-------------------------------------------------
0 | user_embedding     | Embedding | 4.9 K 
1 | item_embedding     | Embedding | 1.5 M 
2 | semantic_transform | Linear    | 8.2 K 
3 | fc1                | Linear    | 1.6 K 
4 | fc2                | Linear    | 2.1 K 
5 | output             | Linear    | 33    
-------------------------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.263     Total estimated model params size (MB)
c:\Users\D.Joker\anaconda3\envs\tf\lib\site-packages\pytorch_lightning\tra

Training: |          | 0/? [00:00<?, ?it/s]

  'label': torch.tensor(self.labels[idx], dtype=torch.float),
`Trainer.fit` stopped: `max_epochs=7` reached.


In [18]:
# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

hits = []
for (u, i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_movieIds) - set(interacted_items)

    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]

    semantic_features = [
        semantic_features_dict.get(item, np.zeros(1024)) for item in test_items
    ]

    user_tensor = torch.tensor([u] * 100, dtype=torch.long)
    item_tensor = torch.tensor(test_items, dtype=torch.long)
    semantic_tensor = torch.tensor(semantic_features, dtype=torch.float)

    predicted_labels = model(user_tensor, item_tensor, semantic_tensor).detach().numpy().squeeze()
    
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)
        
print("The Hit Ratio @ 10 is {:.6f}".format(np.average(hits)))

  0%|          | 0/610 [00:00<?, ?it/s]

The Hit Ratio @ 10 is 0.591803


# Read Data

In [None]:
import json
import pandas as pd
max_lines = 30
current_line = 0
reviews = []

with open('movie_dataset_public_final/raw/reviews.json', 'r') as file:
    for line in file:
        #if current_line >= max_lines:
        #    break
        data = json.loads(line)
        reviews.append(data)
        current_line += 1
reviews = pd.DataFrame(reviews)
len(reviews)

In [None]:
# 假设 'all_movie_ids' 是一个包含所有电影 ID 的列表
ratings = pd.read_csv('ml-20m/ratings.csv')  # 读取评分数据
all_movie_ids = ratings['movieId'].unique()  # 示例电影 ID 列表

# 获取当前 'reviews' 中出现的 'item_id'
present_item_ids = reviews['item_id'].unique()

# 找出没有出现的 item_id
missing_item_ids = set(all_movie_ids) - set(present_item_ids)

if missing_item_ids:
    print(f"Missing item_ids: {missing_item_ids}")
else:
    print("All item_ids are present.")

In [None]:
with open('movie_dataset_public_final/raw/metadata.json', 'r') as file:
    metadata = [json.loads(line) for line in file]
metadata = pd.DataFrame(metadata)
metadata.head()
with open('movie_dataset_public_final/raw/survey_answers.json', 'r') as file:
    survey_answers = [json.loads(line) for line in file]
survey_answers = pd.DataFrame(survey_answers)
survey_answers.head()


In [None]:
current_line = 0
ratings = []
with open('movie_dataset_public_final/raw/ratings.json', 'r') as file:
    for line in file:
        if current_line >= max_lines:
            break
        data = json.loads(line)
        ratings.append(data)
        current_line += 1
ratings = pd.DataFrame(ratings)
ratings.head()

命中率 (Hit Rate)
覆盖率 (Coverage)
NDCG (Normalized Discounted Cumulative Gain)
AUC (Area Under the Curve)