In [1]:
import numpy as np
import pandas as pd
from os import path
from collections import OrderedDict
from tqdm import tqdm
from typing import Dict

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.metrics import roc_auc_score

torch.__version__

'2.2.1+cu121'

In [2]:
# CUDA 사용 가능 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [3]:
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x2495b6354d0>

In [4]:
DIR_DATA = 'data'
DIR_MODEL = 'models'

### Get data

In [5]:
# Load data
#df_full = pd.read_csv('data_int.csv')
df_full = pd.read_csv('test.csv')
df_full.head(10)

Unnamed: 0,user,business,rating,date
0,0,1481,5,1479040537
1,0,8854,5,1398704639
2,1,7950,4,1301177737
3,1,12413,5,1301177343
4,2,7030,2,1497361148
5,3,1217,3,1338848137
6,3,1983,2,1337621193
7,3,3313,3,1339444620
8,3,4455,3,1337294136
9,3,5005,2,1340650648


### Build the references

I'm planning to use the `Embedding` layer, so I need to link real ids of the users and movies to the order ones.

In [6]:
sorted(df_full.business.unique())[-10:]

[14570, 14571, 14572, 14574, 14576, 14580, 14582, 14583, 14584, 14585]

In [7]:
sorted(df_full.user.unique())[:10]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [8]:
unique_users = df_full.user.unique()
user_ids = {u: i for i, u in enumerate(unique_users)}

unique_businesses = df_full.business.unique()
business_ids = {m: i for i, m in enumerate(unique_businesses)}

In [9]:
df_full['user_id'] = df_full.user.map(user_ids)
df_full['business_id'] = df_full.business.map(business_ids)

df_full.head(10)

Unnamed: 0,user,business,rating,date,user_id,business_id
0,0,1481,5,1479040537,0,0
1,0,8854,5,1398704639,0,1
2,1,7950,4,1301177737,1,2
3,1,12413,5,1301177343,1,3
4,2,7030,2,1497361148,2,4
5,3,1217,3,1338848137,3,5
6,3,1983,2,1337621193,3,6
7,3,3313,3,1339444620,3,7
8,3,4455,3,1337294136,3,8
9,3,5005,2,1340650648,3,9


### Train/test split

Here the main idea is to extract some movies for users who have a big amount of positive reviews into the test subtest. I extract 2 movies for each user who have more than 20 positive reviews. This test subset won't be used during training, but these movies should appear in the top recommendations for each user accordingly.

#### Test subset

In [10]:
tmp_test = df_full[df_full.rating > 4]
tmp_test = tmp_test.groupby('user').business.count().reset_index()
tmp_test.shape

(9101, 2)

In [11]:
conditions = (df_full.user.isin(tmp_test[tmp_test.business > 20].user)) & (df_full.rating > 4)
df_test = df_full[conditions].groupby('user').head(2).reset_index()

del df_test['index']
df_test.shape

(176, 6)

In [12]:
ground_truth_test = df_test.groupby('user_id').business_id.agg(list).reset_index()
ground_truth_test.head(10)

Unnamed: 0,user_id,business_id
0,253,"[498, 93]"
1,1125,"[1986, 1989]"
2,1202,"[2296, 2233]"
3,1511,"[483, 487]"
4,1633,"[1506, 1569]"
5,1692,"[964, 995]"
6,1794,"[500, 2883]"
7,1957,"[3186, 3187]"
8,2170,"[360, 292]"
9,2185,"[1453, 769]"


#### Training subset

In [13]:
df_train = pd.concat([df_full, df_test]).drop_duplicates(keep=False)
df_train.shape

(49822, 6)

In [14]:
ground_truth_train = df_train[df_train.rating > 3].groupby('user_id').business_id.agg(list).reset_index()
ground_truth_train.head(10)

Unnamed: 0,user_id,business_id
0,0,"[0, 1]"
1,1,"[2, 3]"
2,3,"[10, 12]"
3,4,"[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 2..."
4,5,"[30, 30, 31, 33, 34, 35, 36, 38, 39, 40, 42, 4..."
5,6,"[47, 48]"
6,7,"[49, 50]"
7,8,"[51, 52, 53, 54, 55]"
8,9,[56]
9,10,"[57, 58]"


### Building triplets

Bayers Personalized Ranking requires for the training a triplet of the user, positive item and negative item. For each user, I create a pair of each positive ranked movie (the rank is higher than 3) with all negative movies (the rank is equal  3 and lower than).

In [15]:
df_triplets = pd.DataFrame(columns=['user_id', 'positive_m_id', 'negative_m_id'])

df_triplets.head(10)

Unnamed: 0,user_id,positive_m_id,negative_m_id


In [16]:
%%time

data = []
users_without_data = []

for user_id in tqdm(df_train.user_id.unique()):
    positive_businesss = df_train[(df_train.user_id == user_id) & (df_train.rating > 3)].business_id.values
    negative_businesss = df_train[(df_train.user_id == user_id) & (df_train.rating <= 3)].business_id.values

    if negative_businesss.shape[0] == 0 or positive_businesss.shape[0] == 0:
        users_without_data.append(user_id)
        continue


    for positive_business in positive_businesss:
        for negative_business in negative_businesss:
            data.append({'user_id': user_id, 'positive_m_id': positive_business, 'negative_m_id': negative_business})



100%|██████████████████████████████████████████████████████████████████████████| 14469/14469 [00:05<00:00, 2822.22it/s]

CPU times: total: 3.25 s
Wall time: 5.13 s





In [17]:
df_triplets = pd.DataFrame(data)

In [18]:
df_triplets.shape, df_train.shape
df_triplets.head(10)

Unnamed: 0,user_id,positive_m_id,negative_m_id
0,3,10,5
1,3,10,6
2,3,10,7
3,3,10,8
4,3,10,9
5,3,10,11
6,3,12,5
7,3,12,6
8,3,12,7
9,3,12,8


### BPR NN

In [19]:
num_users = len(unique_users)
num_items = len(unique_businesses)

num_users, num_items

(14470, 9396)

In [20]:
unique_business_ids = list(df_full.business_id.unique())

In [21]:
# item_ids의 데이터 유형을 확인합니다.
item_ids_type = type(unique_businesses)
print("item_ids 데이터 유형:", item_ids_type)

# item_ids의 값 범위를 확인합니다.
min_item_id = np.min(unique_businesses)
max_item_id = np.max(unique_businesses)
print("item_ids 최소값:", min_item_id)
print("item_ids 최대값:", max_item_id)

# 모델이 처리할 수 있는 범위 내의 아이템 ID 범위를 확인합니다.
model_min_item_id = 0  # 모델이 처리할 수 있는 아이템 ID 최소값
model_max_item_id = num_items - 1  # 모델이 처리할 수 있는 아이템 ID 최대값
print("모델이 처리할 수 있는 아이템 ID 최소값:", model_min_item_id)
print("모델이 처리할 수 있는 아이템 ID 최대값:", model_max_item_id)

# 데이터 값 범위가 모델이 처리할 수 있는 범위 내에 있는지 확인합니다.
if min_item_id >= model_min_item_id and max_item_id <= model_max_item_id:
    print("데이터 값 범위가 모델이 처리할 수 있는 범위 내에 있습니다.")
else:
    print("경고: 데이터 값 범위가 모델이 처리할 수 있는 범위를 벗어납니다.")


item_ids 데이터 유형: <class 'numpy.ndarray'>
item_ids 최소값: 1
item_ids 최대값: 14585
모델이 처리할 수 있는 아이템 ID 최소값: 0
모델이 처리할 수 있는 아이템 ID 최대값: 9395
경고: 데이터 값 범위가 모델이 처리할 수 있는 범위를 벗어납니다.


### Build a model

In [22]:
def bpr_predict(model, user_id, item_ids, user_layer='user_embedding', item_layer='item_embedding'):
    """
    Predict by multiplication user vector by item matrix

    :return: list of the scores
    """
    user_vector = model.state_dict()[user_layer].cpu().numpy()[user_id]
    item_matrix = model.state_dict()[item_layer].cpu().numpy()[item_ids]

    scores = np.dot(user_vector, item_matrix.T)

    return scores

In [23]:
class IdentityLoss(nn.Module):
    def __init__(self):
        super(IdentityLoss, self).__init__()

    def forward(self, y_pred):
        return torch.mean(y_pred)

class BPRTripletLoss(nn.Module):
    def __init__(self):
        super(BPRTripletLoss, self).__init__()

    def forward(self, positive_item_latent, negative_item_latent, user_latent):
        positive_interactions = torch.sum(user_latent * positive_item_latent, dim=-1, keepdim=True)
        negative_interactions = torch.sum(user_latent * negative_item_latent, dim=-1, keepdim=True)

        return torch.sub(1.0, torch.sigmoid(torch.sub(positive_interactions, negative_interactions)))


In [24]:
class BPRModel(nn.Module):
    def __init__(self, num_users, num_items, latent_dim):
        super(BPRModel, self).__init__()

        self.user_embedding = nn.Embedding(num_users, latent_dim)
        self.item_embedding = nn.Embedding(num_items, latent_dim)

    def forward(self, positive_item_input, negative_item_input, user_input):
        positive_item_embedding = self.item_embedding(positive_item_input).view(positive_item_input.shape[0], -1)
        negative_item_embedding = self.item_embedding(negative_item_input).view(negative_item_input.shape[0], -1)
        user_embedding = self.user_embedding(user_input).view(user_input.shape[0], -1)

        return positive_item_embedding, negative_item_embedding, user_embedding


In [25]:
latent_dim = 350
num_epochs = 1
lr = 0.001

model = BPRModel(num_users, num_items, latent_dim)
loss_fn = IdentityLoss()
triplet_loss_fn = BPRTripletLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

print('Total number of parameters:', sum(p.numel() for p in model.parameters() if p.requires_grad))

Total number of parameters: 8353100


In [26]:
user_input_tensor = torch.tensor(df_triplets.user_id.values, dtype=torch.long)
positive_item_input_tensor = torch.tensor(df_triplets.positive_m_id.values, dtype=torch.long)
negative_item_input_tensor = torch.tensor(df_triplets.negative_m_id.values, dtype=torch.long)

In [27]:
for epoch in range(num_epochs):
    optimizer.zero_grad()
    positive_item_embedding, negative_item_embedding, user_embedding = model(positive_item_input_tensor, negative_item_input_tensor, user_input_tensor)
    loss = triplet_loss_fn(positive_item_embedding, negative_item_embedding, user_embedding)
    loss = loss_fn(loss)
    loss.backward()
    optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')


Epoch [1/1], Loss: 0.5041916966438293


### Evaluation

In [28]:
def full_auc(model: BPRModel, ground_truth: Dict[int, list], items: list) -> float:
    """
    Measure AUC for model and ground truth for all items

    :param model:
    :param ground_truth: dictionary of the users and the high ranked movies for the specific user
    :param items: a list of the all available movies
    :return: AUC
    """

    number_of_items = len(items)
    scores = []

    for user_id, true_item_ids in ground_truth:
        predictions = bpr_predict(model, user_id, items)
        grnd = np.zeros(number_of_items, dtype=np.int32)

        for p in true_item_ids:
            index = items.index(p)
            grnd[index] = 1
            

        if true_item_ids:
            scores.append(roc_auc_score(grnd, predictions))

        print(grnd);

    return sum(scores) / len(scores)

In [29]:
def mean_average_precision_k(model: BPRModel,
                           ground_truth: Dict[int, list],
                           items: list,
                           k=100) -> float:
    """
    Calculate mean eavarage precission per user

    :param model:
    :param ground_truth: dictionary of the users and the high ranked movies for the specific user
    :param items: a list of the all available movies
    :param k: top N recommendations per user
    :return: mean eavarage precission
    """
    scores = []

    for user, actual in ground_truth:
        predictions = bpr_predict(model, user, items)
        predictions = dict(zip(items, predictions))
        predictions = sorted(predictions.items(), key=lambda kv: kv[1], reverse=True)[:k]
        predictions = list(OrderedDict(predictions).keys())

        score = 0.0
        num_hits = 0.0

        for i, p in enumerate(predictions):
            if p in actual:
                num_hits += 1.0
                score += num_hits / (i + 1.0)

        score = score / min(len(actual), k)
        scores.append(score)

    return np.mean(scores)

### Train

In [30]:
print(f'AUC train: {full_auc(model, ground_truth_train.values, unique_business_ids)}')
print(f'Mean average precision train: {mean_average_precision_k(model, ground_truth_train.values, unique_business_ids)}')

KeyError: 'user_embedding'

### Test

In [None]:
print(f'AUC test: {full_auc(model, ground_truth_test.values, unique_businesses_ids)}')
print(f'Mean average precision test: {mean_average_precision_k(model, ground_truth_test.values, unique_businesses_ids)}')