In [1]:
import numpy as np
import pandas as pd
from os import path
from collections import OrderedDict
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.metrics import roc_auc_score

torch.__version__

'2.2.1+cu121'

In [2]:
# Check whether CUDA is available and pick the matching device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

Device: cuda


In [3]:
# Seed NumPy's and PyTorch's global random number generators with the same value.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1c788c35490>

In [4]:
# Presumably the folders for input data and saved models.
# NOTE(review): neither constant is referenced in the cells visible here — confirm they are still needed.
DIR_DATA = 'data'
DIR_MODEL = 'models'

### Get data

In [5]:
# Load the rating table (columns shown in the output: user, business, rating, date).
# NOTE(review): an alternative input file is left commented out — confirm which one is intended.
#df_full = pd.read_csv('data_int.csv')
df_full = pd.read_csv('test.csv')
df_full.head(10)

Unnamed: 0,user,business,rating,date
0,0,1481,5,1479040537
1,0,8854,5,1398704639
2,1,7950,4,1301177737
3,1,12413,5,1301177343
4,2,7030,2,1497361148
5,3,1217,3,1338848137
6,3,1983,2,1337621193
7,3,3313,3,1339444620
8,3,4455,3,1337294136
9,3,5005,2,1340650648


### Build the references

I'm planning to use the `Embedding` layer, so I need to map the raw user and business ids to consecutive zero-based indices.

In [6]:
# Ten largest raw business ids; the output shows gaps, so raw ids are not a dense index.
sorted(df_full.business.unique())[-10:]

[14570, 14571, 14572, 14574, 14576, 14580, 14582, 14583, 14584, 14585]

In [7]:
# Ten smallest raw user ids.
sorted(df_full.user.unique())[:10]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [8]:
# Give every raw user id a dense zero-based index (position in the unique() result).
unique_users = df_full['user'].unique()
user_ids = dict(zip(unique_users, range(len(unique_users))))

# Same remapping for the raw business ids.
unique_businesses = df_full['business'].unique()
business_ids = dict(zip(unique_businesses, range(len(unique_businesses))))

In [9]:
# Store the dense indices next to the raw ids.
df_full['user_id'] = df_full['user'].map(user_ids)
df_full['business_id'] = df_full['business'].map(business_ids)

df_full.head(10)

Unnamed: 0,user,business,rating,date,user_id,business_id
0,0,1481,5,1479040537,0,0
1,0,8854,5,1398704639,0,1
2,1,7950,4,1301177737,1,2
3,1,12413,5,1301177343,1,3
4,2,7030,2,1497361148,2,4
5,3,1217,3,1338848137,3,5
6,3,1983,2,1337621193,3,6
7,3,3313,3,1339444620,3,7
8,3,4455,3,1337294136,3,8
9,3,5005,2,1340650648,3,9


### Train/test split

Here the main idea is to move some businesses of users who have a large number of positive reviews into the test subset. I extract 2 businesses for each user who has more than 20 positive reviews (rating above 4). This test subset won't be used during training, but these businesses should appear in the top recommendations for the corresponding user.

#### Test subset

In [10]:
# Number of reviews with rating > 4 written by each user.
tmp_test = (
    df_full.loc[df_full['rating'] > 4]
    .groupby('user')['business']
    .count()
    .reset_index()
)
tmp_test.shape

(9101, 2)

In [11]:
# Rows rated above 4 that belong to users with more than 20 such rows ...
conditions = df_full['user'].isin(tmp_test.loc[tmp_test['business'] > 20, 'user']) & (df_full['rating'] > 4)
# ... of which the first two per user become the held-out test set (old index discarded).
df_test = df_full[conditions].groupby('user').head(2).reset_index(drop=True)
df_test.shape

(176, 6)

In [12]:
# One row per test user: the list of held-out business indices.
ground_truth_test = (
    df_test.groupby('user_id')['business_id']
    .agg(list)
    .reset_index()
)
ground_truth_test.head(10)

Unnamed: 0,user_id,business_id
0,253,"[498, 93]"
1,1125,"[1986, 1989]"
2,1202,"[2296, 2233]"
3,1511,"[483, 487]"
4,1633,"[1506, 1569]"
5,1692,"[964, 995]"
6,1794,"[500, 2883]"
7,1957,"[3186, 3187]"
8,2170,"[360, 292]"
9,2185,"[1453, 769]"


#### Training subset

In [13]:
# Training rows = all rows minus the held-out test rows: after stacking both frames every
# test row occurs twice, and keep=False discards all copies of a duplicated row.
# NOTE(review): rows that are exact duplicates of each other inside df_full would be discarded too.
df_train = pd.concat([df_full, df_test]).drop_duplicates(keep=False)
df_train.shape

(49822, 6)

In [14]:
# One row per training user: the list of business indices rated above 3.
ground_truth_train = (
    df_train.loc[df_train['rating'] > 3]
    .groupby('user_id')['business_id']
    .agg(list)
    .reset_index()
)
ground_truth_train.head(10)

Unnamed: 0,user_id,business_id
0,0,"[0, 1]"
1,1,"[2, 3]"
2,3,"[10, 12]"
3,4,"[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 2..."
4,5,"[30, 30, 31, 33, 34, 35, 36, 38, 39, 40, 42, 4..."
5,6,"[47, 48]"
6,7,"[49, 50]"
7,8,"[51, 52, 53, 54, 55]"
8,9,[56]
9,10,"[57, 58]"


### Building triplets

Bayesian Personalized Ranking requires triplets of a user, a positive item and a negative item for training. For each user, I pair every positively rated business (rating higher than 3) with every negatively rated business (rating equal to or lower than 3).

In [15]:
# Empty frame declaring the three triplet columns.
df_triplets = pd.DataFrame(columns=['user_id', 'positive_m_id', 'negative_m_id'])

df_triplets

Unnamed: 0,user_id,positive_m_id,negative_m_id


In [16]:
%%time

# Collect one record per (user, positive business, negative business) combination.
data = []
users_without_data = []

for user_id in tqdm(df_train.user_id.unique()):
    user_rows = df_train[df_train.user_id == user_id]
    liked = user_rows.loc[user_rows.rating > 3, 'business_id'].values
    disliked = user_rows.loc[user_rows.rating <= 3, 'business_id'].values

    # A user needs at least one business on each side to form a triplet.
    if liked.size == 0 or disliked.size == 0:
        users_without_data.append(user_id)
        continue

    data.extend(
        {'user_id': user_id, 'positive_m_id': pos, 'negative_m_id': neg}
        for pos in liked
        for neg in disliked
    )



100%|██████████████████████████████████████████████████████████████████████████| 14469/14469 [00:06<00:00, 2327.70it/s]

CPU times: total: 3.02 s
Wall time: 6.22 s





In [17]:
# Build the triplet frame directly from the collected records. Concatenating onto an empty
# placeholder frame relies on dtype handling that recent pandas versions deprecate
# (NOTE(review): confirm against the installed pandas version); constructing the frame in
# one step avoids that and still yields the three expected columns when `data` is empty.
df_triplets = pd.DataFrame(data, columns=['user_id', 'positive_m_id', 'negative_m_id'])

In [18]:
# Compare the number of generated triplets with the number of training rows.
df_triplets.shape, df_train.shape

((380451, 3), (49822, 6))

### BPR NN

In [19]:
# Number of distinct users / businesses in the full dataset.
num_users, num_items = len(unique_users), len(unique_businesses)

### Build a model

In [20]:
class BPR(nn.Module):
    """Embedding model scored on (user, positive item, negative item) triplets.

    Users and items each get a ``latent_dim``-dimensional embedding; a (user, item)
    pair is scored by the dot product of the two embeddings.
    """

    def __init__(self, num_users, num_items, latent_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, latent_dim)
        self.item_embedding = nn.Embedding(num_items, latent_dim)

    def forward(self, user_input, positive_item_input, negative_item_input):
        """Return sigmoid(score(user, positive) - score(user, negative)) for each triplet."""
        users = self.user_embedding(user_input)

        # Row-wise dot products between the user vectors and each item side.
        pos_score = (users * self.item_embedding(positive_item_input)).sum(dim=1)
        neg_score = (users * self.item_embedding(negative_item_input)).sum(dim=1)

        return (pos_score - neg_score).sigmoid()

# Length of every user/item embedding vector.
latent_dim = 350
model = BPR(num_users, num_items, latent_dim).to(device)

# The model outputs sigmoid(positive score - negative score); binary cross-entropy against
# the all-ones target used below therefore equals -log(sigmoid(score difference)).
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


### Train the model

In [21]:
# Triplets as an integer tensor with one row per triplet and the columns
# (user, positive item, negative item).
X_train = torch.tensor(df_triplets[['user_id', 'positive_m_id', 'negative_m_id']].values.astype(np.int64), dtype=torch.long).to(device)  # move the input data to the selected device
# Target is 1 for every triplet: the positive item should outrank the negative one.
y_train = torch.ones(len(df_triplets)).to(device)

In [22]:
# Mini-batch training over the triplets.
num_epochs = 1
batch_size = 256

model.train()
for epoch in range(num_epochs):
    # The triplets were generated user by user, so slicing them in storage order would give
    # batches dominated by a handful of users. Draw a fresh random order every epoch instead.
    order = torch.randperm(len(X_train), device=X_train.device)
    epoch_loss = 0.0

    for start in range(0, len(X_train), batch_size):
        batch_idx = order[start:start + batch_size]
        batch_X = X_train[batch_idx]
        batch_y = y_train[batch_idx]

        user_input = batch_X[:, 0]
        positive_item_input = batch_X[:, 1]
        negative_item_input = batch_X[:, 2]

        optimizer.zero_grad()
        outputs = model(user_input, positive_item_input, negative_item_input)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        # Weight by batch size so the shorter final batch does not skew the epoch mean.
        epoch_loss += loss.item() * len(batch_idx)

    print(f'Epoch {epoch + 1}/{num_epochs} - mean loss: {epoch_loss / max(len(X_train), 1):.4f}')

### Evaluation

In [23]:
def full_auc(model, ground_truth, items):
    """Mean per-user ROC AUC of the model's scores over all of ``items``.

    Args:
        model: trained model, forwarded unchanged to ``bpr_predict``.
        ground_truth: iterable of ``(user_id, true_item_ids)`` pairs.
        items: sequence (list or NumPy array) of the item ids to score. Every id in
            ``true_item_ids`` must occur in it.

    Returns:
        The average AUC over the users that could be scored, or ``nan`` if there were none.

    Raises:
        ValueError: if a true item id does not occur in ``items``.
    """
    # Position lookup built once: works for NumPy arrays (which have no ``.index`` method)
    # and replaces a linear scan per true item.
    position = {item: idx for idx, item in enumerate(items)}
    scores = []

    for user_id, true_item_ids in ground_truth:
        if len(true_item_ids) == 0:
            continue

        grnd = np.zeros(len(items), dtype=np.int32)
        for p in true_item_ids:
            try:
                grnd[position[p]] = 1
            except KeyError:
                raise ValueError(f'{p!r} is not in items') from None

        # ROC AUC is undefined unless both classes are present.
        if grnd.all():
            continue

        predictions = bpr_predict(model, user_id, items)
        scores.append(roc_auc_score(grnd, predictions))

    if not scores:
        return float('nan')
    return sum(scores) / len(scores)

In [24]:
def mean_average_precision_k(model, ground_truth, items, k=100):
    """Mean average precision at ``k`` over the users in ``ground_truth``.

    Args:
        model: trained model, forwarded unchanged to ``bpr_predict``.
        ground_truth: iterable of ``(user, actual)`` pairs, ``actual`` being the relevant item ids.
        items: sequence (list or NumPy array) of unique item ids to rank.
        k: number of top-ranked items considered per user.

    Returns:
        The mean of the per-user AP@k values, or ``nan`` if no user has relevant items.
    """
    scores = []

    for user, actual in ground_truth:
        # De-duplicate: repeated ids must not inflate the denominator, and a set gives
        # constant-time membership tests in the loop below.
        relevant = set(actual)
        if not relevant:
            continue  # nothing to retrieve for this user

        predictions = bpr_predict(model, user, items)
        # Stable descending order, so ties keep the order of ``items``.
        top_k = np.argsort(-np.asarray(predictions), kind='stable')[:k]

        score = 0.0
        num_hits = 0.0
        for rank, idx in enumerate(top_k, start=1):
            if items[idx] in relevant:
                num_hits += 1.0
                score += num_hits / rank

        scores.append(score / min(len(relevant), k))

    return np.mean(scores) if scores else float('nan')

In [25]:
def bpr_predict(model, user_id, item_ids):
    """Score every item in ``item_ids`` for a single user.

    Args:
        model: module exposing ``user_embedding`` and ``item_embedding`` layers.
        user_id: embedding index of the user.
        item_ids: sequence (list or NumPy array) of item embedding indices.

    Returns:
        1-D NumPy array holding sigmoid(user · item) for each entry of ``item_ids``.
    """
    # Use the device the model's weights live on instead of a notebook-level global.
    target = model.user_embedding.weight.device
    user_index = torch.as_tensor([user_id], dtype=torch.long, device=target)
    item_index = torch.as_tensor(item_ids, dtype=torch.long, device=target)

    # Inference only: do not record an autograd graph.
    with torch.no_grad():
        user_vector = model.user_embedding(user_index)
        item_vectors = model.item_embedding(item_index)
        scores = torch.sigmoid(torch.matmul(user_vector, item_vectors.T))

    return scores.cpu().numpy().flatten()

### Train

In [26]:
# Evaluate over the dense item indices 0 .. num_items - 1: that is the index space of the
# item embedding and of the ground-truth lists. Raw business ids are not valid here (they
# exceed the embedding table size and do not match the ground truth).
item_indices = list(range(num_items))
print(f'AUC train: {full_auc(model, ground_truth_train.values, item_indices)}')
print(f'Mean average precision train: {mean_average_precision_k(model, ground_truth_train.values, item_indices)}')

RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`

### Test

In [None]:
# Evaluate over the dense item indices 0 .. num_items - 1: that is the index space of the
# item embedding and of the ground-truth lists. Raw business ids are not valid here (they
# exceed the embedding table size and do not match the ground truth).
item_indices = list(range(num_items))
print(f'AUC test: {full_auc(model, ground_truth_test.values, item_indices)}')
print(f'Mean average precision test: {mean_average_precision_k(model, ground_truth_test.values, item_indices)}')