In [1]:
import warnings 
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertConfig, BertModel
from tqdm import tqdm
import random
import os
import datetime

# Фиксируем seed для воспроизводимости
seed = 1001 
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
os.environ['PYTHONHASHSEED'] = str(seed)

print(f"Using seed: {seed}")

Using seed: 1001


In [3]:
interactions = pd.read_csv('interactions_train.csv')
users = pd.read_csv('users.csv')
items = pd.read_csv('items.csv')
test_users = pd.read_csv('users_public_test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

print("Interactions shape:", interactions.shape)
print("Users shape:", users.shape)
print("Items shape:", items.shape)
print("Test users shape:", test_users.shape)

Interactions shape: (922967, 5)
Users shape: (840197, 5)
Items shape: (15963, 13)
Test users shape: (198636, 1)


In [4]:
interactions['last_watch_dt'] = pd.to_datetime(interactions['last_watch_dt'])
interactions = interactions.sort_values(by=['user_id', 'last_watch_dt'])

user_sequences = interactions.groupby('user_id')['item_id'].apply(list).reset_index()

user_sequences = user_sequences.merge(users, on='user_id', how='left')

items['text'] = items['title'].fillna('') + ' ' + items['genres'].fillna('') + ' ' + items['countries'].fillna('') + ' ' + items['directors'].fillna('') + ' ' + items['actors'].fillna('') + ' ' + items['keywords'].fillna('')

item_encoder = LabelEncoder()
all_items = interactions['item_id'].unique()
item_encoder.fit(all_items)
num_items = len(item_encoder.classes_)

user_sequences['encoded_seq'] = user_sequences['item_id'].apply(lambda x: item_encoder.transform(x))

In [5]:
class RecDataset(Dataset):
    def __init__(self, sequences, max_len=50):
        self.sequences = sequences
        self.max_len = max_len
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        seq = self.sequences.iloc[idx]['encoded_seq'].tolist()
        if len(seq) > self.max_len:
            seq = seq[-self.max_len:]
        else:
            seq = [0] * (self.max_len - len(seq)) + seq  
        
        masked_seq = seq.copy()
        labels = [-100] * len(seq)  
        mask_pos = random.randint(0, len(seq)-1)
        labels[mask_pos] = seq[mask_pos]
        masked_seq[mask_pos] = num_items 
        
        return {
            'input_ids': torch.tensor(masked_seq),
            'labels': torch.tensor(labels)
        }

train_seq, val_seq = train_test_split(user_sequences, test_size=0.2, random_state=seed)

train_ds = RecDataset(train_seq)
val_ds = RecDataset(val_seq)

train_dl = DataLoader(train_ds, batch_size=32, num_workers = 0)
val_dl = DataLoader(val_ds, batch_size=32, num_workers = 0)

In [6]:
next(iter(train_dl))

{'input_ids': tensor([[    0,     0,     0,  ...,  2590,  5187,  4498],
         [    0,     0,     0,  ...,  1701, 11856,   950],
         [    0,     0,     0,  ...,  4497, 11856, 10044],
         ...,
         [    0,     0,     0,  ..., 11856,     0,    94],
         [    0,     0,     0,  ..., 10533,  2868,  7488],
         [    0,     0,     0,  ...,  5677,  5187,  9705]]),
 'labels': tensor([[ -100,  -100,  -100,  ...,  -100,  -100,  -100],
         [ -100,  -100,  -100,  ...,  -100, 10587,  -100],
         [ -100,  -100,  -100,  ...,  -100,  7415,  -100],
         ...,
         [ -100,  -100,  -100,  ...,     0,  -100,  -100],
         [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
         [ -100,  -100,  -100,  ...,  -100,  -100,  -100]])}

In [7]:
next(iter(val_dl))

{'input_ids': tensor([[    0,     0,     0,  ...,     0,     0,  5187],
         [    0,     0,     0,  ...,  9508,  8546,  1631],
         [    0,     0,     0,  ...,  3672,  2710,  5187],
         ...,
         [    0,     0,     0,  ...,     0,     0,  2590],
         [    0,     0,     0,  ...,  6994, 11127,   342],
         [    0,     0,     0,  ...,  6437,  6973, 11856]]),
 'labels': tensor([[-100, -100, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, -100],
         ...,
         [-100, -100, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, 2597]])}

In [8]:
class BERT4Rec(nn.Module):
    def __init__(self, num_items, embed_dim=128//2, num_layers=2, num_heads=2):
        super().__init__()
        self.item_embedding = nn.Embedding(num_items + 2, embed_dim)  
        config = BertConfig(
            vocab_size=num_items + 2,
            hidden_size=embed_dim,
            num_hidden_layers=num_layers,
            num_attention_heads=num_heads,
            intermediate_size=embed_dim * 4,
            max_position_embeddings=512
        )
        self.transformer = BertModel(config)
        self.fc = nn.Linear(embed_dim, num_items)
    
    def forward(self, input_ids, labels=None):
        embeds = self.item_embedding(input_ids)
        outputs = self.transformer(inputs_embeds=embeds)
        logits = self.fc(outputs.last_hidden_state)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(logits.view(-1, num_items), labels.view(-1))
        
        return logits, loss

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERT4Rec(num_items).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

def train_epoch(dl):
    model.train()
    total_loss = 0
    for batch in tqdm(dl):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        _, loss = model(input_ids, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dl)

def val_epoch(dl):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(dl):
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            _, loss = model(input_ids, labels)
            total_loss += loss.item()
    return total_loss / len(dl)

for epoch in range(5): 
    train_loss = train_epoch(train_dl)
    val_loss = val_epoch(val_dl)
    print(f"Epoch {epoch}: Train Loss {train_loss}, Val Loss {val_loss}\n")

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
model_path = f"bert4rec_seed{seed}.pth"
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

100%|██████████| 7125/7125 [00:55<00:00, 127.28it/s]
100%|██████████| 1782/1782 [00:05<00:00, 310.15it/s]


Epoch 0: Train Loss 0.8210117305652483, Val Loss 0.5314941075413595



100%|██████████| 7125/7125 [00:56<00:00, 126.00it/s]
100%|██████████| 1782/1782 [00:08<00:00, 212.16it/s]


Epoch 1: Train Loss 0.5324103142973403, Val Loss 0.5301605664804467



100%|██████████| 7125/7125 [00:49<00:00, 142.71it/s]
100%|██████████| 1782/1782 [00:06<00:00, 293.97it/s]


Epoch 2: Train Loss 0.5207085148772049, Val Loss 0.538770653993007



100%|██████████| 7125/7125 [00:49<00:00, 143.13it/s]
100%|██████████| 1782/1782 [00:05<00:00, 302.25it/s]


Epoch 3: Train Loss 0.515234497968373, Val Loss 0.5182544456437886



100%|██████████| 7125/7125 [01:22<00:00, 85.87it/s] 
100%|██████████| 1782/1782 [00:06<00:00, 294.43it/s]

Epoch 4: Train Loss 0.5190841801961621, Val Loss 0.5300840243060329

Model saved to bert4rec_seed1001.pth



