In [1]:
import warnings 
# Suppress every warning for the rest of the session (no category or module
# filter is given, so the rule matches all of them).
# NOTE(review): this also hides deprecation and numerical warnings from
# pandas / torch / sklearn — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertConfig, BertModel
from tqdm import tqdm
import random
import os
import datetime

# Fix the seed of every random-number source used below for reproducibility.
seed = 1001
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # seeds the generators of all CUDA devices
# Trade cuDNN autotuning for deterministic kernel selection.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# NOTE: hash randomization is decided when the interpreter starts, so setting
# this here only affects subprocesses launched from this kernel, not the
# kernel itself.
os.environ['PYTHONHASHSEED'] = str(seed)

print(f"Using seed: {seed}")

Using seed: 1001


In [3]:
# Load the raw competition tables from the working directory.
interactions = pd.read_csv('interactions_train.csv')
users = pd.read_csv('users.csv')
items = pd.read_csv('items.csv')
# Private test split is scored here; swap in 'users_public_test.csv' to
# predict for the public split instead.
test_users = pd.read_csv('users_private_test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Quick sanity check of what was loaded.
for label, frame in (
    ("Interactions shape:", interactions),
    ("Users shape:", users),
    ("Items shape:", items),
    ("Test users shape:", test_users),
):
    print(label, frame.shape)

Interactions shape: (922967, 5)
Users shape: (840197, 5)
Items shape: (15963, 13)
Test users shape: (198636, 1)


In [4]:
# Order every user's interactions chronologically so that the grouped lists
# below come out oldest-first.
interactions['last_watch_dt'] = pd.to_datetime(interactions['last_watch_dt'])
interactions = interactions.sort_values(by=['user_id', 'last_watch_dt'])

# One row per user: the list of watched item ids plus the user's profile columns.
user_sequences = (
    interactions.groupby('user_id')['item_id']
    .apply(list)
    .reset_index()
    .merge(users, on='user_id', how='left')
)

# Space-joined text description per item; missing fields contribute an empty string.
text_columns = ['genres', 'countries', 'directors', 'actors', 'keywords']
item_text = items['title'].fillna('')
for column in text_columns:
    item_text = item_text + ' ' + items[column].fillna('')
items['text'] = item_text

# Map raw item ids that occur in the interactions onto contiguous class indices.
all_items = interactions['item_id'].unique()
item_encoder = LabelEncoder()
item_encoder.fit(all_items)
num_items = len(item_encoder.classes_)

# Encoded counterpart of each user's watch history.
user_sequences['encoded_seq'] = user_sequences['item_id'].map(item_encoder.transform)

In [5]:
class BERT4Rec(nn.Module):
    """Transformer encoder over item-id sequences with a per-position item classifier.

    Args:
        num_items: Number of item classes scored by the output layer. The
            embedding table has ``num_items + 2`` rows, leaving two ids beyond
            the item range (presumably for special tokens such as mask /
            padding — confirm against the training code).
        embed_dim: Width of the item embeddings and of the transformer hidden state.
        num_layers: Number of transformer layers.
        num_heads: Number of attention heads per layer.
    """

    def __init__(self, num_items, embed_dim=128//2, num_layers=2, num_heads=2):
        super().__init__()
        self.item_embedding = nn.Embedding(num_items + 2, embed_dim)
        config = BertConfig(
            vocab_size=num_items + 2,
            hidden_size=embed_dim,
            num_hidden_layers=num_layers,
            num_attention_heads=num_heads,
            intermediate_size=embed_dim * 4,
            max_position_embeddings=512
        )
        self.transformer = BertModel(config)
        self.fc = nn.Linear(embed_dim, num_items)

    def forward(self, input_ids, labels=None):
        """Score every item at every sequence position.

        Args:
            input_ids: Integer tensor of item ids; indexed as
                ``(batch, position)`` by the caller in this notebook.
            labels: Optional target ids with the same leading shape as
                ``input_ids``. Positions equal to ``-100`` are excluded from
                the loss.

        Returns:
            Tuple ``(logits, loss)``. ``logits`` has one score per item class
            in its last dimension; ``loss`` is the cross-entropy over the
            non-ignored positions, or ``None`` when ``labels`` is not given.
        """
        embeds = self.item_embedding(input_ids)
        # Items are embedded by ``self.item_embedding`` and handed over as
        # ``inputs_embeds``. No attention mask is passed, so every position
        # (including any padding) is attended to.
        outputs = self.transformer(inputs_embeds=embeds)
        logits = self.fc(outputs.last_hidden_state)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            # Flatten using the layer's own output width rather than a
            # notebook-level global, so the module works for any ``num_items``
            # it was constructed with.
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        return logits, loss

In [6]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Instantiate the recommender with its default hyper-parameters and move it to the device.
model = BERT4Rec(num_items)
model = model.to(device)

optimizer = optim.Adam(params=model.parameters(), lr=1e-4)

In [None]:
# Attach each test user's watch history; users without interactions get NaN
# from the left join.
watched_by_user = interactions.groupby('user_id')['item_id'].apply(list)
test_sequences = test_users.merge(watched_by_user, on='user_id', how='left')

# Anything that is not a list (i.e. the NaN of a user with no history) becomes an empty list.
test_sequences['item_id'] = test_sequences['item_id'].map(
    lambda history: history if isinstance(history, list) else []
)

# Encode histories with the fitted item encoder; empty histories stay empty.
test_sequences['encoded_seq'] = test_sequences['item_id'].map(
    lambda history: item_encoder.transform(history).tolist() if history else []
)

# Checkpoint file for the current seed.
model_path = f"bert4rec_seed{seed}.pth"

class TestDataset(Dataset):
    """Inference dataset yielding one fixed-length, mask-terminated sequence per user.

    Args:
        sequences: DataFrame with a ``user_id`` column and an ``encoded_seq``
            column holding each user's encoded item ids (may be empty).
        max_len: Length of every returned sequence, including the trailing
            mask token.
        mask_token: Id appended as the last position. ``None`` (the default)
            falls back to the notebook-level ``num_items``, looked up when an
            item is accessed.

    The two columns are copied out of ``sequences`` at construction time, so
    later edits to the DataFrame are not reflected.
    """

    def __init__(self, sequences, max_len=50, mask_token=None):
        self.sequences = sequences
        self.max_len = max_len
        self.mask_token = mask_token
        # Extract the columns once; indexing the DataFrame row by row would
        # build a pandas Series on every access.
        self._user_ids = sequences['user_id'].tolist()
        self._encoded = sequences['encoded_seq'].tolist()

    def __len__(self):
        return len(self._user_ids)

    def __getitem__(self, idx):
        """Return ``{'user_id': ..., 'input_ids': LongTensor of length max_len}``."""
        history_len = self.max_len - 1
        # Copy into a fresh list so the stored history is never mutated and
        # array-like sequences are accepted as well.
        history = list(self._encoded[idx])
        # Keep only the most recent items (guarding the ``[-0:]`` pitfall
        # when max_len == 1).
        recent = history[-history_len:] if history_len > 0 else []
        # Left-pad with 0 up to the fixed history length.
        # NOTE(review): 0 is also a valid encoded item id when the encoder
        # numbers classes from 0 — confirm this padding convention matches
        # the one used when the checkpoint was trained.
        padded = [0] * (history_len - len(recent)) + recent
        mask = self.mask_token if self.mask_token is not None else num_items
        padded.append(mask)
        return {
            'user_id': self._user_ids[idx],
            'input_ids': torch.tensor(padded, dtype=torch.long)
        }

# Inference loader; DataLoader does not shuffle unless asked to, so batches
# follow the row order of test_sequences.
test_ds = TestDataset(test_sequences)
test_dl = DataLoader(test_ds, batch_size=32)

# map_location lets a checkpoint that was saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

predictions = []
with torch.no_grad():
    for batch in tqdm(test_dl):
        input_ids = batch['input_ids'].to(device)
        logits, _ = model(input_ids)
        # Scores at the final position, where TestDataset appended its extra token.
        last_logits = logits[:, -1, :]
        topk = torch.topk(last_logits, k=10, dim=-1).indices.cpu().numpy()
        # Decode the whole batch with a single encoder call instead of one call per user.
        top_items = item_encoder.inverse_transform(topk.ravel()).reshape(topk.shape)
        # The default collate function batches integer user ids into a tensor;
        # iterating it would put 0-d tensors into the DataFrame and the CSV
        # would contain "tensor(...)" instead of plain ids.
        user_ids = batch['user_id']
        if torch.is_tensor(user_ids):
            user_ids = user_ids.tolist()
        for u, tops in zip(user_ids, top_items):
            predictions.append([u] + tops.tolist())

# One row per user: the user id followed by its ten recommended item ids.
sub_df = pd.DataFrame(predictions, columns=['user_id'] + [str(i) for i in range(10)])
sub_df.to_csv(f"submission_seed{seed}.csv", index=False)
print("Файл с предсказаниями создан.")

 71%|███████   | 4380/6208 [01:21<00:39, 46.38it/s]