In [2]:
# -*- coding: utf-8 -*-
"""baseline_qwen.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1dvfywmCg9ng6d4tl9QvsRQuUhaiV489o
"""

import os
import time
import json
import pickle
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.amp import autocast, GradScaler
from tqdm import tqdm  # ❌ Не используем auto для избежания виджетов

#from google.colab import drive
#drive.mount('/content/drive')

#DATA_DIR = "/content/drive/MyDrive/AvitoTechML25/Data"
#CACHE_DIR = "/content/drive/MyDrive/AvitoTechML25/cache"
# Input parquet files are read from DATA_DIR; every cache artefact produced by
# this script (checkpoint, item embeddings, partial predictions) goes to CACHE_DIR.
DATA_DIR = "Data"
CACHE_DIR = "cache2"

os.makedirs(CACHE_DIR, exist_ok=True)

PREDICTIONS_FILE = os.path.join(CACHE_DIR, "predictions.csv")
ITEM_EMBEDDINGS_FILE = os.path.join(CACHE_DIR, "item_embeddings.pkl")
BEST_MODEL_PATH = os.path.join(CACHE_DIR, "best_model.pth")

# ==============================
# Main parameters
# ==============================
# IS_DEBUG: train on a sub-sample of the (user, node) pairs.
IS_DEBUG = True
# Despite the name this is a fraction in [0, 1], not a percentage.
DEBUG_SAMPLE_PERCENT = 0.1
# Number of epochs without improvement tolerated before early stopping.
PATIENCE = 3

# Synchronous CUDA kernel launches and autograd anomaly detection make errors
# easier to localise but slow every step down, so they are only switched on
# for debug runs instead of unconditionally.
if IS_DEBUG:
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
torch.autograd.set_detect_anomaly(IS_DEBUG)
# ==============================
# 🕒 Логирование с временем
# ==============================
import builtins


def tprint(*args, **kwargs):
    """Print like the built-in ``print`` but with a ``[YYYY-MM-DD HH:MM:SS]`` prefix.

    The timestamp is passed as an extra leading positional argument, so any
    ``sep`` given by the caller is also inserted between it and the first value.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    prefix = "[" + stamp + "]"
    builtins.print(prefix, *args, **kwargs)


# From here on every module-level ``print`` call is timestamped.
print = tprint

# ==============================
# 🧠 1. Загрузка данных
# ==============================
import pyarrow.parquet as pq
import os

def load_data(fraction=0.1):
    """Load the leading ``fraction`` of rows of every input parquet table.

    Args:
        fraction: Share of rows (0..1) kept from the *start* of each table.

    Returns:
        dict mapping 'clickstream', 'cat_features', 'events' and 'test_users'
        to pandas DataFrames read from ``DATA_DIR/<name>.pq``.

    NOTE(review): the same head-slice is applied to every table, including
    'events' and 'test_users'. If 'events' is a lookup table of event types,
    truncating it drops contact events and changes the targets - confirm
    that this is intended.
    """
    # The message reports the fraction actually used instead of a fixed "10%".
    print(f"Загрузка {fraction:.0%} данных...")

    def load_fraction(path, fraction):
        # The whole file is read first; only then the first rows are kept.
        table = pq.read_table(path)
        num_rows = int(table.num_rows * fraction)
        return table.slice(0, num_rows).to_pandas()

    table_names = ('clickstream', 'cat_features', 'events', 'test_users')
    return {
        name: load_fraction(os.path.join(DATA_DIR, f"{name}.pq"), fraction)
        for name in table_names
    }

# Wall-clock start of the run; not read again in the visible part of the script.
t_start = time.time()
# Load all input tables with the default fraction of load_data().
data = load_data()

# ==============================
# 🔍 2. Подготовка пар (user, node)
# ==============================
def prepare_pairs(clickstream, events):
    """Aggregate the clickstream into one row per (cookie, node) pair.

    Args:
        clickstream: DataFrame with at least 'cookie', 'node' and 'event' columns.
        events: DataFrame with 'event' and 'is_contact' columns; events whose
            'is_contact' equals 1 count as contacts.

    Returns:
        DataFrame with columns 'cookie', 'node', 'is_contact' (number of
        contact events of the pair) and 'target' (1 if there was at least one
        contact, else 0).

    The input ``clickstream`` is left untouched: the contact flag is attached
    to a two-column working frame instead of being written into the caller's
    DataFrame.
    """
    print("Формирование пар (user, node)...")
    contact_events = events.loc[events['is_contact'] == 1, 'event'].unique()
    contact_flag = clickstream['event'].isin(contact_events).astype(int)

    pairs = clickstream[['cookie', 'node']].assign(is_contact=contact_flag)
    grouped = pairs.groupby(['cookie', 'node'], as_index=False)['is_contact'].sum()
    grouped['target'] = (grouped['is_contact'] > 0).astype(int)
    return grouped

# One row per (cookie, node) pair with the contact count and binary target.
grouped = prepare_pairs(data['clickstream'], data['events'])

# ==============================
# 🧪 Отладочный режим: выборка из N%
# ==============================
def sample_data(grouped, fraction=DEBUG_SAMPLE_PERCENT):
    """Return the rows of every user that appears in a random row sample.

    A ``fraction`` share of rows is drawn with a fixed seed (42); then *all*
    rows of every cookie present in that draw are kept, so the returned share
    of rows can be larger than ``fraction``.
    """
    print(f"Отладка: оставляется {int(fraction * 100)}% данных...")
    sampled_rows = grouped.sample(frac=fraction, random_state=42)
    kept_cookies = sampled_rows['cookie'].unique()
    keep_mask = grouped['cookie'].isin(kept_cookies)
    return grouped[keep_mask]

# ==============================
# 🔢 Кодирование пользователей и товаров
# ==============================
def encode_user_item(grouped):
    """Add dense integer ids for users ('user_id') and nodes ('item_id').

    Args:
        grouped: DataFrame with 'cookie' and 'node' columns.

    Returns:
        Tuple ``(encoded, num_users, num_items, le_user, le_item)`` where
        ``encoded`` is a copy of ``grouped`` with the two id columns added and
        the fitted encoders map raw cookies / nodes to those ids.

    The function works on a copy: ``grouped`` may be a filtered slice of
    another frame (see ``sample_data``), and writing columns into such a slice
    triggers pandas' SettingWithCopyWarning and mutates the caller's object.
    """
    print("Кодирование пользователей и товаров...")
    grouped = grouped.copy()

    le_user = LabelEncoder()
    le_item = LabelEncoder()

    # Replace possible NaN keys with a placeholder.
    # NOTE(review): for numeric columns this mixes str and numbers, which the
    # encoder may reject - confirm that NaN keys cannot reach this point.
    grouped['cookie'] = grouped['cookie'].fillna('unknown')
    grouped['node'] = grouped['node'].fillna('unknown')

    grouped['user_id'] = le_user.fit_transform(grouped['cookie'])
    grouped['item_id'] = le_item.fit_transform(grouped['node'])

    # Every fitted class occurs in the data, so this equals the number of
    # distinct ids assigned above.
    num_users = len(le_user.classes_)
    num_items = len(le_item.classes_)

    return grouped, num_users, num_items, le_user, le_item

# In debug mode shrink the pair table before fitting the encoders, so the id
# spaces only cover the sampled users and the nodes they touched.
if IS_DEBUG:
  grouped = sample_data(grouped)

grouped, num_users, num_items, le_user, le_item = encode_user_item(grouped)

# Sanity output: the maximal ids should be num_users - 1 and num_items - 1.
print(f"num_users: {num_users}, num_items: {num_items}")
print(f"Максимальный user_id: {grouped['user_id'].max()}, Максимальный item_id: {grouped['item_id'].max()}")

# ==============================
# 🧪 Разделение на train/val
# ==============================
def create_train_val_split(grouped, val_size=0.2, random_state=42):
    """Split the pair table into train and validation parts by user.

    The distinct 'user_id' values are split with ``train_test_split``; every
    row follows its user, so no user contributes rows to both parts.

    Returns:
        Tuple ``(train_rows, val_rows)`` of DataFrames.
    """
    print("Разделение на train/val...")
    unique_users = grouped['user_id'].unique()
    train_users, val_users = train_test_split(
        unique_users, test_size=val_size, random_state=random_state
    )
    user_ids = grouped['user_id']
    train_rows = grouped[user_ids.isin(train_users)]
    val_rows = grouped[user_ids.isin(val_users)]
    return train_rows, val_rows

# ==============================
# 🧮 Two-Tower модель
# ==============================
class TwoTower(nn.Module):
    """Two-tower scorer: embedding + MLP per side, dot product as the score.

    Each embedding table has ``num_* + 2`` rows, i.e. two rows more than the
    number of ids passed in. Each tower is
    Linear(embed_dim, 512) -> ReLU -> LayerNorm(512) -> Linear(512, embed_dim).
    """

    def __init__(self, num_users, num_items, embed_dim=256):
        super().__init__()
        hidden_dim = 512

        def build_tower():
            # Both sides share the architecture but not the weights.
            return nn.Sequential(
                nn.Linear(embed_dim, hidden_dim),
                nn.ReLU(),
                nn.LayerNorm(hidden_dim),
                nn.Linear(hidden_dim, embed_dim),
            )

        self.user_emb = nn.Embedding(num_users + 2, embed_dim)
        self.item_emb = nn.Embedding(num_items + 2, embed_dim)
        self.user_tower = build_tower()
        self.item_tower = build_tower()

    def get_user_vector(self, users):
        """Return the user-tower output for a tensor of user ids."""
        embedded = self.user_emb(users)
        return self.user_tower(embedded)

    def get_item_vector(self, items):
        """Return the item-tower output for a tensor of item ids."""
        embedded = self.item_emb(items)
        return self.item_tower(embedded)

    def forward(self, users, items):
        """Return one logit per (user, item) pair: the dot product of both towers."""
        user_vec = self.get_user_vector(users)
        item_vec = self.get_item_vector(items)
        return (user_vec * item_vec).sum(dim=-1)

# ==============================
# 🏋️‍♂️ 3. Обучение модели с валидацией
# ==============================
def train_model_with_validation(grouped, num_users, num_items):
    """Train a TwoTower model with early stopping, or load a cached checkpoint.

    Args:
        grouped: DataFrame with 'user_id', 'item_id' and 'target' columns.
        num_users: Number of distinct user ids (ids must be < num_users).
        num_items: Number of distinct item ids (ids must be < num_items).

    Returns:
        Tuple ``(model, device)``. After training the returned model carries
        the weights of the best checkpoint, not of the last epoch.

    Raises:
        ValueError: if the training split contains an id outside the range
            given by ``num_users`` / ``num_items``.

    If ``BEST_MODEL_PATH`` exists the checkpoint is loaded and no training
    happens. NOTE(review): the cache is not tied to ``num_users`` /
    ``num_items``; a checkpoint from a run with different id spaces fails in
    ``load_state_dict`` - delete the cache when the data changes.
    """
    num_epochs = 5
    batch_size = 2048

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Используется устройство: {device}")

    model = TwoTower(num_users, num_items).to(device)

    # Reuse a previously trained model when available. map_location lets a
    # checkpoint written on GPU be loaded on a CPU-only machine.
    if os.path.exists(BEST_MODEL_PATH):
        print("Загрузка обученной модели из кэша...")
        model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=device))
        return model, device

    optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)
    criterion = nn.BCEWithLogitsLoss()

    # Data preparation
    train_data, val_data = create_train_val_split(grouped)
    X_train = torch.tensor(train_data[['user_id', 'item_id']].values, dtype=torch.long)
    y_train = torch.tensor(train_data['target'].values, dtype=torch.float)
    X_val = torch.tensor(val_data[['user_id', 'item_id']].values, dtype=torch.long)
    y_val = torch.tensor(val_data['target'].values, dtype=torch.float)

    # Validate the id range once on the CPU tensors instead of forcing a
    # device synchronisation on every training batch.
    if X_train.numel() > 0 and (
        X_train[:, 0].max().item() >= num_users
        or X_train[:, 1].max().item() >= num_items
    ):
        raise ValueError("⚠️ Найдены user/item_id вне диапазона!")

    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=False)

    # Gradient scaling is only meaningful for CUDA mixed precision.
    scaler = GradScaler(device=device.type, enabled=(device.type == "cuda"))

    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        with tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}") as pbar:
            for x_batch, y_batch in pbar:
                users = x_batch[:, 0].to(device)
                items = x_batch[:, 1].to(device)
                y_batch = y_batch.to(device)

                optimizer.zero_grad(set_to_none=True)
                with autocast(device_type=device.type):
                    logits = model(users, items)
                    loss = criterion(logits, y_batch)

                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()

                batch_loss = loss.item()
                total_loss += batch_loss
                pbar.set_postfix(loss=batch_loss)

        avg_loss = total_loss / max(len(train_loader), 1)
        print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f}")

        # --- Validation ---
        model.eval()
        val_loss_sum = 0.0
        all_preds = []
        all_true = []

        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                users = x_batch[:, 0].to(device)
                items = x_batch[:, 1].to(device)
                scores = model(users, items).float()
                # Weight by batch size so the last (smaller) batch is not over-counted.
                val_loss_sum += criterion(scores, y_batch.to(device)).item() * len(y_batch)
                preds = (torch.sigmoid(scores) > 0.5).long()
                all_preds.extend(preds.cpu().tolist())
                all_true.extend(y_batch.long().tolist())

        val_loss = val_loss_sum / max(len(val_loader.dataset), 1)
        # This is plain binary recall at a 0.5 threshold, not a ranking Recall@40.
        val_recall = recall_score(all_true, all_preds, average='binary')
        print(f"Epoch {epoch+1} | Val Loss: {val_loss:.4f} | Val Recall: {val_recall:.4f}")

        # --- Early stopping on the validation loss ---
        # Training loss keeps falling while the model overfits, so it cannot
        # tell which epoch generalises best.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), BEST_MODEL_PATH)
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= PATIENCE:
            print("Early stopping triggered")
            break

    # Hand back the best epoch rather than whatever the last epoch produced.
    if os.path.exists(BEST_MODEL_PATH):
        model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=device))

    print("Модель обучена и сохранена.")
    return model, device

# Train the model, or load it from BEST_MODEL_PATH when a checkpoint exists.
model, device = train_model_with_validation(grouped, num_users, num_items)

# ==============================
# 👀 build_seen_nodes — сбор информации о просмотренных нодах
# ==============================
def build_seen_nodes(clickstream):
    """Collect, per cookie, the set of nodes that occur in the clickstream.

    Args:
        clickstream: DataFrame with 'cookie' and 'node' columns.

    Returns:
        defaultdict(set) mapping each cookie (original value, not stringified)
        to the set of its nodes converted to ``str``. Rows where cookie or
        node is missing are ignored.

    Only the distinct (cookie, node) pairs are walked in Python; iterating the
    full clickstream row by row with ``iterrows`` is orders of magnitude slower.
    """
    seen = defaultdict(set)
    pairs = clickstream[['cookie', 'node']].dropna().drop_duplicates()
    cookies = pairs['cookie'].tolist()
    # Nodes are stored as strings so they compare equal to stringified node lists.
    nodes = pairs['node'].astype(str).tolist()
    for cookie, node in zip(cookies, nodes):
        seen[cookie].add(node)
    return seen
# cookie -> set of node strings the user already interacted with.
seen_dict = build_seen_nodes(data['clickstream'])

# ==============================
# 🔄 Рекомендация с кэшированием
# ==============================
def recommend_for_users_resumable(model, device, test_users, all_nodes, le_user, seen_dict, top_k=40):
    """Score all nodes for the test users and keep the ``top_k`` unseen ones.

    Args:
        model: Model exposing ``get_item_vector`` / ``get_user_vector``.
        device: Torch device the model lives on.
        test_users: DataFrame with a 'cookie' column.
        all_nodes: Sequence of nodes. Position ``j`` is fed to the model as
            item id ``j``, so the sequence MUST be ordered exactly like the
            item ids used in training (e.g. the item encoder's ``classes_``);
            any other ordering attaches scores to the wrong nodes.
        le_user: Fitted user encoder; cookies absent from ``classes_`` are skipped.
        seen_dict: Mapping cookie -> set of node strings to exclude. Keys may
            be raw cookie values or their string form.
        top_k: Maximum number of recommendations per user.

    Returns:
        DataFrame with columns 'node', 'cookie', 'score' that contains both
        the rows restored from ``PREDICTIONS_FILE`` and the newly computed ones.

    Side effects:
        Reads/writes ``ITEM_EMBEDDINGS_FILE`` (pickle) and ``PREDICTIONS_FILE``
        (csv). Users already present in ``PREDICTIONS_FILE`` are not scored again.

    NOTE(review): both caches are only checked superficially (row count for
    the embeddings, cookie presence for predictions); after retraining the
    model they must be deleted by hand.
    """
    columns = ['node', 'cookie', 'score']
    model.eval()
    try:
        all_nodes = [str(node) for node in all_nodes]
        num_nodes = len(all_nodes)

        # --- Item embeddings ---
        item_embeddings = None
        if os.path.exists(ITEM_EMBEDDINGS_FILE):
            print("Загрузка item эмбеддингов из кэша...")
            # SECURITY: pickle.load executes arbitrary code; acceptable only
            # because this file is a cache written by this very script.
            with open(ITEM_EMBEDDINGS_FILE, 'rb') as f:
                item_embeddings = pickle.load(f)
            if len(item_embeddings) != num_nodes:
                # A cache built for another node list would mis-index all_nodes.
                print("Кэш item эмбеддингов не соответствует списку нод, пересчёт...")
                item_embeddings = None

        if item_embeddings is None:
            print("Вычисление item эмбеддингов...")
            item_ids = torch.arange(num_nodes, device=device)
            with torch.no_grad():
                item_embeddings = model.get_item_vector(item_ids).cpu().numpy()
            with open(ITEM_EMBEDDINGS_FILE, 'wb') as f:
                pickle.dump(item_embeddings, f)

        # --- Test users ---
        # One dict lookup per cookie instead of scanning classes_ and calling
        # transform() for every row; the position in classes_ is the encoded id.
        user_index = {cookie: idx for idx, cookie in enumerate(le_user.classes_.tolist())}
        valid_cookies = []  # string form, used in the output and for resuming
        raw_cookies = []    # original values, used to look up seen_dict
        encoded_ids = []
        for cookie in test_users['cookie'].tolist():
            encoded_id = user_index.get(cookie)
            if encoded_id is not None:
                valid_cookies.append(str(cookie))
                raw_cookies.append(cookie)
                encoded_ids.append(encoded_id)

        processed_cookies = set()
        predictions = []

        # --- Resume from a previous run ---
        if os.path.exists(PREDICTIONS_FILE):
            try:
                df_prev = pd.read_csv(PREDICTIONS_FILE)
                # Keep ids as strings like freshly computed rows; going through
                # .values on mixed numeric columns would turn them into floats.
                restored = df_prev[columns].astype({'node': str, 'cookie': str})
                predictions = restored.values.tolist()
                processed_cookies.update(restored['cookie'].unique())
                print(f"Загружено {len(df_prev)} записей из предыдущего запуска.")
            except Exception as e:
                # Best effort: an unreadable cache means starting from scratch.
                print(f"Ошибка загрузки предыдущих предсказаний: {e}")

        remaining = [
            (cookie, raw, enc)
            for cookie, raw, enc in zip(valid_cookies, raw_cookies, encoded_ids)
            if cookie not in processed_cookies
        ]
        print(f"Осталось обработать: {len(remaining)} пользователей")

        # node string -> positions in all_nodes, to turn seen nodes into indices.
        node_positions = defaultdict(list)
        for j, node in enumerate(all_nodes):
            node_positions[node].append(j)

        def save_checkpoint():
            pd.DataFrame(predictions, columns=columns).to_csv(PREDICTIONS_FILE, index=False)

        # NOTE(review): the score matrix is BATCH_SIZE x len(all_nodes) floats;
        # lower BATCH_SIZE if this does not fit into memory.
        BATCH_SIZE = 8192
        start = time.time()

        for i in range(0, len(remaining), BATCH_SIZE):
            batch = remaining[i:i + BATCH_SIZE]
            cookies_batch, raw_batch, encoded_batch = zip(*batch)
            user_tensor = torch.tensor(encoded_batch, dtype=torch.long, device=device)

            with torch.no_grad():
                user_vectors = model.get_user_vector(user_tensor).cpu().numpy()

            scores = user_vectors @ item_embeddings.T

            for row, (cookie, raw_cookie) in enumerate(zip(cookies_batch, raw_batch)):
                # seen_dict is usually keyed by the raw cookie value; the
                # string form is tried as a fallback.
                seen_nodes = seen_dict.get(raw_cookie)
                if seen_nodes is None:
                    seen_nodes = seen_dict.get(cookie, ())
                seen_idx = {j for node in seen_nodes for j in node_positions.get(node, ())}

                # The first top_k + len(seen) ranked items always contain the
                # top_k unseen ones, so only that prefix is filtered in Python.
                order = np.argsort(-scores[row])
                head = order[:top_k + len(seen_idx)].tolist()
                kept = [j for j in head if j not in seen_idx][:top_k]
                for j in kept:
                    # Output format: node, cookie, score
                    predictions.append([all_nodes[j], str(cookie), float(scores[row, j])])
                processed_cookies.add(cookie)

            if (i // BATCH_SIZE) % 10 == 0:
                save_checkpoint()
                elapsed = time.time() - start
                print(f"[{i + len(batch):>6}/{len(remaining)}] сохранено... [Время: {elapsed:.2f} сек.]")

        # Persist the tail as well: batches after the last periodic checkpoint
        # would otherwise be recomputed on the next run.
        if remaining:
            save_checkpoint()

        print("Инференс завершён.")
        return pd.DataFrame(predictions, columns=columns)

    except Exception as e:
        print(f"Ошибка рекомендации: {e}")
        raise
        
# The recommender feeds position j of `all_nodes` to the model as item id j.
# The model was trained on ids produced by `le_item`, so the candidate list
# must be the encoder's classes; the node list of `cat_features` has a
# different order and content and would attach scores to the wrong nodes.
# NOTE(review): delete cached item embeddings / predictions produced with the
# old node list before re-running.
submission = recommend_for_users_resumable(
    model=model,
    device=device,
    test_users=data['test_users'],
    all_nodes=le_item.classes_,
    le_user=le_user,
    seen_dict=seen_dict,
    top_k=40,
)

# Ids travel through the recommender as strings; convert them back to integers.
# NOTE(review): assumes node and cookie ids are integer-valued - confirm.
submission['node'] = submission['node'].astype(int)
submission['cookie'] = submission['cookie'].astype(int)

# ==============================
# 💾 Сохранение результата
# ==============================
def save_submission(df, path="submission.csv"):
    """Write the 'node', 'cookie', 'score' columns of ``df`` to ``path`` as CSV.

    Columns are written in exactly that order, without the index. Any failure
    is logged and re-raised to the caller.
    """
    column_order = ['node', 'cookie', 'score']
    try:
        # Select explicitly so the column order in the file is fixed.
        ordered = df[column_order]
        ordered.to_csv(path, index=False)
        print(f"Результат сохранён в {path}")
    except Exception as e:
        print(f"Ошибка сохранения результатов: {e}")
        raise

# Write the final recommendations to the default path "submission.csv".
save_submission(submission)

[2025-05-01 23:42:04] Загрузка 10% данных...
[2025-05-01 23:42:10] Формирование пар (user, node)...
[2025-05-01 23:42:11] Отладка: оставляется 10% данных...
[2025-05-01 23:42:11] Кодирование пользователей и товаров...
[2025-05-01 23:42:12] num_users: 81549, num_items: 201806
[2025-05-01 23:42:12] Максимальный user_id: 81548, Максимальный item_id: 201805
[2025-05-01 23:42:12] Используется устройство: cuda
[2025-05-01 23:42:12] Загрузка обученной модели из кэша...
[2025-05-01 23:44:19] Загрузка item эмбеддингов из кэша...
[2025-05-01 23:44:21] Загружено 217200 записей из предыдущего запуска.
[2025-05-01 23:44:21] Осталось обработать: 0 пользователей
[2025-05-01 23:44:21] Инференс завершён.
[2025-05-01 23:44:21] Результат сохранён в submission.csv
