In [None]:
# Install the Hugging Face `datasets` package into the kernel's environment.
# NOTE(review): the version is not pinned, and `transformers`, `wandb`,
# `scikit-learn` and `tqdm` (imported by later cells) are assumed to be
# preinstalled — confirm for the target environment.
%pip install datasets



In [None]:
# Load the tokenizer and the DistilBERT encoder that later cells use as a
# frozen feature extractor.
import torch
from transformers import AutoTokenizer, AutoModel

# Single checkpoint id so the tokenizer and the model can never drift apart.
MODEL_NAME = "distilbert/distilbert-base-uncased"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Half precision only pays off on GPU; on CPU fp16 kernels have historically
# been slow or unavailable for some ops, so fall back to float32 there.
encoder_dtype = torch.float16 if device.type == "cuda" else torch.float32
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME, torch_dtype=encoder_dtype).to(device)

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from torch.utils.data import Dataset
import re
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the ratings table once (the original cell loaded and split it twice;
# only the second, n = 1000 split was ever used downstream).
dataset = load_dataset("ashraq/movielens_ratings", split="train")

# Keep only the first `n` rows: the pairwise dataset built from a split has
# len(split) ** 2 items, so `n` has to stay small.
n = 1000
n_train = int(0.8 * n)  # 80 % of the rows go to training, the rest to evaluation

indices = list(range(n))
train_indices = indices[:n_train]
eval_indices = indices[n_train:]

small_dataset = dataset.select(indices)
train_dataset = small_dataset.select(train_indices)
eval_dataset = small_dataset.select(eval_indices)


class MovieLensHFDataset(Dataset):
    """Pairwise ranking dataset over every ordered pair of rows of a ratings table.

    Item ``k`` corresponds to the row pair ``divmod(k, len(dataset))``, so the
    dataset has ``len(dataset) ** 2`` items, including pairs of a row with
    itself. Each item is ``(x1, x2, target)``: ``x1`` / ``x2`` are dicts with
    the tokenized prompts of the two rows and ``target`` is a scalar tensor
    equal to 1 when the first row's rating is greater than or equal to the
    second's, else 0.

    Args:
        dataset: Indexable, sized collection of rows; each row must provide
            the keys ``"user_id"``, ``"title"``, ``"genres"`` and ``"rating"``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=..., padding="max_length", return_tensors="pt")`` whose
            result supports ``.to(device)`` and ``.items()``.
        max_length: Length passed to the tokenizer for padding / truncation.

    Notes:
        * Tensors are moved to the module-level ``device`` inside
          ``__getitem__``. NOTE(review): this presumably requires
          ``num_workers=0`` in the DataLoader when ``device`` is CUDA — confirm.
        * NOTE(review): the two prompts start with different headers
          ("Movie 1" / "Movie 2"), so the position inside the pair is visible
          to the model — confirm this is intended.
    """

    def __init__(self, dataset, tokenizer, max_length=128):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    @property
    def pairs(self):
        """All ``(i, j)`` row-index pairs in item order.

        Built on demand (O(n^2) memory) and kept only for backward
        compatibility; ``__getitem__`` no longer needs the materialized list.
        """
        n_rows = len(self.dataset)
        return [(i, j) for i in range(n_rows) for j in range(n_rows)]

    def _encode(self, prompt):
        """Tokenize one prompt; return its tensors on ``device`` with axis 0 squeezed."""
        encoded = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        ).to(device)
        return {key: val.squeeze(0) for key, val in encoded.items()}

    def __getitem__(self, idx):
        """Return ``(x1, x2, target)`` for the ``idx``-th ordered row pair.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_rows = len(self.dataset)
        # Indexing a range normalizes negative indices, converts index-like
        # objects to int and raises IndexError when out of bounds — the same
        # contract the former list of pairs provided, without storing n^2 tuples.
        flat = range(n_rows * n_rows)[idx]
        idx1, idx2 = divmod(flat, n_rows)
        row1 = self.dataset[idx1]
        row2 = self.dataset[idx2]

        prompt1 = (
            "Movie 1\n"
            f"User ID: {row1['user_id']}\n"
            f"Movie title: {row1['title']}\n"
            f"Genres: {row1['genres']}\n"
        )
        prompt2 = (
            "Movie 2\n"
            f"User ID: {row2['user_id']}\n"
            f"Movie title: {row2['title']}\n"
            f"Genres: {row2['genres']}\n"
        )

        # Ties (and a row paired with itself) count as "first is at least as good".
        target = 1 if row1["rating"] >= row2["rating"] else 0

        return self._encode(prompt1), self._encode(prompt2), torch.tensor(target).to(device)

    def __len__(self):
        """Number of ordered row pairs, i.e. ``len(dataset) ** 2``."""
        return len(self.dataset) * len(self.dataset)


# Wrap both row splits into pairwise datasets that share the same tokenizer.
train, val = (
    MovieLensHFDataset(rows, tokenizer) for rows in (train_dataset, eval_dataset)
)

# Re-resolve the compute device and make sure the encoder lives on it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

# Language-modeling collator with masked-LM disabled.
# NOTE(review): no other cell visible here references `data_collator` — confirm it is still needed.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)




In [None]:
import wandb


# Open the Weights & Biases run in project "NLP project"; the trainers below
# send their metrics to it through wandb.log. No hyperparameters or run
# metadata are attached here.
wandb.init(project="NLP project", reinit=True)

In [None]:
import torch
import torch.nn as nn

class RankingModel(nn.Module):
    """MLP scoring head that maps a feature vector to ``num_classes`` raw scores.

    Layout: ``hidden_size -> 256 -> 128 -> num_classes``, with a ReLU and a
    default-probability ``nn.Dropout`` after each of the two hidden layers.
    The stack is stored as ``self.dense`` (an ``nn.Sequential``).
    """

    def __init__(self, hidden_size=768, num_classes=1):
        super().__init__()
        widths = (hidden_size, 256, 128)
        layers = []
        # One Linear -> ReLU -> Dropout group per consecutive pair of widths.
        for in_features, out_features in zip(widths, widths[1:]):
            layers += [nn.Linear(in_features, out_features), nn.ReLU(), nn.Dropout()]
        # Final projection to the requested number of outputs (no activation).
        layers.append(nn.Linear(widths[-1], num_classes))
        self.dense = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` (last axis of size ``hidden_size``) through the MLP stack."""
        scores = self.dense(x)
        return scores


# Scoring head with default sizes, placed on the module-level `device`.
rank_model = RankingModel().to(device)
# Pairwise margin ranking loss created with its default arguments.
# NOTE(review): per the PyTorch docs the default margin is 0 and the target is
# expected to be +1 / -1, while the dataset in this notebook emits 1 / 0 —
# confirm the label convention and consider a positive margin.
criterion = nn.MarginRankingLoss()

In [None]:
# A bare `raise` with no active exception fails with the RuntimeError shown in
# this cell's output.
# NOTE(review): presumably a deliberate barrier that stops "Run All" before the
# long training cells — confirm, and remove it before sharing the notebook.
raise

RuntimeError: No active exception to reraise

In [None]:
from torch.optim import Adam
from torch.utils.data import DataLoader
from torch.autograd import Variable
from transformers import Trainer, BertModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tqdm import tqdm


class RankingTrainer:
    """Train a scoring head on frozen encoder features with a pairwise ranking loss.

    For every pair ``(x1, x2)`` the encoder's first-token hidden state is fed to
    ``rank_model``; the two resulting scores and a signed target are passed to
    ``criterion``. Only ``rank_model`` is optimized; the encoder runs under
    ``torch.no_grad()``.

    Args:
        base_model: Encoder called as ``base_model(**x)`` whose output exposes
            ``last_hidden_state``.
        rank_model: Module with parameters that maps features to scores; column
            0 of its output is used as the score.
        criterion: Callable ``criterion(score1, score2, target)`` where target
            is +1 when the first item should rank higher and -1 otherwise
            (the ``nn.MarginRankingLoss`` convention).
        optimizer_cls: Optimizer class instantiated on ``rank_model.parameters()``.
        lr: Learning rate passed to ``optimizer_cls``.

    Labels coming from the dataset may be encoded as 1/0 or as +1/-1: any value
    greater than 0 is treated as "first item ranks higher".
    """

    def __init__(self, base_model, rank_model, criterion, optimizer_cls=Adam, lr=0.00001):
        self.base_model = base_model
        self.rank_model = rank_model
        self.criterion = criterion
        self.optimizer = optimizer_cls(self.rank_model.parameters(), lr=lr)

    def _score_pair(self, x1, x2):
        """Return the ranking scores ``(s1, s2)`` for one batch of input pairs."""
        # The scoring head defines the device and dtype everything is moved to.
        # Assumes the encoder lives on the same device as the head.
        ref = next(self.rank_model.parameters())
        x1 = {key: val.to(ref.device) for key, val in x1.items()}
        x2 = {key: val.to(ref.device) for key, val in x2.items()}
        with torch.no_grad():  # the encoder is a frozen feature extractor
            h1 = self.base_model(**x1).last_hidden_state[:, 0, :]
            h2 = self.base_model(**x2).last_hidden_state[:, 0, :]
        # The encoder may run in half precision; cast to the head's dtype.
        h1 = h1.to(device=ref.device, dtype=ref.dtype)
        h2 = h2.to(device=ref.device, dtype=ref.dtype)
        return self.rank_model(h1)[:, 0], self.rank_model(h2)[:, 0]

    @staticmethod
    def _signed_targets(y_true, like):
        """Map labels (> 0 means positive) to +1 / -1 with the dtype and device of ``like``."""
        positive = y_true.to(like.device) > 0
        ones = torch.ones_like(like)
        return torch.where(positive, ones, -ones)

    def train(self, train_dataset, validation_dataset, epochs, batch_size):
        """Run ``epochs`` training epochs, evaluating on the validation set after each.

        Logs ``epoch`` and ``train_loss`` to wandb after every epoch.
        """
        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)
        self.base_model.eval()
        for epoch in range(epochs):
            # evaluate() leaves the head in eval mode, so training mode has to
            # be restored at the start of every epoch, not once before the loop.
            self.rank_model.train()
            total_loss = 0.0
            for x1, x2, y_true in tqdm(train_dataloader):
                self.optimizer.zero_grad()
                y1, y2 = self._score_pair(x1, x2)
                # A 0 target would zero out the margin loss for that pair.
                target = self._signed_targets(y_true, y1)
                loss = self.criterion(y1, y2, target)
                loss.backward()
                self.optimizer.step()
                total_loss += loss.item()
            avg_loss = total_loss / max(len(train_dataloader), 1)
            print(f'Training loss at epoch {epoch}: {avg_loss}')
            wandb.log({
                "epoch": epoch,
                "train_loss": avg_loss,
            })
            self.evaluate(validation_dataloader)

    def evaluate(self, dataloader):
        """Compute and log accuracy, precision, recall, F1 and ROC-AUC on ``dataloader``.

        Predictions and labels are both expressed as 1 (first item ranks
        higher) / 0, so the classification metrics compare like with like.
        """
        self.rank_model.eval()
        pred_scores = []
        true_labels = []
        with torch.no_grad():
            for x1, x2, y_true in tqdm(dataloader):
                y1, y2 = self._score_pair(x1, x2)
                pred_scores.extend((y1 - y2).cpu().tolist())
                true_labels.extend((y_true > 0).long().cpu().tolist())

        pred_labels = [1 if score > 0 else 0 for score in pred_scores]

        accuracy = accuracy_score(true_labels, pred_labels)
        precision = precision_score(true_labels, pred_labels, zero_division=0)
        recall = recall_score(true_labels, pred_labels, zero_division=0)
        f1 = f1_score(true_labels, pred_labels, zero_division=0)
        try:
            roc_auc = roc_auc_score(true_labels, pred_scores)
        except ValueError:
            # ROC-AUC is undefined when only one class is present in the labels.
            roc_auc = float('nan')
        print(f'Evaluation Results - Accuracy: {accuracy}, Precision: {precision}, Recall:{recall}, F1: {f1}, ROC-AUC: {roc_auc}')

        wandb.log({
            "val_accuracy": accuracy,
            "val_precision": precision,
            "val_recall": recall,
            "val_f1": f1,
            "val_roc_auc": roc_auc
        })




# Fit the scoring head on the pairwise training set for three epochs,
# evaluating on the validation pairs after each one.
rank_trainer = RankingTrainer(base_model=model, rank_model=rank_model, criterion=criterion)
rank_trainer.train(train_dataset=train, validation_dataset=val, epochs=3, batch_size=256)

100%|██████████| 2500/2500 [25:02<00:00,  1.66it/s]


Training loss at epoch 0: 0.004201131435763091


100%|██████████| 157/157 [01:33<00:00,  1.68it/s]


Evaluation Results - Accuracy: 0.504875, Precision: 0.5839187345226664, Recall:0.49961082763988585, F1: 0.538484841423345, ROC-AUC: 0.5040365670824895


100%|██████████| 2500/2500 [24:51<00:00,  1.68it/s]


Training loss at epoch 1: 6.019460805691778e-06


100%|██████████| 157/157 [01:33<00:00,  1.68it/s]


Evaluation Results - Accuracy: 0.5204, Precision: 0.5913515016685206, Recall:0.5517166825218369, F1: 0.5708469419712765, ROC-AUC: 0.5161560087862863


100%|██████████| 2500/2500 [24:49<00:00,  1.68it/s]


Training loss at epoch 2: 2.2266721352934837e-07


100%|██████████| 157/157 [01:33<00:00,  1.69it/s]


Evaluation Results - Accuracy: 0.46895, Precision: 0.548631905007744, Recall:0.45952607454812766, F1: 0.5001411897590361, ROC-AUC: 0.45289448435551505


In [None]:
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tqdm import tqdm
import wandb  # Убедитесь, что wandb импортирован, если вы его используете

class CEPairwiseTrainer:
    """Train a scoring head with a pairwise logistic (binary cross-entropy) objective.

    For every pair ``(x1, x2)`` the encoder's first-token hidden state is scored
    by ``ranker``; the score difference ``s1 - s2`` is used as the logit of
    "the first item ranks at least as high" and compared with the 0/1 label via
    ``BCEWithLogitsLoss``. Only ``ranker`` is optimized; the encoder runs in
    eval mode under ``torch.no_grad()``.

    Args:
        encoder: Model called as ``encoder(**x)`` whose output exposes
            ``last_hidden_state``.
        ranker: Module mapping features to scores; column 0 of its output is
            used as the score.
        lr: Learning rate of the Adam optimizer.
        device: Device both models and every batch are moved to.
    """

    def __init__(self, encoder, ranker, lr=1e-5, device="cpu"):
        self.encoder = encoder.to(device)
        self.ranker = ranker.to(device)
        self.criterion = BCEWithLogitsLoss()
        self.optimizer = Adam(self.ranker.parameters(), lr=lr)
        self.device = device

    def _pair_logits(self, x1, x2):
        """Return ``s1 - s2`` for one batch of tokenized input pairs."""
        x1 = {k: v.to(self.device) for k, v in x1.items()}
        x2 = {k: v.to(self.device) for k, v in x2.items()}
        with torch.no_grad():  # the encoder is a frozen feature extractor
            h1 = self.encoder(**x1).last_hidden_state[:, 0, :]  # first-token ([CLS]) state
            h2 = self.encoder(**x2).last_hidden_state[:, 0, :]
        # The encoder may run in a different precision than the head.
        ref = next(self.ranker.parameters())
        h1 = h1.to(dtype=ref.dtype, device=ref.device)
        h2 = h2.to(dtype=ref.dtype, device=ref.device)
        return self.ranker(h1)[:, 0] - self.ranker(h2)[:, 0]

    def train(self, train_dataset, val_dataset, epochs=1, batch_size=32):
        """Run ``epochs`` training epochs, evaluating on ``val_dataset`` after each.

        Logs ``epoch`` and ``train_loss`` to wandb after every epoch.
        """
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        self.encoder.eval()
        for epoch in range(epochs):
            # evaluate() switches the head to eval mode, so training mode must
            # be restored every epoch; otherwise dropout is off from epoch 2 on.
            self.ranker.train()
            total_loss = 0.0
            for x1, x2, y_true in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
                self.optimizer.zero_grad()
                y_true = y_true.to(self.device).float()  # (batch,) 0.0 or 1.0
                logits = self._pair_logits(x1, x2)
                loss = self.criterion(logits, y_true)
                loss.backward()
                self.optimizer.step()
                total_loss += loss.item()
            avg_loss = total_loss / max(len(train_loader), 1)
            print(f'[Epoch {epoch+1}] train loss: {avg_loss:.4f}')
            wandb.log({
                "epoch": epoch + 1,
                "train_loss": avg_loss,
            })
            self.evaluate(val_loader)

    def evaluate(self, data_loader):
        """Compute loss and classification metrics over ``data_loader`` and log them to wandb."""
        self.encoder.eval()
        self.ranker.eval()
        all_scores = []
        all_labels = []
        all_preds = []  # predictions accumulated over every batch
        total_loss = 0.0
        with torch.no_grad():
            for x1, x2, y_true in tqdm(data_loader, desc="Evaluating"):
                y_true = y_true.to(self.device).float()  # (batch,)
                logits = self._pair_logits(x1, x2)

                total_loss += self.criterion(logits, y_true).item()
                probs = torch.sigmoid(logits)

                all_scores.extend(probs.cpu().tolist())
                all_labels.extend(y_true.long().cpu().tolist())
                all_preds.extend((probs > 0.5).long().cpu().tolist())

        # Metrics use the predictions of all batches, not only the last one.
        accuracy = accuracy_score(all_labels, all_preds)
        precision = precision_score(all_labels, all_preds, zero_division=0, average='binary', pos_label=1)
        recall = recall_score(all_labels, all_preds, zero_division=0, average='binary', pos_label=1)
        f1 = f1_score(all_labels, all_preds, zero_division=0, average='binary', pos_label=1)
        try:
            roc_auc = roc_auc_score(all_labels, all_scores)
        except ValueError:
            # ROC-AUC is undefined when the labels contain a single class.
            roc_auc = float('nan')
        avg_loss = total_loss / max(len(data_loader), 1)
        print(f'Val: Loss={avg_loss:.4f} Acc={accuracy:.4f} Prec={precision:.4f} Rec={recall:.4f} F1={f1:.4f} ROC_AUC={roc_auc:.4f}')

        # Requires an active wandb run.
        wandb.log({
            "val_loss": avg_loss,
            "val_accuracy": accuracy,
            "val_precision": precision,
            "val_recall": recall,
            "val_f1": f1,
            "val_roc_auc": roc_auc
        })

# Example run of the pairwise cross-entropy trainer.
# Use the notebook's resolved `device` rather than a hard-coded "cuda", so the
# cell also runs on CPU-only machines.
# NOTE(review): `rank_model` is the same head instance the RankingTrainer cell
# above may already have trained — re-create it first if a fresh start is wanted.
trainer = CEPairwiseTrainer(model, rank_model, lr=1e-5, device=device)
trainer.train(train, val, epochs=3, batch_size=256)


Epoch 1: 100%|██████████| 2500/2500 [25:23<00:00,  1.64it/s]


[Epoch 1] train loss: 0.6888


Evaluating: 100%|██████████| 157/157 [01:35<00:00,  1.64it/s]


Val: Loss=0.6734 Acc=0.6175 Prec=0.6458 Rec=0.7494 F1=0.6938 ROC_AUC=0.6412


Epoch 2: 100%|██████████| 2500/2500 [25:29<00:00,  1.63it/s]


[Epoch 2] train loss: 0.6511


Evaluating: 100%|██████████| 157/157 [01:34<00:00,  1.66it/s]


Val: Loss=0.6365 Acc=0.6451 Prec=0.6576 Rec=0.8055 F1=0.7241 ROC_AUC=0.6778


Epoch 3: 100%|██████████| 2500/2500 [25:23<00:00,  1.64it/s]


[Epoch 3] train loss: 0.6000


Evaluating: 100%|██████████| 157/157 [01:34<00:00,  1.67it/s]


Val: Loss=0.6564 Acc=0.6258 Prec=0.6505 Rec=0.7622 F1=0.7020 ROC_AUC=0.6472


In [None]:
# Sanity check: print the label of the first validation pair. Iterating the
# dataset directly yields un-batched (x1, x2, target) items; `break` stops
# after the first one, so the progress bar stays at 0.
for x1, x2, y_true in tqdm(val):
    print(y_true)
    break

  0%|          | 0/40000 [00:00<?, ?it/s]

tensor(1, device='cuda:0')



