# Обучение модели для определения фейковых фактов о COVID и вакцинации

In [40]:
import math

import torch
import pandas as pd
import numpy as np

In [41]:
# Pick the compute backend: CUDA wins over Apple MPS, and CPU is the fallback.
if torch.cuda.is_available():
    DEVICE = 'cuda'
elif torch.backends.mps.is_available():
    DEVICE = 'mps'
else:
    DEVICE = 'cpu'

In [42]:
from pathlib import Path

# Working directory layout: the download cache and the saved models live under DATA_PATH.
DATA_PATH = Path('data/')
DATA_CACHE = DATA_PATH / 'cache_dir'
DATA_PATH_SAVE_MODELS = DATA_PATH / 'models'

# Create every directory up front so later cells can write into them directly.
DATA_PATH.mkdir(parents=True, exist_ok=True)
DATA_CACHE.mkdir(parents=True, exist_ok=True)
DATA_PATH_SAVE_MODELS.mkdir(parents=True, exist_ok=True)

# Show up to 500 characters per cell when DataFrames are displayed.
pd.set_option('display.max_colwidth', 500)

In [43]:
# Checkpoint name passed to the tokenizer and model `from_pretrained` calls.
MODEL_NAME: str = "roberta-base"
# Excel file, resolved relative to DATA_PATH, that holds the labelled dataset.
TRAIN_DF_NAME: str = "covid_vaccine_fake_clear.xlsx"
# Token length every example is padded/truncated to when the loaders are built.
MAX_LENGTH: int = 128
# Batch size shared by the train, validation and test loaders.
BATCH_SIZE: int = 128

# Датасет

In [None]:
# Load the labelled dataset from the Excel file under DATA_PATH and preview its first row.
data_df = pd.read_excel(DATA_PATH / TRAIN_DF_NAME)
data_df.head(1)

In [None]:
import pandas as pd

def balance_label_idx(data_df: pd.DataFrame, target_label: int = 2, target_count: int = 30000, random_state: int = 42) -> pd.DataFrame:
    """
    Downsample a single over-represented class and leave every other class untouched.

    Rows whose ``label_idx`` equals ``target_label`` are randomly reduced to at most
    ``target_count`` rows. If the class already has ``target_count`` rows or fewer
    (including none at all), all of its rows are kept instead of raising.

    :param data_df: Source DataFrame; must contain a ``label_idx`` column.
    :param target_label: ``label_idx`` value to downsample (default 2).
    :param target_count: Maximum number of rows to keep for ``target_label`` (default 30 000).
    :param random_state: Seed used for both the sampling and the final shuffle.
    :return: New shuffled DataFrame with a fresh 0..n-1 index.
    :raises ValueError: If ``target_count`` is negative.
    """
    if target_count < 0:
        raise ValueError("target_count must be non-negative")

    is_target = data_df["label_idx"] == target_label
    frames = [data_df[~is_target]]

    target_rows = data_df[is_target]
    # DataFrame.sample(n=...) raises when n exceeds the number of available rows,
    # so never ask for more than the class actually contains.
    keep = min(target_count, len(target_rows))
    if keep > 0:
        frames.append(target_rows.sample(n=keep, random_state=random_state))

    balanced_df = pd.concat(frames)

    # Shuffle so the downsampled class is not clustered at the end, then renumber rows.
    return balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

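# NOTE(review): `label_idx` is (re)built from `label_str` a few cells below; unless the
# spreadsheet already ships that column, enable this call only after that cell has run.
# data_df = balance_label_idx(data_df)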

In [45]:
# Replace missing values with empty strings so every cell can be treated as text.
data_df = data_df.fillna("")

# Cast all object/bool columns to plain strings with a single astype mapping.
data_df = data_df.astype(
    {column: str for column in data_df.select_dtypes(include=["object", "bool"]).columns}
)

In [46]:
# Give every distinct label string an integer id, numbered in order of first appearance.
label2idx = {label: position for position, label in enumerate(data_df['label_str'].unique())}

# Reverse lookup: integer id -> label string.
idx2label = {position: label for label, position in label2idx.items()}

# Encode the label column with the ids assigned above.
data_df['label_idx'] = data_df['label_str'].map(label2idx)

In [None]:
# Show the integer id -> label string mapping as the cell output.
idx2label

In [None]:
# Number of target classes, taken from the size of the label mapping and shown as the cell output.
NUM_CLASSES = len(idx2label)
NUM_CLASSES

In [None]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 10% of all rows for testing, keeping the class proportions.
train_val_df, test_df = train_test_split(
    data_df,
    test_size=0.1,
    random_state=42,
    shuffle=True,
    stratify=data_df["label_idx"],
)

# Stage 2: carve 10% of the remaining rows off for validation, again stratified.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.1,
    random_state=42,
    shuffle=True,
    stratify=train_val_df["label_idx"],
)

print(f"Размер тренировочного набора: {len(train_df)}")
print(f"Размер валидационного набора: {len(val_df)}")
print(f"Размер тестового набора: {len(test_df)}")

In [50]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import PreTrainedTokenizer
import pandas as pd
from typing import Dict, Tuple


class TokenizedDataset(Dataset):
    """
    In-memory dataset that tokenizes a whole DataFrame once, at construction time.

    Every text is padded/truncated to ``max_length``, so all items share one shape.
    """

    def __init__(
        self,
        dataframe: pd.DataFrame,
        tokenizer: PreTrainedTokenizer,
        max_length: int,
        tensor_dtype: Tuple[torch.dtype, torch.dtype, torch.dtype] = (torch.long, torch.long, torch.long)
    ):
        """
        Tokenize the texts and store the resulting tensors.

        Args:
            dataframe (pd.DataFrame): Frame with a "text" column and a "label_idx" column.
            tokenizer (PreTrainedTokenizer): Tokenizer applied to the "text" column.
            max_length (int): Length every sequence is padded/truncated to.
            tensor_dtype (tuple): Dtypes for input ids, attention mask and labels, in that order.
        """
        self.tensor_dtype = tensor_dtype

        # One tokenizer call for the whole column; the result is already a batch of tensors.
        encoding = tokenizer(
            dataframe["text"].tolist(),
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt",
        )
        label_values = dataframe["label_idx"].tolist()

        self.input_ids = encoding["input_ids"].to(dtype=tensor_dtype[0])
        self.attention_mask = encoding["attention_mask"].to(dtype=tensor_dtype[1])
        self.labels = torch.tensor(label_values, dtype=tensor_dtype[2])

    def __len__(self) -> int:
        """
        Return the number of examples held by the dataset.
        """
        return self.labels.shape[0]

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """
        Return the tensors of one example.

        Args:
            idx (int): Position of the example.

        Returns:
            Dict[str, torch.Tensor]: "input_ids", "attention_mask" and "labels" for that example.
        """
        sample = {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
        }
        return sample


In [51]:
from torch.utils.data import DataLoader
from transformers import PreTrainedTokenizer
import pandas as pd


def create_dataloader(
    dataframe: pd.DataFrame,
    tokenizer: PreTrainedTokenizer,
    max_length: int = 64,
    batch_size: int = 16,
    shuffle: bool = True,
    tensor_dtype=(torch.long, torch.long, torch.long),
) -> DataLoader:
    """
    Tokenize a DataFrame and wrap the result in a DataLoader.

    Args:
        dataframe (pd.DataFrame): Frame with a "text" column and a "label_idx" column.
        tokenizer (PreTrainedTokenizer): Tokenizer applied to the texts.
        max_length (int): Length every sequence is padded/truncated to.
        batch_size (int): Number of examples per batch.
        shuffle (bool): Whether the loader reshuffles the data on each pass.
        tensor_dtype (tuple): Dtypes for input ids, attention mask and labels, in that order.

    Returns:
        DataLoader: Loader yielding dicts with "input_ids", "attention_mask" and "labels".
    """
    return DataLoader(
        TokenizedDataset(dataframe, tokenizer, max_length, tensor_dtype=tensor_dtype),
        batch_size=batch_size,
        shuffle=shuffle,
    )


In [None]:
from transformers import RobertaTokenizer

tokenizer: RobertaTokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME, cache_dir=DATA_CACHE)

# Build the three loaders with identical settings; only the training split is shuffled.
train_loader, val_loader, test_loader = (
    create_dataloader(
        dataframe=split_df,
        tokenizer=tokenizer,
        max_length=MAX_LENGTH,
        batch_size=BATCH_SIZE,
        shuffle=shuffle_split,
    )
    for split_df, shuffle_split in ((train_df, True), (val_df, False), (test_df, False))
)

# Report how many examples ended up in each split.
print(f"Размер тренировочного набора: {len(train_loader.dataset)}")
print(f"Размер валидационного набора: {len(val_loader.dataset)}")
print(f"Размер тестового набора: {len(test_loader.dataset)}")


# Модель

In [53]:
from typing import Tuple, Dict, Any

import torch
from torch.utils.data import DataLoader
from torch.optim import Optimizer
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, f1_score


class VaccineFakeClassifierTrainer:
    """
    Minimal train/validate loop for a sequence-classification model.

    The model is called with ``input_ids`` / ``attention_mask`` keyword arguments and must
    return an object exposing ``.logits`` (and ``.loss`` when ``criterion`` is ``None``).
    Per-epoch losses and metric dicts are accumulated in ``self.history``.
    """

    def __init__(
        self,
        model: torch.nn.Module,
        train_loader: DataLoader,
        val_loader: DataLoader,
        optimizer: Optimizer,
        criterion: torch.nn.Module,
        device: torch.device,
    ):
        """
        Store the training components and move the model to the target device.

        Args:
            model: Model to train (e.g., RobertaForSequenceClassification).
            train_loader: DataLoader for the training split.
            val_loader: DataLoader for the validation split.
            optimizer: Optimizer that updates the model parameters.
            criterion: Loss applied as ``criterion(logits, labels)``. Pass ``None`` to use
                the loss the model computes itself when it is given ``labels``.
            device: Device to run on (e.g. 'cuda', 'mps' or 'cpu').
        """
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.optimizer = optimizer
        self.criterion = criterion
        self.device = device
        self.history = {
            "train_loss": [],
            "val_loss": [],
            "train_metrics": [],
            "val_metrics": []
        }

        self.model.to(self.device)
        # A criterion may carry tensors of its own (e.g. class weights); keep them
        # on the same device as the logits it will receive.
        if isinstance(self.criterion, torch.nn.Module):
            self.criterion.to(self.device)

    def _forward_batch(
        self, batch: Dict[str, torch.Tensor]
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Run one forward pass and compute the loss for a batch.

        Args:
            batch: Dict with "input_ids", "attention_mask" and "labels" tensors.

        Returns:
            Tuple of (loss, logits, labels), with labels already on ``self.device``.
        """
        input_ids = batch["input_ids"].to(self.device)
        attention_mask = batch["attention_mask"].to(self.device)
        labels = batch["labels"].to(self.device)

        if self.criterion is None:
            # No explicit criterion: let the model compute its own loss from the labels.
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            return outputs.loss, outputs.logits, labels

        # Honour the criterion handed to the constructor instead of the model's built-in loss.
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        loss = self.criterion(outputs.logits, labels)
        return loss, outputs.logits, labels

    def _run_epoch(self, loader: DataLoader, training: bool, desc: str) -> Tuple[float, Dict[str, float]]:
        """
        Iterate once over ``loader``, optionally updating the model.

        Args:
            loader: DataLoader to iterate over.
            training: When True, gradients are computed and the optimizer is stepped.
            desc: Label shown on the progress bar.

        Returns:
            Mean of the per-batch losses and the metrics over all examples of the epoch.

        Raises:
            ValueError: If the loader contains no batches.
        """
        num_batches = len(loader)
        if num_batches == 0:
            raise ValueError(f"{desc} DataLoader is empty: there is nothing to iterate over.")

        self.model.train(training)
        running_loss = 0.0
        all_labels = []
        all_preds = []

        with torch.set_grad_enabled(training):
            for batch in tqdm(loader, desc=desc):
                if training:
                    self.optimizer.zero_grad()

                loss, logits, labels = self._forward_batch(batch)

                if training:
                    loss.backward()
                    self.optimizer.step()

                running_loss += loss.item()
                all_labels.append(labels.cpu())
                all_preds.append(logits.detach().argmax(dim=-1).cpu())

        metrics = self._compute_metrics(torch.cat(all_preds), torch.cat(all_labels))
        return running_loss / num_batches, metrics

    def train_epoch(self) -> Tuple[float, Dict[str, float]]:
        """
        Train the model for one epoch.

        Returns:
            Mean batch loss and metrics for the epoch.
        """
        return self._run_epoch(self.train_loader, training=True, desc="Training")

    def validate_epoch(self) -> Tuple[float, Dict[str, float]]:
        """
        Evaluate the model on the validation loader without updating it.

        Returns:
            Mean batch loss and metrics for the epoch.
        """
        return self._run_epoch(self.val_loader, training=False, desc="Validation")

    def fit(self, num_epochs: int):
        """
        Alternate training and validation epochs, recording the results in ``self.history``.

        Args:
            num_epochs: Total number of epochs to run.
        """
        for epoch in range(num_epochs):
            print(f"\nEpoch {epoch + 1}/{num_epochs}")

            train_loss, train_metrics = self.train_epoch()
            val_loss, val_metrics = self.validate_epoch()

            self.history["train_loss"].append(train_loss)
            self.history["val_loss"].append(val_loss)
            self.history["train_metrics"].append(train_metrics)
            self.history["val_metrics"].append(val_metrics)

            print(f"Train Loss: {train_loss:.4f} | Validation Loss: {val_loss:.4f}")
            print(f"Train Metrics: {train_metrics} | Validation Metrics: {val_metrics}")

    def plot_results(self, metrics_to_plot=None):
        """
        Plot the recorded loss and metric curves for training and validation.

        Args:
            metrics_to_plot: Names of the metrics to draw; defaults to ["accuracy", "f1"].
        """
        if metrics_to_plot is None:
            metrics_to_plot = ["accuracy", "f1"]

        # One panel for the loss plus one per metric, laid out on a two-column grid.
        num_plots = len(metrics_to_plot) + 1
        num_rows = (num_plots + 1) // 2
        # Size the figure from the same row count the grid uses.
        plt.figure(figsize=(15, 5 * num_rows))

        # Loss panel
        plt.subplot(num_rows, 2, 1)
        plt.plot(self.history["train_loss"], label="Train Loss", color="blue", linestyle="--")
        plt.plot(self.history["val_loss"], label="Validation Loss", color="red", linestyle="-")
        plt.xlabel("Epochs")
        plt.ylabel("Loss")
        plt.title("Loss over Epochs")
        plt.legend()
        plt.grid(True)

        # Metric panels
        for i, metric in enumerate(metrics_to_plot, start=2):
            train_metric = [m[metric] for m in self.history["train_metrics"]]
            val_metric = [m[metric] for m in self.history["val_metrics"]]

            plt.subplot(num_rows, 2, i)
            plt.plot(train_metric, label=f"Train {metric.capitalize()}", color="blue", linestyle="--")
            plt.plot(val_metric, label=f"Validation {metric.capitalize()}", color="red", linestyle="-")
            plt.xlabel("Epochs")
            plt.ylabel(metric.capitalize())
            plt.title(f"{metric.capitalize()} over Epochs")
            plt.legend()
            plt.grid(True)

        plt.tight_layout()
        plt.show()

    @staticmethod
    def _compute_metrics(preds: torch.Tensor, labels: torch.Tensor) -> Dict[str, float]:
        """
        Compute accuracy and weighted F1 for a set of predictions.

        Args:
            preds: Predicted class indices (CPU tensor).
            labels: True class indices (CPU tensor).

        Returns:
            Dict with "accuracy" and "f1".
        """
        preds = preds.numpy()
        labels = labels.numpy()

        metrics = {
            "accuracy": accuracy_score(labels, preds),
            "f1": f1_score(labels, preds, average="weighted"),
        }
        return metrics

# Обучение

In [None]:
import torch
from torch.optim import AdamW
from transformers import RobertaForSequenceClassification

# Optimisation hyper-parameters used by the optimizer and the training run below.
LEARNING_RATE = 1e-5
NUM_EPOCHS = 5
WEIGHT_DECAY = 0.01

# Load the pretrained checkpoint with a classification head sized to NUM_CLASSES.
model = RobertaForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=NUM_CLASSES, cache_dir=DATA_CACHE)

model.to(DEVICE)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Loss object that is handed to the trainer further down.
criterion = torch.nn.CrossEntropyLoss()

In [64]:
from torch.optim import Adagrad, RMSprop

# Swap the optimizer for RMSprop. Only one optimizer is built here: an instance that
# is immediately overwritten would still allocate its per-parameter state for the
# whole model without ever being used.
optimizer = RMSprop(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY, alpha=0.9, momentum=0.9)

In [65]:
# Bundle the model, both loaders, the optimizer and the loss object into a trainer.
trainer = VaccineFakeClassifierTrainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=DEVICE,
)

In [None]:
# Run NUM_EPOCHS rounds of training followed by validation.
trainer.fit(num_epochs=NUM_EPOCHS)

In [None]:
# Plot the loss and the default metrics (accuracy, f1) recorded during fitting.
trainer.plot_results()

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory,
# so both can be reloaded later from the same path.
model_save_path = DATA_PATH_SAVE_MODELS / "covid_vaccine_fake_model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Модель и токинайзер сохранены в: {model_save_path}")

## Энтропия (уверенность модели)

In [None]:
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import roc_curve, roc_auc_score
from torch.utils.data import DataLoader
from typing import Dict, Any


def _entropy_roc_analysis(correct: np.ndarray, entropies: np.ndarray):
    """
    ROC analysis of "low entropy predicts a correct answer" for one group of samples.

    Returns ``None`` when ``correct`` holds fewer than two distinct outcomes (a ROC curve
    is undefined then); otherwise ``(fpr, tpr, optimal_idx, roc_auc, optimal_threshold)``.
    """
    if len(np.unique(correct)) < 2:
        return None

    # Negated entropy is the score: the lower the entropy, the more likely "correct".
    fpr, tpr, thresholds = roc_curve(correct, -entropies)
    roc_auc = roc_auc_score(correct, -entropies)

    # Youden's J statistic picks the point furthest above the chance diagonal.
    optimal_idx = np.argmax(tpr - fpr)
    # Thresholds live on the negated scale; flip the sign to get back to entropy.
    optimal_threshold = -thresholds[optimal_idx]
    return fpr, tpr, optimal_idx, roc_auc, optimal_threshold


def compute_entropy_thresholds(model: torch.nn.Module, dataloader: DataLoader, device: torch.device) -> Dict[str, Any]:
    """
    Compute prediction entropy per class and derive entropy thresholds from ROC analysis.

    For every sample the base-2 entropy of the softmax distribution is computed and paired
    with whether the argmax prediction was correct. A threshold is then chosen per class
    and over all samples by maximising Youden's J. A prediction whose entropy is less than
    or equal to the threshold falls on the "predicted correct" side.

    NOTE(review): samples are grouped by their TRUE label, whereas `test_model` in this
    file looks thresholds up by PREDICTED label - confirm which grouping is intended.

    Args:
        model (torch.nn.Module): Trained classification model returning ``.logits``.
        dataloader (DataLoader): Loader yielding "input_ids", "attention_mask", "labels".
        device (torch.device): Device to run inference on.

    Returns:
        Dict[str, Any]:
            - 'overall_optimal_threshold': threshold over all samples, or None when every
              prediction has the same outcome (or there are no samples);
            - 'optimal_thresholds': {label: threshold or None};
            - 'roc_aucs': {label: AUC or None};
            - 'results_df': DataFrame with columns 'label', 'AUC', 'optimal_threshold'.
    """
    model.to(device)
    model.eval()

    # Entropy and correctness of every sample, grouped by true label.
    entropies_per_class = defaultdict(list)
    correct_per_class = defaultdict(list)

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probabilities = F.softmax(outputs.logits, dim=1)

            # The small epsilon keeps log2 finite for zero probabilities.
            entropy = -torch.sum(probabilities * torch.log2(probabilities + 1e-15), dim=1)
            predicted_labels = torch.argmax(probabilities, dim=1)
            correct = (predicted_labels == labels).float()

            # One device-to-host transfer per tensor instead of one .item() call per sample.
            for label, entropy_value, is_correct in zip(labels.tolist(), entropy.tolist(), correct.tolist()):
                entropies_per_class[label].append(entropy_value)
                correct_per_class[label].append(is_correct)

    optimal_thresholds = {}
    roc_aucs = {}
    all_entropies = []
    all_correct = []
    results = []

    plt.figure(figsize=(12, 8))

    for label, entropies in entropies_per_class.items():
        correct = np.array(correct_per_class[label])
        entropies = np.array(entropies)

        # Every sample counts towards the overall curve, including classes whose own
        # ROC curve is undefined (all predictions correct or all wrong).
        all_entropies.extend(entropies)
        all_correct.extend(correct)

        analysis = _entropy_roc_analysis(correct, entropies)
        if analysis is None:
            optimal_thresholds[label] = None
            roc_aucs[label] = None
            results.append({'label': label, 'AUC': None, 'optimal_threshold': None})
            continue

        fpr, tpr, optimal_idx, roc_auc, optimal_threshold = analysis
        optimal_thresholds[label] = optimal_threshold
        roc_aucs[label] = roc_auc
        results.append({'label': label, 'AUC': roc_auc, 'optimal_threshold': optimal_threshold})

        # Per-class curve with its optimal operating point.
        plt.plot(fpr, tpr, label=f'Class {label} (AUC = {roc_auc:.4f})')
        plt.scatter(fpr[optimal_idx], tpr[optimal_idx], color='red')

    # Threshold over all samples pooled together.
    overall_analysis = _entropy_roc_analysis(np.array(all_correct), np.array(all_entropies))
    if overall_analysis is None:
        overall_optimal_threshold = None
        print("Общий порог энтропии не определён: все предсказания имеют одинаковый исход или данных нет.")
    else:
        fpr, tpr, optimal_idx, roc_auc, overall_optimal_threshold = overall_analysis

        print(f"Общий оптимальный порог энтропии: {overall_optimal_threshold:.4f}")
        print(f"AUC ROC (общий): {roc_auc:.4f}")

        plt.plot(fpr, tpr, label=f'Overall (AUC = {roc_auc:.4f})', linestyle='--', color='black')
        plt.scatter(fpr[optimal_idx], tpr[optimal_idx], color='blue', label='Overall Optimal')

    # Chart cosmetics
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves for Each Class and Overall')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.show()

    results_df = pd.DataFrame(results)

    return {
        'overall_optimal_threshold': overall_optimal_threshold,
        'optimal_thresholds': optimal_thresholds,
        'roc_aucs': roc_aucs,
        'results_df': results_df
    }


In [None]:
# Derive the entropy thresholds from the validation loader and print the per-class table.
entropy_dict = compute_entropy_thresholds(model, val_loader, DEVICE)

print(entropy_dict["results_df"])


In [None]:
# Show the per-class entropy thresholds as the cell output.
entropy_dict["optimal_thresholds"]

# Тестирование

In [69]:
import torch
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
import torch.nn.functional as F
from typing import Dict


def test_model(
    model: torch.nn.Module,
    test_loader: DataLoader,
    test_df: pd.DataFrame,
    idx2label: Dict[int, str],
    entropy_thresholds: Dict[int, float],  # Entropy thresholds per class index
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
) -> pd.DataFrame:
    """
    Run the model over the test loader and return the test frame enriched with predictions.

    Args:
        model (torch.nn.Module): Trained model returning ``.logits``.
        test_loader (DataLoader): Loader over the test set. It must iterate the rows of
            ``test_df`` in order (no shuffling, no dropped batches).
        test_df (pd.DataFrame): Source frame of the test data.
        idx2label (dict): Mapping from class index to class name.
        entropy_thresholds (dict): Entropy threshold per class index, looked up by the
            PREDICTED class; a missing key or a ``None`` value means "no threshold".
        device (torch.device): Device to run inference on.

    Returns:
        pd.DataFrame: ``test_df`` (index reset) with these columns appended:
            - 'true_label': name of the true class,
            - 'predicted_label': name of the predicted class,
            - 'probability': list with the softmax probability of every class,
            - 'correct': whether the prediction matches the true label,
            - 'entropy': base-2 entropy of the predicted distribution,
            - 'entropy_threshold': threshold used for the predicted class (or None),
            - 'passed_threshold': True when the entropy is at or below the threshold,
              i.e. the prediction is confident enough; None without a threshold.

    Raises:
        ValueError: If the loader produced a different number of rows than ``test_df`` has.
    """
    model.eval()
    model.to(device)

    test_results = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            probabilities = torch.softmax(logits, dim=-1)

            # The small epsilon keeps log2 finite for zero probabilities.
            entropy = -torch.sum(probabilities * torch.log2(probabilities + 1e-15), dim=1)
            predictions = torch.argmax(probabilities, dim=-1)

            # tolist() yields native Python values, which keeps dict lookups and the
            # exported spreadsheet free of numpy scalar types.
            rows = zip(
                batch["labels"].tolist(),
                predictions.tolist(),
                probabilities.tolist(),
                entropy.tolist(),
            )
            for true_label, predicted_label, probs, entropy_value in rows:
                entropy_threshold = entropy_thresholds.get(predicted_label)
                # Low entropy means a confident prediction, so a prediction passes
                # when its entropy does not exceed the threshold.
                if entropy_threshold is None:
                    passed_threshold = None
                else:
                    passed_threshold = entropy_value <= entropy_threshold

                test_results.append({
                    "true_label": idx2label[true_label],
                    "predicted_label": idx2label[predicted_label],
                    "probability": probs,
                    "correct": true_label == predicted_label,
                    "entropy": entropy_value,
                    "entropy_threshold": entropy_threshold,
                    "passed_threshold": passed_threshold,
                })

    # A column-wise concat aligns purely by position; refuse to continue if the loader
    # and the frame disagree instead of silently producing misaligned or NaN rows.
    if len(test_results) != len(test_df):
        raise ValueError(
            f"test_loader produced {len(test_results)} rows but test_df has {len(test_df)}; "
            "they must describe the same examples in the same order."
        )

    test_results_df = pd.DataFrame(test_results)

    test_df = test_df.reset_index(drop=True)
    test_df = pd.concat([test_df, test_results_df], axis=1)

    accuracy = accuracy_score(test_df["true_label"], test_df["predicted_label"])
    f1 = f1_score(test_df["true_label"], test_df["predicted_label"], average="weighted")

    print("\n=== Результаты тестирования ===")
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"F1 Score: {f1:.4f}")

    return test_df


In [None]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Reload the tokenizer and model from the directory they were saved to,
# replacing the in-memory objects, then move the model to the selected device.
tokenizer = RobertaTokenizer.from_pretrained(DATA_PATH_SAVE_MODELS / "covid_vaccine_fake_model")
model = RobertaForSequenceClassification.from_pretrained(
    DATA_PATH_SAVE_MODELS / "covid_vaccine_fake_model")

model.to(DEVICE)

In [None]:
# Evaluate on the held-out test split, using the per-class entropy thresholds computed earlier.
test_results_df = test_model(
    model=model,
    test_loader=test_loader,
    test_df=test_df,
    idx2label=idx2label,
    entropy_thresholds=entropy_dict["optimal_thresholds"],
    device=DEVICE,
)

In [None]:
# Eyeball ten random rows of the test results.
test_results_df.sample(10)

In [73]:
# Export the full test results to an Excel file under DATA_PATH (row index omitted).
test_results_df.to_excel(DATA_PATH / 'test_results_learning.xlsx', index=False)

In [74]:
from transformers import PreTrainedTokenizer, PreTrainedModel
import torch


def test_model_with_text(
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    text: str,
    max_length: int = 128,
    label_map: dict = None,
    device: torch.device = None,
):
    """
    Classify a single hand-written text and return the argmax prediction.

    Args:
        model (PreTrainedModel): Loaded trained model (e.g. RobertaForSequenceClassification).
        tokenizer (PreTrainedTokenizer): Tokenizer used to encode the text.
        text (str): Text to classify.
        max_length (int): Length the tokenized text is padded/truncated to.
        label_map (dict): Optional mapping from class index to label; when empty or None
            the raw class index is returned.
        device (torch.device): Device to run on. When None, the model stays on the device
            it already lives on (so a model placed on 'mps' or 'cuda' is not moved);
            otherwise the model is moved to ``device`` first.

    Returns:
        dict: "text", "probabilities" (list with one value per class) and "predicted_label".
    """
    model.eval()
    if device is None:
        # Run wherever the model already is instead of forcing a cuda/cpu choice.
        device = next(model.parameters()).device
    else:
        model.to(device)

    inputs = tokenizer(
        text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits
        # Single-item batch: take row 0 to get the probabilities of each class.
        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]

    predicted_index = torch.argmax(logits, dim=-1).item()
    predicted_label = label_map[predicted_index] if label_map else predicted_index

    return {
        "text": text,
        "probabilities": probs.tolist(),
        "predicted_label": predicted_label
    }


Real

In [None]:
# Manual spot check for the example filed under the "Real" heading.
text = "COVID-19 vaccines do not affect fertility or reproductive health. This has been confirmed through multiple studies and ongoing research."

result = test_model_with_text(
    model=model,
    tokenizer=tokenizer,
    text=text,
    max_length=128,
    label_map=idx2label
)

print(result)

Fake

In [None]:
# Manual spot check for the example filed under the "Fake" heading.
text = "COVID-19 vaccines affect fertility or reproductive health."

result = test_model_with_text(
    model=model,
    tokenizer=tokenizer,
    text=text,
    max_length=128,
    label_map=idx2label
)

print(result)

In [None]:
# Extra spot check 1: a supplement review that only mentions covid in passing.
text = "Arginine gave me a powerful boost of energy, I can really feel how it pumped me up. In the gym, it’s way easier to handle the weights, and my endurance is through the roof. It feels like my muscles are growing before my eyes, and the post-workout feeling is just awesome. I’m enjoying every moment as I feel my body reacting to the load. It’s definitely a great motivation to keep pushing forward! covid ruined everything though"

result = test_model_with_text(
    model=model,
    tokenizer=tokenizer,
    text=text,
    max_length=128,
    label_map=idx2label
)

print(result)

# Extra spot check 2: a news-style statement about the Omicron strain.
# The triple-quoted literal keeps its leading and trailing newline.
text = '''
The new strain of the Omicron coronavirus was first detected in South Africa and Botswana in November 2021. It is rapidly spreading all over the world. The omicron strain contains more than 30 mutations in the S-protein spike on the virus shell, with which it enters the cell.
'''
result = test_model_with_text(
    model=model,
    tokenizer=tokenizer,
    text=text,
    max_length=128,
    label_map=idx2label
)

print(result)