# Применение обученной модели для определения фейковых фактов о COVID и вакцинации

In [27]:
import math

import torch
import pandas as pd
import numpy as np

In [28]:
# Pick the compute device: CPU by default, Apple MPS when available,
# and CUDA takes precedence over both when it is available.
DEVICE = 'mps' if torch.backends.mps.is_available() else 'cpu'
DEVICE = 'cuda' if torch.cuda.is_available() else DEVICE

In [29]:
from pathlib import Path

# Root folder for the notebook's input and output files.
DATA_PATH = Path('data')
DATA_PATH.mkdir(parents=True, exist_ok=True)

# Cache folder under the data root.
DATA_CACHE = DATA_PATH / 'cache_dir'
DATA_CACHE.mkdir(parents=True, exist_ok=True)

# Folder that holds saved model checkpoints.
DATA_PATH_SAVE_MODELS = DATA_PATH / 'models'
DATA_PATH_SAVE_MODELS.mkdir(parents=True, exist_ok=True)

# Show up to 500 characters per cell so long post texts stay readable.
pd.set_option('display.max_colwidth', 500)

In [30]:
# Sub-folder of DATA_PATH_SAVE_MODELS that the tokenizer and model are loaded from.
MODEL_NAME = "covid_vaccine_fake_model"
# Excel file (inside DATA_PATH) with the Facebook posts to classify.
TEST_DF_NAME = "facebook_data_to_model.xlsx"

# Token length every text is padded/truncated to by TokenizedDataset.
MAX_LENGTH = 128
# Number of rows per inference batch.
BATCH_SIZE = 16

# Датасет

In [31]:
# Load the Facebook posts to classify and preview the first row.
data_df = pd.read_excel(DATA_PATH.joinpath(TEST_DF_NAME))
data_df.head(n=1)

Unnamed: 0,Group Name,User Name,Facebook Id,Page Category,Page Admin Top Country,Page Description,Page Created,Likes at Posting,Followers at Posting,Post Created,...,Final Link,Image Text,Link Text,Description,Sponsor Id,Sponsor Name,Sponsor Category,Overperforming Score (weighted — Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x ),text,link_text
0,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,66994.0,,2020-04-01 22:42:32 EDT,...,,,,,,,,189.33,We extubated 2 covid patients today and they are doing awesome! Should be on a tele floor tomorrow! (TX),


In [32]:
# data_df = data_df.sample(1000)

In [33]:
# Class index predicted by the model -> human-readable label.
idx2label = {0: "Real", 1: "Fake", 2: "Comments"}

In [34]:
from typing import Dict, Tuple
import torch
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer
import pandas as pd


class TokenizedDataset(Dataset):
    """Dataset that lazily tokenizes two text columns of a DataFrame.

    Nothing is tokenized up front: each ``__getitem__`` call tokenizes the
    requested row. A missing or blank cell is not passed to the tokenizer;
    it is encoded as all-zero ``input_ids`` and ``attention_mask`` tensors.
    """

    def __init__(
        self,
        dataframe: pd.DataFrame,
        tokenizer: PreTrainedTokenizer,
        max_length: int,
        text_column: str = "text",
        link_text_column: str = "link_text",
        tensor_dtype: Tuple[torch.dtype, torch.dtype] = (torch.long, torch.long),
    ):
        """
        Set up the dataset; no text is tokenized until an item is requested.

        Args:
            dataframe (pd.DataFrame): DataFrame holding the columns to tokenize.
            tokenizer (PreTrainedTokenizer): Tokenizer used to encode the text.
            max_length (int): Length every encoded text is padded/truncated to.
            text_column (str): Name of the main text column.
            link_text_column (str): Name of the additional text column.
            tensor_dtype (tuple): Dtypes of the returned tensors, as
                (input_ids dtype, attention_mask dtype).

        Raises:
            ValueError: If ``text_column`` or ``link_text_column`` is not a
                column of ``dataframe``.
        """
        # Validate first so a wrong column name fails before the DataFrame is copied.
        for column in (text_column, link_text_column):
            if column not in dataframe.columns:
                raise ValueError(f"Колонка '{column}' отсутствует в DataFrame")

        self.dataframe = dataframe.copy()  # Copy so the caller's DataFrame is never modified
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_column = text_column
        self.link_text_column = link_text_column
        self.tensor_dtype = tensor_dtype

    def __len__(self) -> int:
        """
        Return the number of rows in the dataset.
        """
        return len(self.dataframe)

    def _empty_encoding(self) -> Dict[str, torch.Tensor]:
        """
        Return the all-zero placeholder used for missing or blank text.
        """
        return {
            "input_ids": torch.zeros(self.max_length, dtype=self.tensor_dtype[0]),
            "attention_mask": torch.zeros(self.max_length, dtype=self.tensor_dtype[1]),
        }

    def _tokenize_text(self, text: object) -> Dict[str, torch.Tensor]:
        """
        Tokenize one cell value, or return zero tensors when it is empty.

        Args:
            text: Cell value to tokenize. Missing values (``pd.isna``) and
                whitespace-only strings are treated as empty. Any other
                non-string value (e.g. a numeric spreadsheet cell) is
                converted with ``str()`` before tokenization.

        Returns:
            Dict[str, torch.Tensor]: 1-D ``input_ids`` and ``attention_mask``
            tensors in the configured dtypes.
        """
        if pd.isna(text):
            return self._empty_encoding()

        # Spreadsheet columns can mix strings with numbers; calling .strip()
        # on a number would raise AttributeError.
        if not isinstance(text, str):
            text = str(text)

        if not text.strip():
            return self._empty_encoding()

        tokens = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading batch dimension; [0] removes it.
        return {
            "input_ids": tokens["input_ids"][0].to(dtype=self.tensor_dtype[0]),
            "attention_mask": tokens["attention_mask"][0].to(dtype=self.tensor_dtype[1]),
        }

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """
        Return the tokenized form of both text columns for one row.

        Args:
            idx (int): Positional index of the row.

        Returns:
            Dict[str, torch.Tensor]: ``input_ids_text`` / ``attention_mask_text``
            for the main column and ``input_ids_link`` / ``attention_mask_link``
            for the additional column.
        """
        row = self.dataframe.iloc[idx]

        text_tokens = self._tokenize_text(row[self.text_column])
        link_text_tokens = self._tokenize_text(row[self.link_text_column])

        return {
            "input_ids_text": text_tokens["input_ids"],
            "attention_mask_text": text_tokens["attention_mask"],
            "input_ids_link": link_text_tokens["input_ids"],
            "attention_mask_link": link_text_tokens["attention_mask"],
        }


In [35]:
from transformers import RobertaTokenizer

# Load the tokenizer saved next to the fine-tuned model.
tokenizer: RobertaTokenizer = RobertaTokenizer.from_pretrained(
    DATA_PATH_SAVE_MODELS.joinpath(MODEL_NAME)
)

# Wrap the loaded posts; both text columns are tokenized lazily per row.
dataset = TokenizedDataset(
    data_df,
    tokenizer,
    MAX_LENGTH,
    text_column='text',
    link_text_column='link_text',
)

In [36]:
# Spot-check one tokenized row (positional index 8).
dataset[8]

{'input_ids_text': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask_text': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids_link': tensor([    0, 34141, 39941,   231,    12,   180,    12,   2

# Модель

In [37]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Load the fine-tuned classifier from the saved checkpoint folder.
model = RobertaForSequenceClassification.from_pretrained(
    DATA_PATH_SAVE_MODELS.joinpath(MODEL_NAME)
)

# Move the model to the selected device; as the last expression of the cell
# this also displays the model architecture.
model.to(DEVICE)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

# Тестирование

In [38]:
import torch
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import pandas as pd
import torch.nn.functional as F
from typing import Dict


# Per-row outputs produced for each text column; "_1" / "_2" is appended to
# these names to form the result DataFrame's column names.
_SCORE_FIELDS = ("predict", "probability", "entropy", "entropy_threshold", "passed_threshold")


def _score_batch(
    model: torch.nn.Module,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    entropy_thresholds: Dict[int, float],
    device: torch.device,
) -> Dict[str, list]:
    """
    Classify the non-empty rows of one tokenized batch.

    Args:
        model (torch.nn.Module): Model whose output exposes ``.logits``.
        input_ids (torch.Tensor): 2-D batch of token ids (rows are samples).
        attention_mask (torch.Tensor): Attention mask matching ``input_ids``.
        entropy_thresholds (dict): Entropy threshold per predicted class index.
        device (torch.device): Device the model runs on.

    Returns:
        Dict[str, list]: One list per name in ``_SCORE_FIELDS``, each with one
        entry per batch row. Rows without text hold ``None`` in every list.
    """
    batch_len = input_ids.shape[0]
    scored = {field: [None] * batch_len for field in _SCORE_FIELDS}

    # A row whose input_ids are all zero is the dataset's marker for
    # missing/blank text; such rows are never sent through the model.
    has_content = (input_ids != 0).any(dim=1)
    if not has_content.any():
        return scored

    logits = model(
        input_ids=input_ids[has_content].to(device),
        attention_mask=attention_mask[has_content].to(device),
    ).logits
    probs = torch.softmax(logits, dim=-1)
    # Shannon entropy in bits; the epsilon keeps log2 finite for zero probabilities.
    entropy = -torch.sum(probs * torch.log2(probs + 1e-15), dim=1)

    preds = torch.argmax(probs, dim=-1).cpu().numpy()
    probs = probs.cpu().numpy()
    entropy = entropy.cpu().numpy()

    # Scatter the results of the forwarded rows back to their batch positions.
    content_rows = [row for row, flag in enumerate(has_content.tolist()) if flag]
    for pos, row in enumerate(content_rows):
        threshold = entropy_thresholds.get(preds[pos], None)
        scored["predict"][row] = preds[pos]
        scored["probability"][row] = probs[pos].tolist()
        scored["entropy"][row] = entropy[pos]
        scored["entropy_threshold"][row] = threshold
        # No threshold configured for the predicted class -> leave undecided (None).
        scored["passed_threshold"][row] = entropy[pos] < threshold if threshold is not None else None

    return scored


def test_model_on_dataset(
    model: torch.nn.Module,
    dataset: TokenizedDataset,
    idx2label: Dict[int, str],
    entropy_thresholds: Dict[int, float],  # Entropy thresholds per class index
    batch_size: int = 16,
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
) -> pd.DataFrame:
    """
    Run the dataset through the model and attach predictions, entropies and
    entropy-threshold checks to a copy of the dataset's DataFrame.

    Both text columns are scored independently: suffix ``_1`` refers to the
    main text column, suffix ``_2`` to the link text column. Rows whose text
    is empty get ``None`` in all of that column's result fields.

    Args:
        model (torch.nn.Module): Trained model; it is switched to eval mode
            and moved to ``device``.
        dataset (Dataset): Tokenized dataset exposing ``.dataframe``.
        idx2label (dict): Maps class indices to label names.
        entropy_thresholds (dict): Entropy threshold per class index.
        batch_size (int): Batch size for the DataLoader.
        device (torch.device): Device used for inference (CPU/GPU).

    Returns:
        pd.DataFrame: Copy of ``dataset.dataframe`` (index reset) with the columns
            - 'predict_1', 'probability_1', 'entropy_1', 'entropy_threshold_1', 'passed_threshold_1'
            - 'predict_2', 'probability_2', 'entropy_2', 'entropy_threshold_2', 'passed_threshold_2'
        ``passed_threshold_*`` is True when the entropy is strictly below the
        threshold of the predicted class, and None when that class has no
        threshold.
    """
    model.eval()
    model.to(device)

    # shuffle=False keeps predictions aligned with the DataFrame's row order.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # (result column suffix, input_ids key, attention_mask key) per text column.
    sources = (
        ("1", "input_ids_text", "attention_mask_text"),
        ("2", "input_ids_link", "attention_mask_link"),
    )
    collected = {
        suffix: {field: [] for field in _SCORE_FIELDS}
        for suffix, _, _ in sources
    }

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Processing"):
            for suffix, ids_key, mask_key in sources:
                scored = _score_batch(
                    model, batch[ids_key], batch[mask_key], entropy_thresholds, device
                )
                for field, values in scored.items():
                    collected[suffix][field].extend(values)

    # Reset the index so the positional result lists line up with the rows.
    test_df = dataset.dataframe.copy().reset_index(drop=True)
    for suffix, _, _ in sources:
        for field in _SCORE_FIELDS:
            values = collected[suffix][field]
            if field == "predict":
                # Translate class indices to label names; empty rows stay None.
                values = [idx2label[p] if p is not None else None for p in values]
            test_df[f"{field}_{suffix}"] = values

    return test_df


# Нужно уточнить значение entropy_thresholds, полученное на предыдущем шаге

In [39]:
# Entropy threshold per predicted class index (same indices as idx2label).
# NOTE(review): these values are expected to come from the previous step of
# the pipeline — confirm they match the model currently loaded.
entropy_thresholds = {1: 0.5176525712013245, 0: 0.6887069940567017, 2: 0.3750338852405548}

test_results_df = test_model_on_dataset(
    model=model,
    dataset=dataset,
    idx2label=idx2label,
    batch_size=BATCH_SIZE,
    entropy_thresholds=entropy_thresholds,
    # Reuse the device the model was already moved to; a hard-coded cuda/cpu
    # choice would move the model back to the CPU on an MPS machine.
    device=torch.device(DEVICE),
)

Processing:   0%|          | 0/108 [00:00<?, ?it/s]

In [40]:
# Show 10 random rows of the scored DataFrame as a quick sanity check.
test_results_df.sample(10)

Unnamed: 0,Group Name,User Name,Facebook Id,Page Category,Page Admin Top Country,Page Description,Page Created,Likes at Posting,Followers at Posting,Post Created,...,predict_1,probability_1,entropy_1,entropy_threshold_1,passed_threshold_1,predict_2,probability_2,entropy_2,entropy_threshold_2,passed_threshold_2
6313,UK against Covid-19,,670592507037280,none,,,,,,2020-03-25 07:33:14 EDT,...,Comments,"[7.50532271922566e-05, 0.0004611665790434927, 0.9994637370109558]",0.006913,0.375034,True,,,,,
3285,Health Care Assistants HCA / Nurses / Support Workers - JOBS (UK),,629521900975276,none,,,,27263.0,,2022-08-12 04:43:44 EDT,...,Comments,"[0.03655766695737839, 0.02743767946958542, 0.9360045790672302]",0.40616,0.375034,False,,,,,
1503,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,33248.0,,2020-03-27 17:54:04 EDT,...,Comments,"[0.00014592600928153843, 0.00045572727685794234, 0.9993983507156372]",0.007786,0.375034,True,Real,"[0.8154898285865784, 0.18059402704238892, 0.003916201181709766]",0.717201,0.688707,False
4404,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,107640.0,,2021-01-25 23:51:25 EST,...,Comments,"[0.002318063285201788, 0.013403324410319328, 0.9842786192893982]",0.126177,0.375034,True,Comments,"[0.0025343268644064665, 0.003414792474359274, 0.994050920009613]",0.058394,0.375034,True
4641,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,33248.0,,2020-03-27 08:14:10 EDT,...,Comments,"[0.00020574108930304646, 0.0007525987457484007, 0.9990416169166565]",0.011711,0.375034,True,,,,,
2083,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,,,2020-03-22 12:43:10 EDT,...,,,,,,Fake,"[0.3974652588367462, 0.5987207889556885, 0.0038139517419040203]",1.002789,0.517653,False
1160,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,54140.0,,2020-03-30 07:39:24 EDT,...,Fake,"[0.324563592672348, 0.6288674473762512, 0.046569038182497025]",1.15377,0.517653,False,Real,"[0.5927907228469849, 0.4059179127216339, 0.00129135069437325]",0.987591,0.688707,False
6807,COVID Stories From Healthcare Workers & Patients,,766104867346558,none,,,,,,2020-12-23 09:55:00 EST,...,Fake,"[0.09781228750944138, 0.8766049742698669, 0.025582754984498024]",0.629902,0.517653,False,Real,"[0.709227442741394, 0.28966858983039856, 0.0011040500830858946]",0.880185,0.688707,False
4037,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,110462.0,,2020-06-28 06:00:06 EDT,...,Comments,"[0.1558556705713272, 0.339207261800766, 0.5049370527267456]",1.444823,0.375034,False,,,,,
2871,"NHS, Key Workers And The World Appreciation Page",,1055974798109854,none,,,,,,2020-03-26 17:46:09 EDT,...,Comments,"[0.0036607368383556604, 0.01307300291955471, 0.9832662343978882]",0.135369,0.375034,True,,,,,


In [41]:
import pandas as pd

pd.options.display.max_colwidth = 300

# Columns describing the prediction made on the main text column.
columns_to_keep = [
    "text", "predict_1", "probability_1", "entropy_1", "entropy_threshold_1", "passed_threshold_1"
]

# Posts predicted as "Fake" whose entropy passed the class threshold.
# .eq(True) is used instead of truthiness because the column also holds None.
confident_fake_df = test_results_df.loc[
    (test_results_df["predict_1"] == "Fake") & test_results_df["passed_threshold_1"].eq(True),
    columns_to_keep
]

# Cap the sample size: sample(10) raises ValueError when fewer than 10 rows match.
confident_fake_df.sample(min(10, len(confident_fake_df)))

In [42]:
import pandas as pd

pd.options.display.max_colwidth = 300

# Columns describing the prediction made on the main text column.
columns_to_keep = [
    "text", "predict_1", "probability_1", "entropy_1", "entropy_threshold_1", "passed_threshold_1"
]

# Posts predicted as "Real" whose entropy passed the class threshold.
# .eq(True) is used instead of truthiness because the column also holds None.
confident_real_df = test_results_df.loc[
    (test_results_df["predict_1"] == "Real") & test_results_df["passed_threshold_1"].eq(True),
    columns_to_keep
]

# Cap the sample size: sample(10) raises ValueError when fewer than 10 rows match.
confident_real_df.sample(min(10, len(confident_real_df)))

In [43]:
# Count how many posts received each label on the main text column
# (value_counts skips rows without a prediction by default).
test_results_df['predict_1'].value_counts()

predict_1
Comments    3416
Fake         945
Real         531
Name: count, dtype: int64

In [44]:
# Save the scored posts to Excel without the DataFrame index.
# NOTE(review): "complate" in the file name looks like a typo for "complete";
# left unchanged in case a later step reads this exact path — confirm.
test_results_df.to_excel(DATA_PATH / 'facebook_data_to_complate.xlsx', index=False)