# Определение фейковых фактов о COVID и вакцинации: применение обученной модели

In [1]:
import math

import torch
import pandas as pd
import numpy as np

In [2]:
# Pick the compute device: CUDA wins over Apple MPS, CPU is the fallback.
if torch.cuda.is_available():
    DEVICE = 'cuda'
elif torch.backends.mps.is_available():
    DEVICE = 'mps'
else:
    DEVICE = 'cpu'

In [3]:
from pathlib import Path

# Working directories used by the notebook: input data, cache and saved models.
DATA_PATH = Path('data/')
DATA_CACHE = Path('data/cache_dir/')
DATA_PATH_SAVE_MODELS = Path('data/models/')

# Create every directory up front; existing directories are left untouched.
for _directory in (DATA_PATH, DATA_CACHE, DATA_PATH_SAVE_MODELS):
    _directory.mkdir(parents=True, exist_ok=True)

# Let DataFrame previews show up to 500 characters per cell.
pd.set_option('display.max_colwidth', 500)

In [4]:
# Name of the saved model directory (resolved under DATA_PATH_SAVE_MODELS).
MODEL_NAME = "covid_vaccine_fake_model"
# Excel file (resolved under DATA_PATH) with the rows to run through the model.
TEST_DF_NAME = "facebook_data_to_model.xlsx"

# Token sequence length per text and number of rows per inference batch.
MAX_LENGTH, BATCH_SIZE = 128, 64

# Датасет

In [5]:
# Load the spreadsheet to be scored and preview its first row.
source_file = DATA_PATH / TEST_DF_NAME
data_df = pd.read_excel(source_file)
data_df.head(1)

Unnamed: 0,Group Name,User Name,Facebook Id,Page Category,Page Admin Top Country,Page Description,Page Created,Likes at Posting,Followers at Posting,Post Created,...,Final Link,Image Text,Link Text,Description,Sponsor Id,Sponsor Name,Sponsor Category,Overperforming Score (weighted — Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x ),text,link_text
0,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,66994.0,,2020-04-01 22:42:32 EDT,...,,,,,,,,189.33,We extubated 2 covid patients today and they are doing awesome! Should be on a tele floor tomorrow! (TX),


In [6]:
# Maps the classifier's output class index to a human-readable label.
idx2label = {0: "Real", 1: "Fake"}

In [7]:
from typing import Dict, Tuple
import torch
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer
import pandas as pd


class TokenizedDataset(Dataset):
    """Lazily tokenized view over two text columns of a DataFrame.

    Every item contains token ids and an attention mask for the main text
    column and for the link-text column. A missing or blank cell is not sent
    to the tokenizer; it is represented by all-zero tensors of length
    ``max_length`` (so its attention mask is entirely zero).
    """

    def __init__(
        self,
        dataframe: pd.DataFrame,
        tokenizer: "PreTrainedTokenizer",
        max_length: int,
        text_column: str = "text",
        link_text_column: str = "link_text",
        tensor_dtype: Tuple[torch.dtype, torch.dtype] = (torch.long, torch.long),
    ):
        """
        Set up the dataset; tokenization happens on item access.

        Args:
            dataframe (pd.DataFrame): DataFrame holding the columns to tokenize.
            tokenizer (PreTrainedTokenizer): Tokenizer used to encode the text.
            max_length (int): Length every sequence is padded/truncated to.
            text_column (str): Name of the main text column.
            link_text_column (str): Name of the additional (link) text column.
            tensor_dtype (tuple): Dtypes of the returned tensors, in the order
                (input_ids, attention_mask).

        Raises:
            ValueError: If either column is missing from ``dataframe``.
        """
        # Validate first so a wrong column name fails before the copy is made.
        for column in (text_column, link_text_column):
            if column not in dataframe.columns:
                raise ValueError(f"Колонка '{column}' отсутствует в DataFrame")

        # Work on a copy so the caller's DataFrame is never modified.
        self.dataframe = dataframe.copy()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_column = text_column
        self.link_text_column = link_text_column
        self.tensor_dtype = tensor_dtype

    def __len__(self) -> int:
        """
        Return the number of rows in the dataset.
        """
        return len(self.dataframe)

    def _tokenize_text(self, text: object) -> Dict[str, torch.Tensor]:
        """
        Tokenize one cell value; blank or missing values give zero tensors.

        Non-string values (for example a cell that was read from the
        spreadsheet as a number) are converted with ``str()`` before
        tokenization instead of failing on ``.strip()``.

        Args:
            text: Cell value to tokenize (string, number, ``None`` or NaN).

        Returns:
            Dict[str, torch.Tensor]: ``input_ids`` and ``attention_mask``
            tensors, each of length ``max_length``.
        """
        ids_dtype, mask_dtype = self.tensor_dtype

        if pd.isna(text) or not str(text).strip():
            return {
                "input_ids": torch.zeros(self.max_length, dtype=ids_dtype),
                "attention_mask": torch.zeros(self.max_length, dtype=mask_dtype),
            }

        tokens = self.tokenizer(
            str(text),
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer returns a batch of one; drop the batch dimension.
        return {
            "input_ids": tokens["input_ids"][0].to(dtype=ids_dtype),
            "attention_mask": tokens["attention_mask"][0].to(dtype=mask_dtype),
        }

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """
        Return the tokenized representation of one row.

        Args:
            idx (int): Positional index of the row.

        Returns:
            Dict[str, torch.Tensor]: ``input_ids_text`` / ``attention_mask_text``
            for the main column and ``input_ids_link`` / ``attention_mask_link``
            for the link-text column.
        """
        # Read only the two needed cells instead of materializing the whole row.
        text_tokens = self._tokenize_text(
            self.dataframe[self.text_column].iloc[idx]
        )
        link_text_tokens = self._tokenize_text(
            self.dataframe[self.link_text_column].iloc[idx]
        )

        return {
            "input_ids_text": text_tokens["input_ids"],
            "attention_mask_text": text_tokens["attention_mask"],
            "input_ids_link": link_text_tokens["input_ids"],
            "attention_mask_link": link_text_tokens["attention_mask"],
        }


In [8]:
from transformers import RobertaTokenizer

# Load the tokenizer from the same directory the model is loaded from.
tokenizer_dir = DATA_PATH_SAVE_MODELS / MODEL_NAME
tokenizer: RobertaTokenizer = RobertaTokenizer.from_pretrained(tokenizer_dir)

# Wrap the loaded DataFrame; both text columns are tokenized on item access.
dataset = TokenizedDataset(
    data_df,
    tokenizer,
    MAX_LENGTH,
    text_column='text',
    link_text_column='link_text',
)

In [9]:
# Inspect one tokenized example (row 8) to check the dataset output format.
dataset[8]

{'input_ids_text': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask_text': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids_link': tensor([    0, 34141, 39941,   231,    12,   180,    12,   2

# Модель

In [10]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Load the saved sequence-classification model from the models directory.
model_dir = DATA_PATH_SAVE_MODELS / MODEL_NAME
model = RobertaForSequenceClassification.from_pretrained(model_dir)

# Move the weights to the selected device (the cell output shows the model).
model.to(DEVICE)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

# Тестирование

In [13]:
import torch
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import pandas as pd


def _predict_rows(
    model: torch.nn.Module,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    device: torch.device,
):
    """
    Classify the non-empty rows of one batch.

    A row whose attention mask is entirely zero carries no text and is not
    sent to the model; its results are reported as ``None``.

    Args:
        model (torch.nn.Module): Model whose output exposes ``.logits``.
        input_ids (torch.Tensor): Token ids of the batch, one row per example.
        attention_mask (torch.Tensor): Attention mask matching ``input_ids``.
        device (torch.device): Device the model runs on.

    Returns:
        tuple: Two lists with one entry per batch row: the predicted class
        index (``int`` or ``None``) and the class probabilities
        (``list[float]`` or ``None``).
    """
    row_count = input_ids.shape[0]
    labels = [None] * row_count
    probabilities = [None] * row_count

    # The attention mask, not the token ids, marks real content: id 0 can be
    # a legitimate token, while an all-zero mask never occurs for real text.
    has_content = (attention_mask != 0).any(dim=1)
    if not bool(has_content.any()):
        return labels, probabilities

    # Only rows with content are transferred to the device and evaluated.
    logits = model(
        input_ids=input_ids[has_content].to(device),
        attention_mask=attention_mask[has_content].to(device),
    ).logits
    row_probabilities = torch.softmax(logits, dim=-1).cpu().tolist()
    row_labels = torch.argmax(logits, dim=-1).cpu().tolist()

    # Scatter the results back to their original positions in the batch.
    content_rows = torch.nonzero(has_content, as_tuple=True)[0].tolist()
    for row, label, probs in zip(content_rows, row_labels, row_probabilities):
        labels[row] = label
        probabilities[row] = probs

    return labels, probabilities


def test_model_on_dataset(
    model: torch.nn.Module,
    dataset: TokenizedDataset,
    idx2label: dict,
    batch_size: int = 16,
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
) -> pd.DataFrame:
    """
    Run the dataset through the model and attach predictions to its DataFrame.

    The model is switched to evaluation mode and moved to ``device``. Both
    tokenized columns of every row are classified independently; rows with
    no text in a column get ``None`` for that column's results.

    Args:
        model (torch.nn.Module): Trained model.
        dataset (Dataset): Tokenized dataset.
        idx2label (dict): Mapping from class index to label name.
        batch_size (int): Batch size for the DataLoader.
        device (torch.device): Device used for inference (CPU/GPU).

    Returns:
        pd.DataFrame: Copy of ``dataset.dataframe`` (index reset) with added
        columns:
            - 'predict_1': predicted label for the main text column,
            - 'probability_1': class probabilities for the main text column,
            - 'predict_2': predicted label for the link-text column,
            - 'probability_2': class probabilities for the link-text column.
    """
    model.eval()
    model.to(device)

    # Keep the original row order so predictions line up with the DataFrame.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Batch-key suffix -> (predicted class indices, class probabilities).
    results = {"text": ([], []), "link": ([], [])}

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Processing"):
            for source, (labels, probabilities) in results.items():
                batch_labels, batch_probabilities = _predict_rows(
                    model,
                    batch[f"input_ids_{source}"],
                    batch[f"attention_mask_{source}"],
                    device,
                )
                labels.extend(batch_labels)
                probabilities.extend(batch_probabilities)

    # Attach the predictions; the positional index matches the DataLoader order.
    test_df = dataset.dataframe.copy().reset_index(drop=True)
    for suffix, source in (("1", "text"), ("2", "link")):
        labels, probabilities = results[source]
        test_df[f"predict_{suffix}"] = [
            None if label is None else idx2label[label] for label in labels
        ]
        test_df[f"probability_{suffix}"] = probabilities

    return test_df


In [14]:
# Run inference on the device selected at the top of the notebook (DEVICE),
# so an available MPS backend is used and the model stays on the device it
# was already moved to instead of being re-derived as CUDA-or-CPU here.
test_results_df = test_model_on_dataset(
    model=model,
    dataset=dataset,
    idx2label=idx2label,
    batch_size=BATCH_SIZE,
    device=torch.device(DEVICE),
)

Processing:   0%|          | 0/108 [00:00<?, ?it/s]

In [15]:
# Show 10 randomly chosen rows of the predictions table.
test_results_df.sample(10)

Unnamed: 0,Group Name,User Name,Facebook Id,Page Category,Page Admin Top Country,Page Description,Page Created,Likes at Posting,Followers at Posting,Post Created,...,Sponsor Id,Sponsor Name,Sponsor Category,Overperforming Score (weighted — Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x ),text,link_text,predict_1,probability_1,predict_2,probability_2
1312,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,109718.0,,2020-08-03 15:20:48 EDT,...,,,,5.08,,,,,,
4449,Protect Essential Workers - Global Coronavirus Action,,641756303052335,none,,,,,,2020-04-20 13:20:08 EDT,...,,,,1.42,"We are launching a global day of action on April 28. Too many of our colleagues, our friends and families, have already paid the ultimate price for the failures of our governments and employers around the world. Health workers are on the frontlines without, or with inadequate, Personal Protective Equipment or testing. Care workers are turning away from older people to sneeze. It's the only protection they both have. Non-essential work in construction, the service sector and industry undermin...",Coronavirus Global Day of Action We are calling on workers everywhere to join our global day of action for safety and security. This Workers Memorial Day - 28 April 2020 - our slogan means more than ever. Add your action to our map today.,Real,"[0.6934360265731812, 0.30656397342681885]",Real,"[0.9977990984916687, 0.002200827933847904]"
4744,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,107665.0,,2020-12-24 18:32:31 EST,...,,,,1.33,"Does anyone know what happens if say, we get the first shot and then decide to back out of the second? Basically like getting no shot at all? Just curious...it's the second one that makes me nervous with my brain/gut disorder (cvs).",,Fake,"[0.00819011777639389, 0.9918099045753479]",,
2109,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,12548.0,,2020-03-23 18:54:57 EDT,...,,,,3.17,,,,,,
60,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,110166.0,,2020-07-15 08:10:13 EDT,...,,,,46.58,,France to give $9 billion in pay raises to health care workers,,,Fake,"[0.35708892345428467, 0.6429110169410706]"
1939,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,107457.0,,2021-02-15 18:33:37 EST,...,,,,3.47,"8,400 vaccines redistributed when Texas power outage cut energy to vaccine freezers and back-up generator failed.. https://www.consumerreports.org/home-maintenance-repairs/how-to-keep-pipes-from-freezing/ 24 of 50 states are being affected by these weather systems.. 3 million Texans in power outage. They are purposefully turning portions of grid off to prevent further outage. Watching the weather issues across the mid-west from New York. My own home state of Minnesota was between -35 to -45 ...",,Real,"[0.9755354523658752, 0.024464568123221397]",,
3844,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,107781.0,,2021-01-11 11:23:35 EST,...,,,,1.69,"Got my 2nd dose, Pfizer, on Sat. @ 10 am. By 8 pm had some nausea, sore arm, back pain and low grade fever. Over the next 24 hrs: same plus chills and sweats. But woke up today and everything is resolved. Good luck team!!!",,Fake,"[0.11500360816717148, 0.8849963545799255]",,
5587,Covid19 Real Stories by Frontline and Affected People,,3938079882870550,none,,,,4211.0,,2020-07-04 12:50:32 EDT,...,,,,1.1,,Scientists say WHO ignores the risk that coronavirus floats in air as aerosol More than 200 researchers worldwide sign an open letter saying current guidance ignores evidence that the coronavirus readily spreads on microscopic particles known as aerosols that can hang in the air for long periods and float dozens of feet.,,,Fake,"[0.4271196126937866, 0.5728803277015686]"
2571,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,110397.0,,2020-06-22 21:24:07 EDT,...,,,,2.59,I work in a small hospital outside Houston. We test about 150+ a day. The positive numbers are definitely going up. But I'm noticing the symptoms are less severe among our positives.Is anyone noticing the same thing?,,Real,"[0.9826356172561646, 0.0173643846064806]",,
3117,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,109897.0,,2020-04-28 01:25:52 EDT,...,,,,2.12,,"New York State Nurses Association Crisis standards ≠ scientific standards. This pandemic is no excuse to prop up unproven methods for cleaning disposable respirators. Read our COVID-19 Protection Bulletin on the dangers of ""cleaning"" and reusing N95 respirators: bit.ly/noPPEreuse ⚠️",,,Real,"[0.5847894549369812, 0.4152105748653412]"


In [None]:
# Save the predictions table (without the index column) into the data folder.
# NOTE(review): the file name spelling ("complate") is kept unchanged because
# other code may read this exact path — confirm before renaming.
output_path = DATA_PATH / 'facebook_data_to_complate.xlsx'
test_results_df.to_excel(output_path, index=False)