# Обучение определению фейковых фактов о COVID и вакцинации

In [16]:
import math

import torch
import pandas as pd
import numpy as np

In [17]:
# Choose the compute device. The fallback is CPU, Apple MPS is preferred over
# CPU when present, and CUDA overrides both when it is available.
DEVICE = 'mps' if torch.backends.mps.is_available() else 'cpu'
DEVICE = 'cuda' if torch.cuda.is_available() else DEVICE

In [18]:
from pathlib import Path

# Working directories used by the notebook: raw data, cache, saved models.
DATA_PATH = Path('data/')
DATA_CACHE = Path('data/cache_dir/')
DATA_PATH_SAVE_MODELS = Path('data/models/')

# Create every directory up front; existing directories are left untouched.
for _directory in (DATA_PATH, DATA_CACHE, DATA_PATH_SAVE_MODELS):
    _directory.mkdir(parents=True, exist_ok=True)

# Show up to 500 characters per cell so long post texts stay readable.
pd.set_option('display.max_colwidth', 500)

In [19]:
# Tokenization / batching parameters.
MAX_LENGTH, BATCH_SIZE = 128, 64

# Names of the saved model directory and of the spreadsheet to score.
TEST_DF_NAME = "facebook_data_to_model.xlsx"
MODEL_NAME = "covid_vaccine_fake_model"

# Датасет

In [20]:
# Read the spreadsheet with the posts that will be scored by the model.
source_file = DATA_PATH / TEST_DF_NAME
data_df = pd.read_excel(source_file)

# Cell output: preview the first row to check the available columns.
data_df.head(1)

Unnamed: 0,Group Name,User Name,Facebook Id,Page Category,Page Admin Top Country,Page Description,Page Created,Likes at Posting,Followers at Posting,Post Created,...,Final Link,Image Text,Link Text,Description,Sponsor Id,Sponsor Name,Sponsor Category,Overperforming Score (weighted — Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x ),text,link_text
0,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,66994.0,,2020-04-01 22:42:32 EDT,...,,,,,,,,189.33,We extubated 2 covid patients today and they are doing awesome! Should be on a tele floor tomorrow! (TX),


In [21]:
# Class index -> human-readable label (position in the tuple is the index).
idx2label = dict(enumerate(("Real", "Fake")))

In [22]:
from typing import Dict, Tuple
import torch
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer
import pandas as pd


class TokenizedDataset(Dataset):
    """Dataset that lazily tokenizes two text columns of a DataFrame.

    Each item is tokenized on access. A missing or blank cell is represented
    by all-zero ``input_ids`` and ``attention_mask`` tensors of length
    ``max_length``.
    """

    def __init__(
        self,
        dataframe: pd.DataFrame,
        # Quoted so the annotation is not evaluated when the class is defined.
        tokenizer: "PreTrainedTokenizer",
        max_length: int,
        text_column: str = "text",
        link_text_column: str = "link_text",
        tensor_dtype: Tuple[torch.dtype, torch.dtype] = (torch.long, torch.long),
    ):
        """
        Initialize the dataset.

        Args:
            dataframe (pd.DataFrame): DataFrame holding the columns to tokenize.
            tokenizer (PreTrainedTokenizer): Tokenizer used to encode the text.
            max_length (int): Length every encoded sequence is padded/truncated to.
            text_column (str): Name of the main text column.
            link_text_column (str): Name of the additional text column.
            tensor_dtype (tuple): Dtypes for (input_ids, attention_mask).

        Raises:
            ValueError: If either column is missing from ``dataframe``.
        """
        # Validate first so a bad column name fails before the copy is made.
        for column in (text_column, link_text_column):
            if column not in dataframe.columns:
                raise ValueError(f"Колонка '{column}' отсутствует в DataFrame")

        # Work on a copy so the caller's DataFrame is never modified.
        self.dataframe = dataframe.copy()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_column = text_column
        self.link_text_column = link_text_column
        self.tensor_dtype = tensor_dtype

    def __len__(self) -> int:
        """
        Return the number of rows in the dataset.
        """
        return len(self.dataframe)

    def _empty_encoding(self) -> Dict[str, torch.Tensor]:
        """
        Build the all-zero placeholder returned for missing or blank text.
        """
        ids_dtype, mask_dtype = self.tensor_dtype
        return {
            "input_ids": torch.zeros(self.max_length, dtype=ids_dtype),
            "attention_mask": torch.zeros(self.max_length, dtype=mask_dtype),
        }

    def _tokenize_text(self, text: str) -> Dict[str, torch.Tensor]:
        """
        Tokenize one cell value, or return zero tensors when it is empty.

        Args:
            text (str): Cell value to tokenize. Missing values (None/NaN/NA)
                and blank strings produce the zero placeholder. Non-string
                values (e.g. numeric spreadsheet cells) are converted with
                ``str()`` instead of failing on ``.strip()``.

        Returns:
            Dict[str, torch.Tensor]: ``input_ids`` and ``attention_mask``.
        """
        if pd.isna(text):
            return self._empty_encoding()

        # Spreadsheet cells are not guaranteed to be strings.
        if not isinstance(text, str):
            text = str(text)
        if text.strip() == "":
            return self._empty_encoding()

        tokens = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        ids_dtype, mask_dtype = self.tensor_dtype
        # return_tensors="pt" yields a leading batch dimension; take row 0.
        return {
            "input_ids": tokens["input_ids"][0].to(dtype=ids_dtype),
            "attention_mask": tokens["attention_mask"][0].to(dtype=mask_dtype),
        }

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """
        Return the tokenized pair of texts for one row.

        Args:
            idx (int): Positional index of the row.

        Returns:
            Dict[str, torch.Tensor]: ``input_ids_*`` / ``attention_mask_*``
            tensors for the main text (``_text``) and the link text (``_link``).
        """
        # Read just the two needed cells instead of building a Series for the
        # whole (wide) row on every access.
        text_tokens = self._tokenize_text(
            self.dataframe[self.text_column].iloc[idx]
        )
        link_text_tokens = self._tokenize_text(
            self.dataframe[self.link_text_column].iloc[idx]
        )

        return {
            "input_ids_text": text_tokens["input_ids"],
            "attention_mask_text": text_tokens["attention_mask"],
            "input_ids_link": link_text_tokens["input_ids"],
            "attention_mask_link": link_text_tokens["attention_mask"],
        }


In [23]:
from transformers import RobertaTokenizer

# Load the tokenizer from the saved model directory, then wrap the loaded
# posts in the lazily-tokenizing dataset.
tokenizer_dir = DATA_PATH_SAVE_MODELS / MODEL_NAME
tokenizer: RobertaTokenizer = RobertaTokenizer.from_pretrained(tokenizer_dir)

dataset = TokenizedDataset(
    dataframe=data_df,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
    text_column='text',
    link_text_column='link_text',
)

In [24]:
# Cell output: inspect the tokenized item at position 8 as a sanity check.
dataset[8]

{'input_ids_text': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask_text': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids_link': tensor([    0, 34141, 39941,   231,    12,   180,    12,   2

# Модель

In [25]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Load the saved classifier from the models directory.
model_dir = DATA_PATH_SAVE_MODELS / MODEL_NAME
model = RobertaForSequenceClassification.from_pretrained(model_dir)

# Last expression of the cell: moves the model to DEVICE and displays it.
model.to(DEVICE)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

# Тестирование

In [26]:
import torch
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import pandas as pd


def _predict_column(
    model: torch.nn.Module,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    device: torch.device,
) -> tuple:
    """
    Classify one tokenized column of a batch.

    Rows whose ``input_ids`` are all zero (the dataset's placeholder for
    missing text) are not sent to the model and yield ``None``.

    Args:
        model (torch.nn.Module): Model whose output exposes ``.logits``.
        input_ids (torch.Tensor): Token ids, one row per example.
        attention_mask (torch.Tensor): Attention mask matching ``input_ids``.
        device (torch.device): Device the tensors are moved to.

    Returns:
        tuple: ``(predictions, probabilities)`` - two lists with one entry per
        row: the argmax class index and the list of softmax probabilities, or
        ``None`` for rows without content.
    """
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    # Single vectorized check instead of one device round-trip per row.
    has_content = (input_ids != 0).any(dim=1)
    content_flags = has_content.tolist()

    predictions = [None] * len(content_flags)
    probabilities = [None] * len(content_flags)

    content_rows = [row for row, flag in enumerate(content_flags) if flag]
    if not content_rows:
        return predictions, probabilities

    # Only rows that actually contain text go through the model.
    logits = model(
        input_ids=input_ids[has_content],
        attention_mask=attention_mask[has_content],
    ).logits
    row_probabilities = torch.softmax(logits, dim=-1).cpu().tolist()
    row_predictions = torch.argmax(logits, dim=-1).cpu().tolist()

    # Scatter the results back to their original positions in the batch.
    for row, pred, prob in zip(content_rows, row_predictions, row_probabilities):
        predictions[row] = pred
        probabilities[row] = prob

    return predictions, probabilities


def test_model_on_dataset(
    model: torch.nn.Module,
    dataset: TokenizedDataset,
    idx2label: dict,
    batch_size: int = 16,
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
) -> pd.DataFrame:
    """
    Run the dataset through the model and attach the predictions to a DataFrame.

    Args:
        model (torch.nn.Module): Trained model.
        dataset (Dataset): Tokenized dataset; must expose ``dataframe``.
        idx2label (dict): Mapping from class index to label name.
        batch_size (int): Batch size for the DataLoader.
        device (torch.device): Device used for inference (CPU/GPU).

    Returns:
        pd.DataFrame: Copy of ``dataset.dataframe`` (index reset) with the
        added columns:
            - 'predict_1': predicted label for the main text column,
            - 'probability_1': class probabilities for the main text column,
            - 'predict_2': predicted label for the additional text column,
            - 'probability_2': class probabilities for the additional column.
        Rows without text in a column get ``None`` in that column's outputs.
    """
    model.eval()
    model.to(device)

    # shuffle=False keeps predictions aligned with the DataFrame row order.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    predictions_1, probabilities_1 = [], []
    predictions_2, probabilities_2 = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Processing"):
            # Main column (text)
            preds, probs = _predict_column(
                model, batch["input_ids_text"], batch["attention_mask_text"], device
            )
            predictions_1.extend(preds)
            probabilities_1.extend(probs)

            # Additional column (link_text)
            preds, probs = _predict_column(
                model, batch["input_ids_link"], batch["attention_mask_link"], device
            )
            predictions_2.extend(preds)
            probabilities_2.extend(probs)

    # Attach the predictions to a fresh copy of the source DataFrame.
    test_df = dataset.dataframe.copy().reset_index(drop=True)
    test_df["predict_1"] = [idx2label[p] if p is not None else None for p in predictions_1]
    test_df["probability_1"] = probabilities_1
    test_df["predict_2"] = [idx2label[p] if p is not None else None for p in predictions_2]
    test_df["probability_2"] = probabilities_2

    return test_df


In [27]:
# Run inference on the DEVICE selected at the top of the notebook (the same
# device the model was moved to), so an MPS-placed model is not pushed back
# to the CPU by a separate cuda-or-cpu check.
test_results_df = test_model_on_dataset(
    model=model,
    dataset=dataset,
    idx2label=idx2label,
    batch_size=BATCH_SIZE,
    device=torch.device(DEVICE),
)

Processing:   0%|          | 0/108 [00:00<?, ?it/s]

In [28]:
# Cell output: show 10 randomly sampled rows to spot-check the predictions.
test_results_df.sample(10)

Unnamed: 0,Group Name,User Name,Facebook Id,Page Category,Page Admin Top Country,Page Description,Page Created,Likes at Posting,Followers at Posting,Post Created,...,Sponsor Id,Sponsor Name,Sponsor Category,Overperforming Score (weighted — Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x ),text,link_text,predict_1,probability_1,predict_2,probability_2
5399,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,107559.0,,2021-02-06 08:41:34 EST,...,,,,1.14,Hey was wondering if anyone that received both doses of the vaccine got tested for antibodies yet?? And how long after?? I keep hearing mixed time frames!!,,Fake,"[0.0518883541226387, 0.9481115937232971]",,
4550,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,18212.0,,2020-03-25 18:01:24 EDT,...,,,,1.39,"#3 Westchester NY LMT, INHC Oh thank god ! General Motors, Ford, 3M and the negotiation ability’s of States at the forefront",KARE 11 WATCH LIVE: The White House coronavirus task force provides an update in its daily briefing.,Real,"[0.9917660355567932, 0.008234018459916115]",Real,"[0.8347105383872986, 0.16528943181037903]"
5692,Health Care Assistants HCA / Nurses / Support Workers - JOBS (UK),,629521900975276,none,,,,15657.0,,2022-05-16 10:50:34 EDT,...,,,,1.05,Interview taking place every week for NHS Trust in ESSEX if you have passed ILETS or OET please contact me at fabiola@global-medical-pro.com You must have minimum of 12 months experience with some experience in mental health and are looking to pursue a career in mental health in the UK this is a job opportunity for you😊,"Global Medical Professionals Our mental health is something most of us avoid seeking help for…\n\nI often question this because, if we broke a bone we would seek help, if we were in pain elsewhere we would seek medical help. \n\nOur mental health is also so important and when things are not right, when we are struggling mentally, we should ask for help. \n\nMental health services are in demand here in the U.K. and that means the demand for more experienced mental health nurses.\n\nLike all n...",Real,"[0.990220308303833, 0.00977969728410244]",Real,"[0.6824526190757751, 0.31754738092422485]"
1969,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,108534.0,,2021-10-31 10:25:09 EDT,...,,,,3.41,Could someone explain exactly how the vaccine causes myocarditis?,,Fake,"[0.0009460931178182364, 0.9990538954734802]",,
1905,Health Care Assistants HCA / Nurses / Support Workers - JOBS (UK),,1570841303216433,none,,,,33338.0,,2021-04-23 00:45:37 EDT,...,,,,3.55,,,,,,
2882,Health Care Assistants HCA / Nurses / Support Workers - JOBS (UK),,629521900975276,none,,,,36597.0,,2022-11-08 12:06:44 EST,...,,,,2.3,,"The Salvation Army’s Older People’s Services Do you want to work in a Care Home with a difference? Everything we do for our Older People’s Service is “Rooted in Love” our values of Integrity, compassion, passion, respect; boldness and accountability are at the heart of all we do.\n\nTeam Leader\n\nLocation: Youell Court Care - Care Home, Skipworth Road, Binley, Coventry, CV3 2XA\n\nWorking hours: 35 hours per week\n\nContract: Permanent\n\nhttps://uk.indeed.com/rc/clk?jk=8eb27fc45bdf6e9b&fcc...",,,Fake,"[0.2869172990322113, 0.7130827307701111]"
5124,Health Care Assistants HCA / Nurses / Support Workers - JOBS (UK),,1570841303216433,none,,,,45334.0,,2022-03-16 06:09:22 EDT,...,,,,1.21,WhatsApp me on +971556878369 Or email your docs to sherin.rose@neptuneinternational.org,,Fake,"[0.04681093245744705, 0.9531890749931335]",,
3547,Health Care Assistants HCA / Nurses / Support Workers - JOBS (UK),,1570841303216433,none,,,,32178.0,,2020-11-19 16:06:12 EST,...,,,,1.85,,,,,,
5730,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,108383.0,,2021-11-14 23:46:13 EST,...,,,,1.05,Just had 2 friends test positive for Covid. We called them (they are husband and wife). Explained that while they had at home tests that were positive they should check with their docs and if symptoms get worse to head to the hospital. Needless to say our suggestions were not taken well. Trying to help....they don't want it. Ugh!!!! They are our best friends. No response needed ...just want to vent. Also....our nephew (who was positive weeks ago) is doing great. Home ...no O2. Thanks to all ...,,Fake,"[0.2784062325954437, 0.7215937972068787]",,
537,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,110462.0,,2020-06-28 05:46:50 EDT,...,,,,11.52,"Just taking a short break from COVID related posts. As a CNA with many high-fall risk patients, this is too relatable",,Real,"[0.8647623062133789, 0.1352376490831375]",,


In [29]:
# Save the scored DataFrame to the data directory; index=False leaves the
# DataFrame index out of the spreadsheet.
# NOTE(review): the file name reads "complate" - possibly meant "complete";
# left unchanged because other tooling may expect this exact name.
test_results_df.to_excel(DATA_PATH / 'facebook_data_to_complate.xlsx', index=False)