# Применение обученной модели для определения фейковых фактов о COVID и вакцинации

In [1]:
import math

import torch
import pandas as pd
import numpy as np

In [2]:
# Probe both accelerators, then keep the first available backend in
# priority order: CUDA, then Apple MPS, then plain CPU as the fallback.
DEVICE = next(
    name
    for name, available in (
        ('cuda', torch.cuda.is_available()),
        ('mps', torch.backends.mps.is_available()),
        ('cpu', True),
    )
    if available
)

In [3]:
from pathlib import Path

# Working directories: the data root, a cache directory and the directory
# holding saved models (both nested under the data root).
DATA_PATH = Path('data/')
DATA_CACHE = DATA_PATH / 'cache_dir'
DATA_PATH_SAVE_MODELS = DATA_PATH / 'models'

# Create every directory up front; existing directories are left as they are.
for _directory in (DATA_PATH, DATA_CACHE, DATA_PATH_SAVE_MODELS):
    _directory.mkdir(parents=True, exist_ok=True)
del _directory

# Let pandas display up to 500 characters per cell.
pd.set_option('display.max_colwidth', 500)

In [4]:
# Name of the saved model directory (joined with DATA_PATH_SAVE_MODELS) and
# of the input spreadsheet (joined with DATA_PATH).
MODEL_NAME = "covid_vaccine_fake_model"
TEST_DF_NAME = "facebook_data_to_model.xlsx"

# Token sequence length passed to the dataset and batch size used for inference.
MAX_LENGTH = 128
BATCH_SIZE = 64

# Датасет

In [5]:
# Load the input spreadsheet (TEST_DF_NAME inside DATA_PATH) into a DataFrame.
data_df = pd.read_excel(DATA_PATH / TEST_DF_NAME)
# Show the first row as a quick check of the loaded columns.
data_df.head(1)

Unnamed: 0,Group Name,User Name,Facebook Id,Page Category,Page Admin Top Country,Page Description,Page Created,Likes at Posting,Followers at Posting,Post Created,...,Final Link,Image Text,Link Text,Description,Sponsor Id,Sponsor Name,Sponsor Category,Overperforming Score (weighted — Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x ),text,link_text
0,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,66994.0,,2020-04-01 22:42:32 EDT,...,,,,,,,,189.33,We extubated 2 covid patients today and they are doing awesome! Should be on a tele floor tomorrow! (TX),


In [6]:
# Class index -> human-readable label; the position in the tuple is the index.
idx2label = dict(enumerate(("Real", "Fake", "Comments")))

In [7]:
from typing import Dict, Tuple
import torch
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer
import pandas as pd


class TokenizedDataset(Dataset):
    """Dataset that lazily tokenizes two text columns of a DataFrame.

    Each item holds the token ids and attention mask for the main text column
    and for the link-text column. A missing or blank cell is represented by
    all-zero ``input_ids`` and ``attention_mask`` tensors of length
    ``max_length``.
    """

    def __init__(
        self,
        dataframe: pd.DataFrame,
        tokenizer: PreTrainedTokenizer,
        max_length: int,
        text_column: str = "text",
        link_text_column: str = "link_text",
        tensor_dtype: Tuple[torch.dtype, torch.dtype] = (torch.long, torch.long),
    ):
        """
        Store the data and tokenization settings; tokenization happens on access.

        Args:
            dataframe (pd.DataFrame): DataFrame holding the columns to tokenize.
            tokenizer (PreTrainedTokenizer): Tokenizer used to encode the text.
            max_length (int): Length every token sequence is padded/truncated to.
            text_column (str): Name of the main text column.
            link_text_column (str): Name of the additional text column.
            tensor_dtype (tuple): Dtypes for (input_ids, attention_mask).

        Raises:
            ValueError: If either column is absent from ``dataframe``.
        """
        # Validate first so a wrong column name fails before the frame is copied.
        for column in (text_column, link_text_column):
            if column not in dataframe.columns:
                raise ValueError(f"Колонка '{column}' отсутствует в DataFrame")

        self.dataframe = dataframe.copy()  # Copy so the caller's frame is never modified
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_column = text_column
        self.link_text_column = link_text_column
        self.tensor_dtype = tensor_dtype

    def __len__(self) -> int:
        """
        Return the number of rows in the dataset.
        """
        return len(self.dataframe)

    def _empty_tokens(self) -> Dict[str, torch.Tensor]:
        """
        Build the all-zero placeholder returned for missing or blank text.
        """
        return {
            "input_ids": torch.zeros(self.max_length, dtype=self.tensor_dtype[0]),
            "attention_mask": torch.zeros(self.max_length, dtype=self.tensor_dtype[1]),
        }

    def _tokenize_text(self, text: object) -> Dict[str, torch.Tensor]:
        """
        Tokenize one cell value, or return the all-zero placeholder if it is empty.

        Args:
            text: Cell value to tokenize. Missing values (None/NaN) and blank
                strings yield the placeholder; other non-string values
                (e.g. numbers parsed from the spreadsheet) are converted
                with ``str()`` before tokenization.

        Returns:
            Dict[str, torch.Tensor]: ``input_ids`` and ``attention_mask`` tensors.
        """
        if not isinstance(text, str):
            if pd.isna(text):
                return self._empty_tokens()
            # Numeric or other scalar cells have no .strip(); tokenize their text form.
            text = str(text)

        if not text.strip():
            return self._empty_tokens()

        tokens = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading batch axis; [0] drops it.
        return {
            "input_ids": tokens["input_ids"][0].to(dtype=self.tensor_dtype[0]),
            "attention_mask": tokens["attention_mask"][0].to(dtype=self.tensor_dtype[1]),
        }

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """
        Return the tokenized form of both text columns for one row.

        Args:
            idx (int): Positional index of the row.

        Returns:
            Dict[str, torch.Tensor]: Token ids and attention masks under the keys
            ``input_ids_text``, ``attention_mask_text``, ``input_ids_link`` and
            ``attention_mask_link``.
        """
        row = self.dataframe.iloc[idx]

        text_tokens = self._tokenize_text(row[self.text_column])
        link_text_tokens = self._tokenize_text(row[self.link_text_column])

        return {
            "input_ids_text": text_tokens["input_ids"],
            "attention_mask_text": text_tokens["attention_mask"],
            "input_ids_link": link_text_tokens["input_ids"],
            "attention_mask_link": link_text_tokens["attention_mask"],
        }


In [8]:
from transformers import RobertaTokenizer

# Load the tokenizer stored alongside the saved model.
tokenizer: RobertaTokenizer = RobertaTokenizer.from_pretrained(
    DATA_PATH_SAVE_MODELS / MODEL_NAME
)

# Wrap the loaded spreadsheet so both text columns are tokenized on access.
dataset = TokenizedDataset(
    dataframe=data_df,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
    text_column='text',
    link_text_column='link_text',
)

In [9]:
# Inspect one tokenized example (row 8) to check the output format.
dataset[8]

{'input_ids_text': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask_text': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids_link': tensor([    0, 34141, 39941,   231,    12,   180,    12,   2

# Модель

In [10]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Load the saved sequence classifier from DATA_PATH_SAVE_MODELS / MODEL_NAME.
model = RobertaForSequenceClassification.from_pretrained(
    DATA_PATH_SAVE_MODELS / MODEL_NAME)

# Move the model to the device selected in DEVICE.
model.to(DEVICE)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

# Тестирование

In [11]:
import torch
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import pandas as pd


def _predict_batch(model, input_ids, attention_mask):
    """Classify one batch of token ids, skipping placeholder rows.

    A row whose token ids are all zero is the dataset's placeholder for
    missing text; such rows are not sent through the model.

    Returns:
        tuple[list, list]: Two lists aligned with the batch rows — the
        predicted class index (int) and the list of class probabilities —
        holding ``None`` at every placeholder row.
    """
    # One vectorized check for the whole batch instead of a per-row sync.
    content_flags = (input_ids != 0).any(dim=1).tolist()
    labels = [None] * len(content_flags)
    probabilities = [None] * len(content_flags)

    kept_rows = [row for row, keep in enumerate(content_flags) if keep]
    if not kept_rows:
        return labels, probabilities

    # Run the model only on rows that actually contain text.
    logits = model(
        input_ids=input_ids[kept_rows],
        attention_mask=attention_mask[kept_rows],
    ).logits
    row_probabilities = torch.softmax(logits, dim=-1).cpu().tolist()
    row_labels = torch.argmax(logits, dim=-1).cpu().tolist()

    # Scatter the results back to their original positions in the batch.
    for row, label, probs in zip(kept_rows, row_labels, row_probabilities):
        labels[row] = label
        probabilities[row] = probs
    return labels, probabilities


def test_model_on_dataset(
    model: torch.nn.Module,
    dataset: TokenizedDataset,
    idx2label: dict,
    batch_size: int = 16,
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
) -> pd.DataFrame:
    """
    Run the dataset through the model and attach the predictions to a DataFrame.

    The model is switched to eval mode and moved to ``device`` as a side effect.

    Args:
        model (torch.nn.Module): Trained model; its output must expose ``.logits``.
        dataset (Dataset): Tokenized dataset providing the ``*_text`` and
            ``*_link`` tensors and the source ``dataframe``.
        idx2label (dict): Mapping from class index to label name.
        batch_size (int): Batch size for the DataLoader.
        device (torch.device): Device used for inference.

    Returns:
        pd.DataFrame: Copy of ``dataset.dataframe`` (index reset) with added columns:
            - 'predict_1': predicted label for the main text column,
            - 'probability_1': class probabilities for the main text column,
            - 'predict_2': predicted label for the additional column,
            - 'probability_2': class probabilities for the additional column.
        Rows without text in a column get ``None`` in that column's outputs.
    """
    model.eval()
    model.to(device)

    # shuffle=False keeps predictions aligned with the DataFrame row order.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    predictions_1, probabilities_1 = [], []
    predictions_2, probabilities_2 = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Processing"):
            # Main column (text)
            labels, probabilities = _predict_batch(
                model,
                batch["input_ids_text"].to(device),
                batch["attention_mask_text"].to(device),
            )
            predictions_1.extend(labels)
            probabilities_1.extend(probabilities)

            # Additional column (link_text)
            labels, probabilities = _predict_batch(
                model,
                batch["input_ids_link"].to(device),
                batch["attention_mask_link"].to(device),
            )
            predictions_2.extend(labels)
            probabilities_2.extend(probabilities)

    # Reset the index so the positional prediction lists line up with the rows.
    test_df = dataset.dataframe.copy().reset_index(drop=True)
    test_df["predict_1"] = [idx2label[p] if p is not None else None for p in predictions_1]
    test_df["probability_1"] = probabilities_1
    test_df["predict_2"] = [idx2label[p] if p is not None else None for p in predictions_2]
    test_df["probability_2"] = probabilities_2

    return test_df


In [12]:
# Run inference on the device already selected in DEVICE (which may be MPS),
# so the model is not moved away from the device it was loaded onto.
test_results_df = test_model_on_dataset(
    model=model,
    dataset=dataset,
    idx2label=idx2label,
    batch_size=BATCH_SIZE,
    device=torch.device(DEVICE),
)

Processing:   0%|          | 0/108 [00:00<?, ?it/s]

In [17]:
# Look at ten random rows of the results as a spot check.
test_results_df.sample(10)

Unnamed: 0,Group Name,User Name,Facebook Id,Page Category,Page Admin Top Country,Page Description,Page Created,Likes at Posting,Followers at Posting,Post Created,...,Sponsor Id,Sponsor Name,Sponsor Category,Overperforming Score (weighted — Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x ),text,link_text,predict_1,probability_1,predict_2,probability_2
4333,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,110219.0,,2020-07-12 15:33:29 EDT,...,,,,1.47,,Yoly Robles - Social Media Promotions Here's a bilingual visual. 👇👇👇Maybe that way it's easier to understand 🤷‍♀️🤷‍♀️👇👇,,,Real,"[0.7545319199562073, 0.24436169862747192, 0.0011063381098210812]"
2672,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,107688.0,,2020-12-05 18:03:41 EST,...,,,,2.48,"People with glasses...how do you handle a N95 and face shield and still see? Like I feel like I am bumbling around like a toddler, my depth perception seems to be off.",,Comments,"[0.00012494047405198216, 0.0007247808389365673, 0.999150276184082]",,
4343,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,107665.0,,2020-12-23 22:56:30 EST,...,,,,1.46,Vaccine ✔️✔️✔️,,Fake,"[0.14518825709819794, 0.8448783755302429, 0.009933377616107464]",,
3998,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,110769.0,,2020-06-05 21:14:12 EDT,...,,,,1.62,"Not sure if folks will find this interesting but managing COVID is about more than just science...there are huge public policy questions related to how we organize ourselves, manage the COVID response etc. In British Columbia, one public health physician has led our response which increasingly appears very effective. At a time when there is lots of meanness in general and lots of critical commentary about pandemic response, this lady is all about kindness. For those of us who occupy leadersh...","The Top Doctor Who Aced the Coronavirus Test Dr. Bonnie Henry kept the disease in check in British Columbia without harsh enforcement methods. Now, she is leading the way out of lockdown.",Comments,"[0.13353030383586884, 0.1755475550889969, 0.6909221410751343]",Real,"[0.5223469138145447, 0.4762496054172516, 0.0014035163912922144]"
3639,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,107710.0,,2021-01-18 14:50:53 EST,...,,,,1.8,So happy to be part of the beginning of the end of Covid19!,,Fake,"[0.1162431389093399, 0.8755315542221069, 0.008225289173424244]",,
2237,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,66994.0,,2020-04-01 23:31:12 EDT,...,,,,2.98,,"Some Coronavirus Patients Show Signs of Brain Ailments Doctors have observed neurological symptoms, including confusion, stroke and seizures, in a small subset of Covid-19 patients.",,,Fake,"[0.20593039691448212, 0.7908771634101868, 0.0031924720387905836]"
2113,"NHS, Key Workers And The World Appreciation Page",,1055974798109854,none,,,,,,2020-03-26 16:42:49 EDT,...,,,,3.17,"Lets do it again. Every night 8pm. 2 minute claps, cheers, fireworks, honk them horns and whatever else we can do!!! Lets celebrate everything and everyone keeping us moving during Covid !!!! Get this shared and get this going!! Post pictures videos anything. Pictures in your windows your kids draw !! Come on UK. Every single night till we beat this !!!!! Yano what every country 8pm show your support to them !!! Please everyone share!! Everyone invite. Get this going and show we care for tho...",,Comments,"[0.00631249975413084, 0.01087009534239769, 0.9828174114227295]",,
1608,Health Care Assistants HCA / Nurses / Support Workers - JOBS (UK),,629521900975276,none,,,,,,2022-04-23 04:23:59 EDT,...,,,,4.23,,,,,,
609,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,110870.0,,2020-06-03 17:33:58 EDT,...,,,,10.19,"Article on healthcare worker infections rates and deaths globally, with input from the ICN (International Council of Nurses). Whats shocking is that according to their sources, rates of infection for nurses in Ireland run as high as 30%. ""Infection rates among healthcare workers vary greatly between countries, with fewer than 1% in Singapore and more than 30% in Ireland, it said.""","COVID-19 death toll among nurses doubled in past month, says nurses group More than 600 nurses worldwide are known to have died from COVID-19, which has infected an estimated 450,000 healthcare workers, the International Council of Nurses said on Wednesday.",Real,"[0.7326670289039612, 0.25797009468078613, 0.009362868964672089]",Fake,"[0.35285013914108276, 0.6447829604148865, 0.0023668843787163496]"
5662,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,62437.0,,2020-04-01 17:17:24 EDT,...,,,,1.07,What do you think the aftermath of this experience will be for us in health care... and or in the general public?,,Comments,"[0.0022377441637218, 0.0023775917943567038, 0.9953846335411072]",,


In [15]:
# Label distribution for the main text column; value_counts() leaves out
# rows that have no prediction (missing values) by default.
test_results_df['predict_1'].value_counts()

Comments    3474
Fake         950
Real         542
Name: predict_1, dtype: int64

In [14]:
# Save the DataFrame with predictions to an Excel file inside DATA_PATH.
# NOTE(review): "complate" in the file name looks like a typo for "complete" —
# confirm nothing else reads this exact name before renaming.
test_results_df.to_excel(DATA_PATH / 'facebook_data_to_complate.xlsx', index=False)