# Обучение определения фейковых фактов о COVID и вакцинации

In [1]:
import math

import torch
import pandas as pd
import numpy as np

In [2]:
DEVICE = 'cpu'
if torch.backends.mps.is_available():
    DEVICE = 'mps'
if torch.cuda.is_available():
    DEVICE = 'cuda'

In [3]:
from pathlib import Path

DATA_PATH = Path('data/')
DATA_PATH.mkdir(parents=True, exist_ok=True)

DATA_CACHE = Path('data/cache_dir/')
DATA_CACHE.mkdir(parents=True, exist_ok=True)

DATA_PATH_SAVE_MODELS = Path('data/models/')
DATA_PATH_SAVE_MODELS.mkdir(parents=True, exist_ok=True)

pd.set_option('display.max_colwidth', 500) 

In [4]:
MODEL_NAME = "covid_vaccine_fake_model"
TEST_DF_NAME = "facebook_data_to_model.xlsx"

MAX_LENGTH = 128
BATCH_SIZE = 64

# Датасет

In [5]:
data_df = pd.read_excel(DATA_PATH / TEST_DF_NAME)
data_df.head(1)

Unnamed: 0,Group Name,User Name,Facebook Id,Page Category,Page Admin Top Country,Page Description,Page Created,Likes at Posting,Followers at Posting,Post Created,...,Final Link,Image Text,Link Text,Description,Sponsor Id,Sponsor Name,Sponsor Category,Overperforming Score (weighted — Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x ),text,link_text
0,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,66994.0,,2020-04-01 22:42:32 EDT,...,,,,,,,,189.33,We extubated 2 covid patients today and they are doing awesome! Should be on a tele floor tomorrow! (TX),


In [6]:
# data_df = data_df.sample(500)

In [7]:
idx2label = {
    0: "Real",
    1: "Fake",
    2: "Comments"
}

In [8]:
from typing import Dict, Tuple
import torch
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer
import pandas as pd


class TokenizedDataset(Dataset):
    def __init__(
        self,
        dataframe: pd.DataFrame,
        tokenizer: PreTrainedTokenizer,
        max_length: int,
        text_column: str = "text",
        link_text_column: str = "link_text",
        tensor_dtype: Tuple[torch.dtype, torch.dtype] = (torch.long, torch.long),
    ):
        """
        Инициализация датасета с ленивой токенизацией.

        Args:
            dataframe (pd.DataFrame): DataFrame с колонками для токенизации.
            tokenizer (PreTrainedTokenizer): Токенайзер для преобразования текста.
            max_length (int): Максимальная длина токенов.
            text_column (str): Название основной текстовой колонки.
            link_text_column (str): Название колонки с дополнительным текстом.
            tensor_dtype (tuple): Типы данных для токенов (input_ids, attention_mask).
        """
        self.dataframe = dataframe.copy()  # Копируем DataFrame, чтобы избежать изменений в оригинале
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_column = text_column
        self.link_text_column = link_text_column
        self.tensor_dtype = tensor_dtype

        # Проверка, есть ли указанные колонки в DataFrame
        if text_column not in dataframe.columns:
            raise ValueError(f"Колонка '{text_column}' отсутствует в DataFrame")
        if link_text_column not in dataframe.columns:
            raise ValueError(f"Колонка '{link_text_column}' отсутствует в DataFrame")

    def __len__(self) -> int:
        """
        Возвращает количество примеров в датасете.
        """
        return len(self.dataframe)

    def _tokenize_text(self, text: str) -> Dict[str, torch.Tensor]:
        """
        Токенизирует текст, если он не пустой, иначе возвращает тензоры с нулями.

        Args:
            text (str): Текст для токенизации.

        Returns:
            Dict[str, torch.Tensor]: Тензоры input_ids и attention_mask.
        """
        if pd.isna(text) or text.strip() == "":
            return {
                "input_ids": torch.zeros(self.max_length, dtype=self.tensor_dtype[0]),
                "attention_mask": torch.zeros(self.max_length, dtype=self.tensor_dtype[1]),
            }
        else:
            tokens = self.tokenizer(
                text,
                max_length=self.max_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )
            return {
                "input_ids": tokens["input_ids"][0].to(dtype=self.tensor_dtype[0]),
                "attention_mask": tokens["attention_mask"][0].to(dtype=self.tensor_dtype[1]),
            }

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """
        Возвращает токенизированные данные.

        Args:
            idx (int): Индекс примера.

        Returns:
            Dict[str, torch.Tensor]: Словарь с токенами из двух колонок.
        """
        row = self.dataframe.iloc[idx]

        text_tokens = self._tokenize_text(row[self.text_column])

        link_text_tokens = self._tokenize_text(row[self.link_text_column])

        return {
            "input_ids_text": text_tokens["input_ids"],
            "attention_mask_text": text_tokens["attention_mask"],
            "input_ids_link": link_text_tokens["input_ids"],
            "attention_mask_link": link_text_tokens["attention_mask"],
        }


In [9]:
from transformers import RobertaTokenizer

tokenizer: RobertaTokenizer = RobertaTokenizer.from_pretrained(DATA_PATH_SAVE_MODELS / MODEL_NAME)

dataset = TokenizedDataset(data_df, tokenizer, MAX_LENGTH, text_column='text', link_text_column='link_text')

In [10]:
dataset[8]

{'input_ids_text': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask_text': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids_link': tensor([    0, 40827,   254, 22208,  5504,  5560,  2799,    

# Модель

In [11]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

model = RobertaForSequenceClassification.from_pretrained(
    DATA_PATH_SAVE_MODELS / MODEL_NAME)

model.to(DEVICE)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

# Тестирование

In [12]:
import torch
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import pandas as pd
import torch.nn.functional as F
from typing import Dict


def test_model_on_dataset(
    model: torch.nn.Module,
    dataset: TokenizedDataset,
    idx2label: Dict[int, str],
    entropy_thresholds: Dict[int, float],  # Пороги энтропии по категориям
    batch_size: int = 16,
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
) -> pd.DataFrame:
    """
    Прогоняет датасет через модель и добавляет предсказания, энтропию и её пороги в DataFrame.

    Args:
        model (torch.nn.Module): Обученная модель.
        dataset (Dataset): Токенизированный датасет.
        idx2label (dict): Словарь, отображающий индексы категорий в названия.
        entropy_thresholds (dict): Оптимальные пороги энтропии по категориям.
        batch_size (int): Размер батча для DataLoader.
        device (torch.device): Устройство для вычислений (CPU/GPU).

    Returns:
        pd.DataFrame: DataFrame с добавленными предсказаниями, включая:
            - 'predict_1', 'probability_1', 'entropy_1', 'entropy_threshold_1', 'passed_threshold_1'
            - 'predict_2', 'probability_2', 'entropy_2', 'entropy_threshold_2', 'passed_threshold_2'
    """
    model.eval()
    model.to(device)

    # Создаём DataLoader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    test_df = dataset.dataframe.copy()

    # Списки для хранения предсказаний и метрик
    predictions_1, probabilities_1, entropies_1, entropy_thresholds_1, passed_thresholds_1 = [], [], [], [], []
    predictions_2, probabilities_2, entropies_2, entropy_thresholds_2, passed_thresholds_2 = [], [], [], [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Processing"):
            batch_size_current = batch["input_ids_text"].shape[0]

            # === Обрабатываем основную колонку (text) ===
            input_ids_text = batch["input_ids_text"].to(device)
            attention_mask_text = batch["attention_mask_text"].to(device)

            text_has_content = [torch.any(input_ids_text[i] != 0).item() for i in range(batch_size_current)]
            if any(text_has_content):  # Если хотя бы в одном есть текст
                logits = model(input_ids=input_ids_text, attention_mask=attention_mask_text).logits
                probs = torch.softmax(logits, dim=-1)
                entropy = -torch.sum(probs * torch.log2(probs + 1e-15), dim=1)

                preds = torch.argmax(probs, dim=-1).cpu().numpy()
                probs = probs.cpu().numpy()
                entropy = entropy.cpu().numpy()

                for i in range(batch_size_current):
                    if text_has_content[i]:
                        entropy_threshold = entropy_thresholds.get(preds[i], None)
                        passed_threshold = entropy[i] > entropy_threshold if entropy_threshold is not None else None

                        predictions_1.append(preds[i])
                        probabilities_1.append(probs[i].tolist())
                        entropies_1.append(entropy[i])
                        entropy_thresholds_1.append(entropy_threshold)
                        passed_thresholds_1.append(passed_threshold)
                    else:
                        predictions_1.append(None)
                        probabilities_1.append(None)
                        entropies_1.append(None)
                        entropy_thresholds_1.append(None)
                        passed_thresholds_1.append(None)
            else:
                predictions_1.extend([None] * batch_size_current)
                probabilities_1.extend([None] * batch_size_current)
                entropies_1.extend([None] * batch_size_current)
                entropy_thresholds_1.extend([None] * batch_size_current)
                passed_thresholds_1.extend([None] * batch_size_current)

            # === Обрабатываем дополнительную колонку (link_text) ===
            input_ids_link = batch["input_ids_link"].to(device)
            attention_mask_link = batch["attention_mask_link"].to(device)

            link_text_has_content = [torch.any(input_ids_link[i] != 0).item() for i in range(batch_size_current)]
            if any(link_text_has_content):  # Если хотя бы в одном есть текст
                logits = model(input_ids=input_ids_link, attention_mask=attention_mask_link).logits
                probs = torch.softmax(logits, dim=-1)
                entropy = -torch.sum(probs * torch.log2(probs + 1e-15), dim=1)

                preds = torch.argmax(probs, dim=-1).cpu().numpy()
                probs = probs.cpu().numpy()
                entropy = entropy.cpu().numpy()

                for i in range(batch_size_current):
                    if link_text_has_content[i]:
                        entropy_threshold = entropy_thresholds.get(preds[i], None)
                        passed_threshold = entropy[i] < entropy_threshold if entropy_threshold is not None else None

                        predictions_2.append(preds[i])
                        probabilities_2.append(probs[i].tolist())
                        entropies_2.append(entropy[i])
                        entropy_thresholds_2.append(entropy_threshold)
                        passed_thresholds_2.append(passed_threshold)
                    else:
                        predictions_2.append(None)
                        probabilities_2.append(None)
                        entropies_2.append(None)
                        entropy_thresholds_2.append(None)
                        passed_thresholds_2.append(None)
            else:
                predictions_2.extend([None] * batch_size_current)
                probabilities_2.extend([None] * batch_size_current)
                entropies_2.extend([None] * batch_size_current)
                entropy_thresholds_2.extend([None] * batch_size_current)
                passed_thresholds_2.extend([None] * batch_size_current)

    # Преобразуем предсказания в DataFrame
    test_df = test_df.reset_index(drop=True)
    test_df["predict_1"] = [idx2label[p] if p is not None else None for p in predictions_1]
    test_df["probability_1"] = probabilities_1
    test_df["entropy_1"] = entropies_1
    test_df["entropy_threshold_1"] = entropy_thresholds_1
    test_df["passed_threshold_1"] = passed_thresholds_1

    test_df["predict_2"] = [idx2label[p] if p is not None else None for p in predictions_2]
    test_df["probability_2"] = probabilities_2
    test_df["entropy_2"] = entropies_2
    test_df["entropy_threshold_2"] = entropy_thresholds_2
    test_df["passed_threshold_2"] = passed_thresholds_2

    return test_df


In [13]:
entropy_thresholds = {1: 0.5176525712013245, 0: 0.6887069940567017, 2: 0.3750338852405548}

test_results_df = test_model_on_dataset(
    model=model,
    dataset=dataset,
    idx2label=idx2label,
    batch_size=BATCH_SIZE,
    entropy_thresholds=entropy_thresholds,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

Processing:   0%|          | 0/8 [00:00<?, ?it/s]

In [14]:
test_results_df.sample(10)

Unnamed: 0,Group Name,User Name,Facebook Id,Page Category,Page Admin Top Country,Page Description,Page Created,Likes at Posting,Followers at Posting,Post Created,...,predict_1,probability_1,entropy_1,entropy_threshold_1,passed_threshold_1,predict_2,probability_2,entropy_2,entropy_threshold_2,passed_threshold_2
103,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,109582.0,,2020-08-11 21:46:31 EDT,...,Comments,"[0.00010242078133160248, 0.000590130512136966, 0.99930739402771]",0.008686,0.375034,False,Comments,"[0.000405871425755322, 0.003203626722097397, 0.9963905215263367]",0.036316,0.375034,True
480,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,48860.0,,2020-03-29 15:58:29 EDT,...,Comments,"[0.009789294563233852, 0.022784806787967682, 0.9674258828163147]",0.235869,0.375034,False,,,,,
124,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,107766.0,,2020-12-17 23:30:26 EST,...,Comments,"[0.0003218159545212984, 0.001806577667593956, 0.9978716373443604]",0.023263,0.375034,False,,,,,
432,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,107842.0,,2020-11-18 20:41:03 EST,...,Comments,"[0.06370309740304947, 0.16327203810214996, 0.7730249166488647]",0.967071,0.375034,True,,,,,
9,Covid 19 - Newmill Community Support,,1556784341144788,none,,,,,,2020-04-16 15:50:01 EDT,...,Comments,"[0.005032265558838844, 0.0070299385115504265, 0.9879377484321594]",0.105996,0.375034,False,,,,,
315,Covid19 Real Stories by Frontline and Affected People,,3938079882870550,none,,,,,,2020-04-16 10:20:48 EDT,...,Real,"[0.7864987850189209, 0.21058917045593262, 0.002912011230364442]",0.770338,0.688707,True,Fake,"[0.23777706921100616, 0.7581254839897156, 0.004097355529665947]",0.828111,0.517653,False
76,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,107753.0,,2020-12-07 20:46:45 EST,...,,,,,,Real,"[0.8064526915550232, 0.19045688211917877, 0.0030904437880963087]",0.731702,0.688707,False
144,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,111149.0,,2020-05-28 15:26:37 EDT,...,Comments,"[0.2533581852912903, 0.25821736454963684, 0.4884243905544281]",1.511156,0.375034,True,,,,,
474,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,107795.0,,2021-01-09 00:02:13 EST,...,Comments,"[0.029040303081274033, 0.07921350002288818, 0.8917461633682251]",0.585447,0.375034,True,,,,,
19,Health Care Assistants HCA / Nurses / Support Workers - JOBS (UK),,1570841303216433,none,,,,34205.0,,2021-08-02 07:33:10 EDT,...,Comments,"[0.005987084470689297, 0.007517407648265362, 0.9864954948425293]",0.116598,0.375034,False,Fake,"[0.19375079870224, 0.6918608546257019, 0.1143883466720581]",1.184241,0.517653,False


In [15]:
import pandas as pd

pd.options.display.max_colwidth = 300

columns_to_keep = [
    "text", "predict_1", "probability_1", "entropy_1", "entropy_threshold_1", "passed_threshold_1"
]

test_results_df.loc[
    (test_results_df["predict_1"] == "Fake") & (test_results_df["passed_threshold_1"] == True), 
    columns_to_keep
].sample(10)

Unnamed: 0,text,predict_1,probability_1,entropy_1,entropy_threshold_1,passed_threshold_1
7,Judges are apparently forcing hospitals to administer ivermectin in multiple cases. I’ve never heard of anything like this before.,Fake,"[0.3932177424430847, 0.5643640756607056, 0.04241819679737091]",1.18867,0.517653,True
39,How often are healthcare workers whom are in close contact care with positive Covid patients being tested themselves? Or what’s the recommendation for your facilities?,Fake,"[0.3282310664653778, 0.6674463152885437, 0.004322689026594162]",0.950794,0.517653,True
408,"Meanwhile in India during the devastating second wave, doctors are getting brutally assaulted.",Fake,"[0.26339009404182434, 0.7233384251594543, 0.013271420262753963]",0.927694,0.517653,True
5,Merry Christmas Eve from our COVID unit to yours 😘,Fake,"[0.2204810380935669, 0.7764163613319397, 0.003102540737017989]",0.790249,0.517653,True
17,Just out of curiosity does anyone else get the feeling that some are on here because they are scared and know we’re all a medical team? Please delete if inappropriate.,Fake,"[0.2516616880893707, 0.46662119030952454, 0.2817171514034271]",1.528939,0.517653,True
199,Anybody else out of Covid-19 test in the hospital? We're told there is a national shortage...,Fake,"[0.10680346190929413, 0.8752534985542297, 0.017943084239959717]",0.616977,0.517653,True
141,Has anyone heard of this study or it’s results?,Fake,"[0.12313830107450485, 0.8281078934669495, 0.048753827810287476]",0.809902,0.517653,True
6,Is the Pfizer vaccine truly been approved? I keep hearing people say it’s not really approved and that a different vaccine is approved but not available yet. Anyone hearing this? What are they talking about? So tired of all the false information. Thanks.,Fake,"[0.31345051527023315, 0.6571012735366821, 0.02944825030863285]",1.072463,0.517653,True
185,"[A new study finds no evidence of benefit from a malaria drug widely promoted as a treatment for coronavirus infection. Hydroxychloroquine did not lower the risk of dying or needing a breathing tube in a comparison that involved nearly 1,400 patients treated at Columbia University in New York, r...",Fake,"[0.3928290009498596, 0.4414222240447998, 0.1657487452030182]",1.480095,0.517653,True
385,"Good news for healthcare workers! Costco announced a new “priority access” policy for frontline workers, specifically first responders and healthcare workers, which allows them to skip straight to “the front of any line to enter the warehouse.” All you need to do is show your work ID to cut the ...",Fake,"[0.47324714064598083, 0.5234177112579346, 0.003335219109430909]",1.027088,0.517653,True


In [16]:
import pandas as pd

pd.options.display.max_colwidth = 300

columns_to_keep = [
    "text", "predict_1", "probability_1", "entropy_1", "entropy_threshold_1", "passed_threshold_1"
]

test_results_df.loc[
    (test_results_df["predict_1"] == "Real") & (test_results_df["passed_threshold_1"] == True), 
    columns_to_keep
].sample(10)

Unnamed: 0,text,predict_1,probability_1,entropy_1,entropy_threshold_1,passed_threshold_1
254,BRITAIN Cleaners at the Ministry of Justice in London have downed tools because two of their colleagues have died from suspected covid and they do not have proper protection.,Real,"[0.7346838712692261, 0.2520679235458374, 0.013248229399323463]",0.910574,0.688707,True
64,Health care workers! We are working our tails off! Let’s show some appreciation. Post a picture of yourself or a #covidhero in their PPE❤️ stay safe 😷🩺❤️,Real,"[0.5208881497383118, 0.46961653232574463, 0.009495341219007969]",1.066018,0.688707,True
311,"Half a million workers in Karachi, Pakistan, one of the world's largest megacities, have been illegally fired from their jobs during the COVID19 lockdown. In many cases across the country, factories are forcing their workers (who are often illiterate) to sign a document resigning, while in other...",Real,"[0.48944786190986633, 0.3792383372783661, 0.13131378591060638]",1.419603,0.688707,True
438,New study on the spread of Covid. Edit to add: 1. Im not pro or con on the study. Im simply sharing it. 2. It was conducted by real epidemiologists and public health experts at Princeton and Johns Hopkins and Berkley in addition to other notable international institutes. 3. Not everything posted...,Real,"[0.46576693654060364, 0.11471937596797943, 0.41951361298561096]",1.397526,0.688707,True
26,"SOUTH AFRICA. ""Workers use refuse bags as masks"". Health workers union threatens national strike. [https://www.thesouthafrican.com/news/nehawu-strike-over-lack-of-ppe-21-august-2020/](https://www.thesouthafrican.com/news/nehawu-strike-over-lack-of-ppe-21-august-2020/)",Real,"[0.7798680067062378, 0.2015141397714615, 0.018617866560816765]",0.852446,0.688707,True
77,Just a polite reminder if anyone has made anything bags ear savers. Could you please wash in 60 degrees and seal in food bags with the date sealed just ploughing through lots of bags that have been delivered When? washed and not in bags so have washed and ironed and now putting bags thankyou Jen...,Real,"[0.44291776418685913, 0.1333048939704895, 0.42377740144729614]",1.432823,0.688707,True
320,"Nurses’ Pleas Spur U.S. Pledge to Tap 44 Million-Mask Stockpile Hospitals and medical offices are once again running short on masks and gowns in the midst of a raging pandemic, but the federal government is still drawing up plans to distribute its swelling stockpile.",Real,"[0.6369713544845581, 0.36096030473709106, 0.0020683177281171083]",0.963564,0.688707,True
52,"“The Federation of State Medical Boards (FSMB), which supports its member state medical licensing boards, has recently issued a statement saying that providing misinformation about the COVID-19 vaccine contradicts physicians’ ethical and professional responsibilities, and therefore may subject a...",Real,"[0.6012201905250549, 0.3976283073425293, 0.001151527278125286]",0.981605,0.688707,True
49,Keeping everyone of you in my prayers. Please know that your service to others at this tragic and chaotic time is what His Holiness always talks about. We are all extremely proud of you 🙏🏽💪🏼👍🏼,Real,"[0.7494724988937378, 0.18610769510269165, 0.06441985070705414]",1.018146,0.688707,True
421,Noting that our sickest patients seem to be largely Hispanic males between 40-60. This is from Texas. Any thoughts?,Real,"[0.36942723393440247, 0.31001216173171997, 0.32056060433387756]",1.580677,0.688707,True


In [17]:
test_results_df['predict_1'].value_counts()

predict_1
Comments    268
Fake         56
Real         31
Name: count, dtype: int64

In [18]:
test_results_df.to_excel(DATA_PATH / 'facebook_data_to_complate.xlsx', index=False)