# Обучение определения фейковых фактов о COVID и вакцинации

In [168]:
import math

import torch
import pandas as pd
import numpy as np

In [169]:
DEVICE = 'cpu'
if torch.backends.mps.is_available():
    DEVICE = 'mps'
if torch.cuda.is_available():
    DEVICE = 'cuda'

In [170]:
from pathlib import Path

DATA_PATH = Path('data/')
DATA_PATH.mkdir(parents=True, exist_ok=True)

DATA_CACHE = Path('data/cache_dir/')
DATA_CACHE.mkdir(parents=True, exist_ok=True)

DATA_PATH_SAVE_MODELS = Path('data/models/')
DATA_PATH_SAVE_MODELS.mkdir(parents=True, exist_ok=True)

pd.set_option('display.max_colwidth', 500) 

In [171]:
# MODEL_NAME = "covid_vaccine_fake_model"
TEST_DF_NAME = "facebook_data_to_complate.xlsx"

MODEL_NAME = "roberta-base"
# TEST_DF_NAME = "facebook_data_to_model.xlsx"

MAX_LENGTH = 128
BATCH_SIZE = 16

# Датасет

In [172]:
data_df = pd.read_excel(DATA_PATH / TEST_DF_NAME).reset_index(drop=True)

In [173]:
# data_df = data_df.dropna(how='all')
data_df = data_df.sample(100)

data_df['text'] = data_df['text'].astype(str)
data_df = data_df.dropna(subset='predict_1')
data_df["truncated_text"] = data_df["text"].str[:200]

# Добавляем колонку id (используем индекс датафрейма)
data_df["id"] = data_df.index

In [174]:
data_df.head(1)

Unnamed: 0,Group Name,User Name,Facebook Id,Page Category,Page Admin Top Country,Page Description,Page Created,Likes at Posting,Followers at Posting,Post Created,...,Sponsor Category,Overperforming Score (weighted — Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x ),text,link_text,predict_1,probability_1,predict_2,probability_2,truncated_text,id
3917,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,107869.0,,2020-11-28 03:37:03 EST,...,,1.67,"Everyone's talking about the Covid vaccine.... Why aren't more people looking into MMR boosters to increase Covid resistance? Yet ANOTHER study to support a theory I've previously discussed...Journal of microbiology also published their own findings and an Epidemiology journal has looked at why Marshall Islands, American Samoa, Vanuvetu, Malaysia and Hong Kong all have little to NO COVID after major MMR vaccination campaigns in the last decade. Don't let big pharma push a vaccine at you that...",Analysis of Measles-Mumps-Rubella (MMR) Titers of Recovered COVID-19 Patients The measles-mumps-rubella (MMR) vaccine has been theorized to provide protection against coronavirus disease 2019 (COVID-19). Our aim was to determine whether any MMR IgG titers are inversely correlated with severity in recovered COVID-19 patients previously vaccinated with MMR II. We divided 80 sub...,Comments,"[0.07153145968914032, 0.16210079193115234, 0.7663677930831909]",Fake,"[0.2917669713497162, 0.6962428689002991, 0.011990156024694443]",Everyone's talking about the Covid vaccine.... Why aren't more people looking into MMR boosters to increase Covid resistance? Yet ANOTHER study to support a theory I've previously discussed...Journal,3917


In [175]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76 entries, 3917 to 788
Data columns (total 48 columns):
 #   Column                                                                                                              Non-Null Count  Dtype  
---  ------                                                                                                              --------------  -----  
 0   Group Name                                                                                                          76 non-null     object 
 1   User Name                                                                                                           0 non-null      float64
 2   Facebook Id                                                                                                         76 non-null     int64  
 3   Page Category                                                                                                       76 non-null     object 
 4   Page Admin Top Country 

In [176]:
idx2label = {
    0: "Real",
    1: "Fake",
    2: "Comments"
}

# Модель

In [177]:
from transformers import RobertaTokenizer

tokenizer: RobertaTokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME, cache_dir=DATA_CACHE)


`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



In [178]:
from transformers import RobertaModel

model = RobertaModel.from_pretrained(
    MODEL_NAME, cache_dir=DATA_CACHE)

model.to(DEVICE)
model.eval()
DEVICE

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'cpu'

## Получение эмбедингов

In [179]:
import torch
import numpy as np
from typing import List
from tqdm import tqdm

def add_text_embeddings(
    texts: List[str], 
    strategy: str = "cls",
    max_length: int = 128, 
    batch_size: int = 64
) -> np.ndarray:
    """
    Генерирует эмбеддинги для списка текстов.

    Параметры:
    - texts: список текстов для обработки.
    - strategy: метод усреднения токенов ("mean", "cls", "max", "sum").
    - max_length: максимальная длина токенов.
    - batch_size: размер батча для обработки.

    Возвращает:
    - Массив NumPy с эмбеддингами (shape: [num_texts, embedding_dim]).
    """
    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
        inputs = tokenizer(
            texts[i:i + batch_size], padding=True, truncation=True, 
            max_length=max_length, return_tensors="pt"
        )

        with torch.no_grad():
            outputs = model(**inputs)
        
        hidden_states = outputs.last_hidden_state

        if strategy == "mean":
            batch_embeddings = hidden_states.mean(dim=1)
        elif strategy == "cls":
            batch_embeddings = hidden_states[:, 0, :]
        elif strategy == "max":
            batch_embeddings, _ = hidden_states.max(dim=1)
        elif strategy == "sum":
            batch_embeddings = hidden_states.sum(dim=1)
        else:
            raise ValueError("Некорректная стратегия. Выберите из ['mean', 'cls', 'max', 'sum'].")
        embeddings.append(batch_embeddings.cpu().numpy())
    return np.vstack(embeddings)


In [180]:
embeddings = add_text_embeddings(data_df['text'].to_list(), strategy="cls", max_length=MAX_LENGTH, batch_size=BATCH_SIZE)
import pickle

with open(DATA_PATH / 'facebook_embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings, f)

Processing batches: 100%|██████████| 5/5 [00:04<00:00,  1.22it/s]


In [181]:
import pickle

with open(DATA_PATH / 'facebook_embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)

## Уменьшение размерности

## Общие методы

In [194]:
import numpy as np
import pandas as pd
import umap
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from typing import Literal, Optional
from scipy.spatial import cKDTree


idx2label = {
    0: "Real",
    1: "Fake",
    2: "Comments"
}


label2idx = {v: k for k, v in idx2label.items()}

class EmbeddingVisualizer:
    def __init__(self, embeddings: np.ndarray, data_df: pd.DataFrame):
        """
        Универсальный класс для снижения размерности эмбеддингов и визуализации.

        :param embeddings: Массив эмбеддингов (shape: [num_samples, embedding_dim])
        :param data_df: DataFrame с дополнительными данными (например, text, predict_1, id)
        """
        self.embeddings = embeddings
        self.data_df = data_df.copy()
        self.reduced_embeddings = None
    
    def reduce_dimensionality(self, method: Literal["pca", "tsne", "umap"], n_components: int = 2):
        """
        Снижает размерность эмбеддингов с помощью указанного метода.
        
        :param method: Метод снижения размерности ('pca', 'tsne', 'umap')
        :param n_components: Количество целевых измерений (2 или 3 для визуализации)
        """
        if method == "pca":
            reducer = PCA(n_components=n_components)
        elif method == "tsne":
            reducer = TSNE(n_components=n_components, perplexity=30, random_state=42)
        elif method == "umap":
            reducer = umap.UMAP(n_components=n_components, random_state=42)
        else:
            raise ValueError("Неподдерживаемый метод. Используйте 'pca', 'tsne' или 'umap'.")
        
        self.reduced_embeddings = reducer.fit_transform(self.embeddings)
        for i in range(n_components):
            self.data_df[f"{method}_{i+1}"] = self.reduced_embeddings[:, i]
    
    def compute_opacity(self, points: np.ndarray, radius: float = 0.1) -> np.ndarray:
        """
        Рассчитывает прозрачность точек на основе их плотности.
        Теперь более плотные точки ярче, а разреженные — тусклее.
        
        :param points: Массив координат (Nx2 или Nx3)
        :param radius: Радиус для расчета плотности точек
        :return: Массив значений прозрачности (0.3 - 1.0)
        """
        tree = cKDTree(points)
        densities = np.array([len(tree.query_ball_point(p, radius)) for p in points])
        min_density = np.min(densities)
        max_density = np.max(densities)
        opacities = 0.3 + (densities - min_density) / (max_density - min_density) * 0.7  # Инвертируем прозрачность
        return np.clip(opacities, 0.3, 1.0)
    
    def visualize(self, method: Literal["pca", "tsne", "umap"], n_components: int = 2, title: str = "Embedding Visualization", label2idx=label2idx):
        """
        Визуализирует сниженные эмбеддинги с интерактивными точками в Plotly.
        
        :param method: Метод, использованный для снижения размерности ('pca', 'tsne', 'umap')
        :param n_components: Размерность визуализации (2D или 3D)
        :param title: Заголовок графика
        """
        if self.reduced_embeddings is None or f"{method}_1" not in self.data_df.columns:
            raise ValueError("Сначала вызовите reduce_dimensionality().")
        
        self.data_df["predict_idx"] = self.data_df["predict_1"].map(label2idx)
        coords = self.reduced_embeddings[:, :n_components]
        opacities = self.compute_opacity(coords)
        
        color_mapping = {0: "blue", 1: "red", 2: "green"}
        
        fig = go.Figure()
        
        if n_components == 2:
            for idx, label in idx2label.items():
                subset = self.data_df[self.data_df["predict_idx"] == idx]
                opacity_subset = opacities[self.data_df["predict_idx"] == idx]

                fig.add_trace(go.Scatter(
                    x=subset[f"{method}_1"],
                    y=subset[f"{method}_2"],
                    mode="markers",
                    marker=dict(
                        size=8,
                        # opacity=opacity_subset,
                        color=color_mapping[idx]
                    ),
                    name=f"{label}",
                    customdata=subset[["truncated_text", "id", "predict_1"]],
                    hovertemplate=(
                        "<b>Text:</b> %{customdata[0]}<br>" 
                        "<b>ID:</b> %{customdata[1]}<br>"
                        "<b>Predict:</b> %{customdata[2]}"
                    )
                ))
        else:
            for idx, label in idx2label.items():
                subset = self.data_df[self.data_df["predict_idx"] == idx]
                # opacity_subset = opacities[self.data_df["predict_idx"] == idx]

                fig.add_trace(go.Scatter3d(
                    x=subset[f"{method}_1"],
                    y=subset[f"{method}_2"],
                    z=subset[f"{method}_3"],
                    mode="markers",
                    marker=dict(
                        size=5,
                        # opacity=opacity_subset,
                        color=color_mapping[idx]
                    ),
                    name=f"{label}",
                    customdata=subset[["truncated_text", "id", "predict_1"]],
                    hovertemplate=(
                        "<b>Text:</b> %{customdata[0]}<br>" 
                        "<b>ID:</b> %{customdata[1]}<br>"
                        "<b>Predict:</b> %{customdata[2]}"
                    )
                ))
        
        fig.update_layout(
            title=title,
            xaxis=dict(
                title=f"{method.upper()} 1 →",  # Добавляем стрелку
                showline=True,  # Добавляем рамку
                linewidth=2,
                linecolor="black",
                mirror=True,  # Рамка вокруг всей области графика
                gridcolor="lightgray",
                gridwidth=0.5,  # Делаем сетку тоньше
                zeroline=True,  # Ось X проходит через 0
                zerolinecolor="black",
                zerolinewidth=1.2
            ),
            yaxis=dict(
                title=f"{method.upper()} 2 →",  # Добавляем стрелку
                showline=True,
                linewidth=2,
                linecolor="black",
                mirror=True,
                gridcolor="lightgray",
                gridwidth=0.5,
                zeroline=True,  # Ось Y проходит через 0
                zerolinecolor="black",
                zerolinewidth=1.2
            ),
            template="plotly_white",
            width=1400,
            height=1000,
            legend_title="Class Labels"
        )

        if n_components == 3:
            fig.update_layout(scene=dict(
                xaxis=dict(
                    title=f"{method.upper()} 1 →",  # Добавляем стрелку
                    showline=True,
                    linewidth=2,
                    linecolor="black",
                    mirror=True,
                    gridcolor="lightgray",
                    gridwidth=0.5
                ),
                yaxis=dict(
                    title=f"{method.upper()} 2 →",  # Добавляем стрелку
                    showline=True,
                    linewidth=2,
                    linecolor="black",
                    mirror=True,
                    gridcolor="lightgray",
                    gridwidth=0.5
                ),
                zaxis=dict(
                    title=f"{method.upper()} 3 →",  # Добавляем стрелку
                    showline=True,
                    linewidth=2,
                    linecolor="black",
                    mirror=True,
                    gridcolor="lightgray",
                    gridwidth=0.5
                )
            ))



        
        if n_components == 3:
            fig.update_layout(scene=dict(
                xaxis_title=f"{method.upper()} 1",
                yaxis_title=f"{method.upper()} 2",
                zaxis_title=f"{method.upper()} 3"
            ))
        
        
        
        fig.show()


## Анализ

In [195]:
visualizer = EmbeddingVisualizer(embeddings, data_df)


### PCA

In [196]:

visualizer.reduce_dimensionality("pca", n_components=2)
visualizer.visualize("pca", n_components=2, title="PCA 2D Visualization")


In [185]:
visualizer.reduce_dimensionality("pca", n_components=3)
visualizer.visualize("pca", n_components=3, title="PCA 3D Visualization")

### TSNE


In [186]:

visualizer.reduce_dimensionality("tsne", n_components=2)
visualizer.visualize("tsne", n_components=2, title="t-SNE 2D Visualization")


In [187]:
visualizer.reduce_dimensionality("tsne", n_components=3)
visualizer.visualize("tsne", n_components=3, title="t-SNE 3D Visualization")

### UMAP

In [188]:
visualizer.reduce_dimensionality("umap", n_components=2)
visualizer.visualize("umap", n_components=2, title="UMAP 2D Visualization")


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [189]:
visualizer.reduce_dimensionality("umap", n_components=3)
visualizer.visualize("umap", n_components=3, title="UMAP 3D Visualization")


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



## Кластеризация

In [190]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Диапазон количества кластеров для тестирования
cluster_range = range(2, 15)
inertia_values = []
silhouette_scores = []

# Тестируем K-Means с разным числом кластеров
for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(data_df[["pca_1", "pca_2"]])
    
    inertia_values.append(kmeans.inertia_)
    silhouette_avg = silhouette_score(data_df[["pca_1", "pca_2"]], cluster_labels)
    silhouette_scores.append(silhouette_avg)

# Определяем оптимальное количество кластеров по максимальному силуэтному коэффициенту
optimal_k = cluster_range[np.argmax(silhouette_scores)]

# Применяем K-Means с оптимальным количеством кластеров
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
data_df["cluster"] = kmeans.fit_predict(data_df[["pca_1", "pca_2"]])

KeyError: "None of [Index(['pca_1', 'pca_2'], dtype='object')] are in the [columns]"

In [None]:
import plotly.graph_objects as go

# Создаем интерактивный график для метода локтя (Elbow Method)
fig = go.Figure()

# Добавляем линию инерции (метод локтя)
fig.add_trace(go.Scatter(
    x=list(cluster_range),
    y=inertia_values,
    mode="lines+markers",
    name="Инерция (Elbow Method)",
    marker=dict(size=8)
))

fig.update_layout(
    title="Метод локтя: Оптимальное количество кластеров",
    xaxis_title="Количество кластеров (k)",
    yaxis_title="Инерция",
    template="plotly_white"
)

fig.show()

# Создаем интерактивный график для коэффициента силуэта (Silhouette Score)
fig_silhouette = go.Figure()

fig_silhouette.add_trace(go.Scatter(
    x=list(cluster_range),
    y=silhouette_scores,
    mode="lines+markers",
    name="Коэффициент силуэта",
    marker=dict(size=8, color="red")
))

fig_silhouette.update_layout(
    title="Оптимальное количество кластеров (Silhouette Score)",
    xaxis_title="Количество кластеров (k)",
    yaxis_title="Коэффициент силуэта",
    template="plotly_white"
)

fig_silhouette.show()


In [None]:
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import interact, IntSlider
from sklearn.cluster import KMeans

# Функция для кластеризации и визуализации с выбором k
def visualize_clusters(k):
    # Применяем K-Means с выбранным количеством кластеров
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    data_df["cluster"] = kmeans.fit_predict(data_df[["pca_1", "pca_2"]])
    
    # Получаем центры кластеров
    centers = kmeans.cluster_centers_
    
    # Создаем интерактивный scatter plot с кластерами
    fig = px.scatter(
        data_df, x="pca_1", y="pca_2", 
        color=data_df["cluster"].astype(str),
        title=f"K-Means Кластеризация (k={k})",
        labels={"pca_1": "PCA 1", "pca_2": "PCA 2", "color": "Кластер"},
        hover_data=["id", "truncated_text", "predict_1"]
    )
    
    # Добавляем центры кластеров на график
    fig.add_trace(go.Scatter(
        x=centers[:, 0],
        y=centers[:, 1],
        mode="markers",
        marker=dict(size=12, color="black", symbol="x"),
        name="Центры кластеров"
    ))

    # **Переносим изменение размера перед fig.show()**
    fig.update_layout(
        width=1500,
        height=1200,
        template="plotly_white",  # Улучшенный стиль
        xaxis=dict(showgrid=True, gridwidth=0.5, gridcolor="lightgray"),  # Мелкая сетка
        yaxis=dict(showgrid=True, gridwidth=0.5, gridcolor="lightgray")
    )

    fig.show()

# Создаем слайдер для выбора количества кластеров
interact(visualize_clusters, k=IntSlider(min=2, max=15, step=1, value=5, description="Количество кластеров"))


### PCA 3D

In [265]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA


if isinstance(data_df["embedding"].iloc[0], str):
    data_df["embedding"] = data_df["embedding"].apply(eval)

embeddings = np.array(data_df["embedding"].tolist())

n_components = 3
pca = PCA(n_components=n_components)
reduced_embeddings = pca.fit_transform(embeddings)

data_df["pca_1"] = reduced_embeddings[:, 0]
data_df["pca_2"] = reduced_embeddings[:, 1]
data_df["pca_3"] = reduced_embeddings[:, 2]

In [None]:
import plotly.graph_objects as go

# Создаем 3D scatter plot
fig = go.Figure()

fig.add_trace(go.Scatter3d(
    x=data_df["pca_1"], 
    y=data_df["pca_2"], 
    z=data_df["pca_3"],
    mode="markers",
    marker=dict(size=6, opacity=0.7),
    customdata=data_df[["truncated_text", "predict_1", "id"]],  # Передаем обрезанный text, predict_1 и id
    hovertemplate="<b>Text:</b> %{customdata[0]}<br>"  # Отображаем обрезанный text
                  "<b>Predict:</b> %{customdata[1]}<br>"  # Отображаем predict_1
                  "<b>Id:</b> %{customdata[2]}"  # Теперь ID будет на новой строке
))

fig.update_layout(
    title="3D PCA с интерактивными точками",
    scene=dict(
        xaxis_title="PCA 1",
        yaxis_title="PCA 2",
        zaxis_title="PCA 3"
    ),
    template="plotly_white"
)

fig.update_layout(
    width=1500,
    height=1000,
)

fig.show()


### K-mean

In [267]:
# Адаптация кода под 3D кластеризацию

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Диапазон количества кластеров для тестирования
cluster_range = range(2, 15)
inertia_values = []
silhouette_scores = []

# Тестируем K-Means с разным числом кластеров на 3D PCA
for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(data_df[["pca_1", "pca_2", "pca_3"]])
    
    inertia_values.append(kmeans.inertia_)
    
    # Проверяем, достаточно ли данных для расчета силуэтного коэффициента
    if len(set(cluster_labels)) > 1:  
        silhouette_avg = silhouette_score(data_df[["pca_1", "pca_2", "pca_3"]], cluster_labels)
        silhouette_scores.append(silhouette_avg)
    else:
        silhouette_scores.append(-1)

# Определяем оптимальное количество кластеров по максимальному силуэтному коэффициенту
optimal_k = cluster_range[np.argmax(silhouette_scores)]

# Применяем K-Means с оптимальным количеством кластеров
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
data_df["cluster"] = kmeans.fit_predict(data_df[["pca_1", "pca_2", "pca_3"]])


In [None]:
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import interact, IntSlider
from sklearn.cluster import KMeans

# Функция для кластеризации и визуализации 3D с выбором k
def visualize_clusters_3d(k):
    # Применяем K-Means с выбранным количеством кластеров
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    data_df["cluster"] = kmeans.fit_predict(data_df[["pca_1", "pca_2", "pca_3"]])
    
    # Получаем центры кластеров
    centers = kmeans.cluster_centers_
    
    # Создаем интерактивный 3D scatter plot с кластерами
    fig = px.scatter_3d(
        data_df, x="pca_1", y="pca_2", z="pca_3",
        color=data_df["cluster"].astype(str),
        title=f"K-Means 3D Кластеризация (k={k})",
        labels={"pca_1": "PCA 1", "pca_2": "PCA 2", "pca_3": "PCA 3", "color": "Кластер"},
        hover_data=["id", "truncated_text", "predict_1"]
    )
    
    # Добавляем центры кластеров на график
    fig.add_trace(go.Scatter3d(
        x=centers[:, 0],
        y=centers[:, 1],
        z=centers[:, 2],
        mode="markers",
        marker=dict(size=4, color="black", symbol="x"),
        name="Центры кластеров"
    ))

    # **Исправлено: обновляем стиль перед show()**
    fig.update_layout(
        width=1500,
        height=1200,
        template="plotly_white",  # Улучшенный стиль
        scene=dict(
            xaxis=dict(showgrid=True, gridwidth=0.5, gridcolor="lightgray"),
            yaxis=dict(showgrid=True, gridwidth=0.5, gridcolor="lightgray"),
            zaxis=dict(showgrid=True, gridwidth=0.5, gridcolor="lightgray")
        )
    )

    fig.show()

# Создаем слайдер для выбора количества кластеров
interact(visualize_clusters_3d, k=IntSlider(min=2, max=15, step=1, value=5, description="Количество кластеров"))
