In [1]:
import math

import torch
import pandas as pd
import numpy as np

In [2]:
# Select the compute backend: CUDA wins over Apple MPS, CPU is the fallback.
if torch.cuda.is_available():
    DEVICE = 'cuda'
elif torch.backends.mps.is_available():
    DEVICE = 'mps'
else:
    DEVICE = 'cpu'

In [3]:
from pathlib import Path

# Root data directory plus the two sub-directories used by this notebook:
# a download cache for transformers and the location of saved models.
DATA_PATH = Path('data/')
DATA_CACHE = DATA_PATH / 'cache_dir'
DATA_PATH_SAVE_MODELS = DATA_PATH / 'models'

# Create every directory up front so later reads/writes do not fail.
for _directory in (DATA_PATH, DATA_CACHE, DATA_PATH_SAVE_MODELS):
    _directory.mkdir(parents=True, exist_ok=True)

# Show up to 500 characters per cell when displaying text columns.
pd.set_option('display.max_colwidth', 500)

In [4]:
# Tokenization / batching settings passed to the embedding step.
BATCH_SIZE = 16
MAX_LENGTH = 128

# Sub-directory of the saved-models folder that holds the model and tokenizer.
MODEL_NAME = 'covid_vaccine_fake_model'
# Spreadsheet (inside the data folder) with the posts to embed.
TEST_DF_NAME = 'facebook_data_to_complate.xlsx'

# Dataset

In [5]:
# Read the spreadsheet, then renumber the rows 0..n-1.
data_df = pd.read_excel(DATA_PATH / TEST_DF_NAME)
data_df = data_df.reset_index(drop=True)

In [6]:
# Cleaning pipeline, expressed as one chain:
#   1. drop rows that are completely empty,
#   2. force the text column to str,
#   3. drop rows without a prediction in `predict_1`,
#   4. add a 200-character preview of the text and an `id` column that keeps
#      the row's index label (the index is not reset after the drops).
data_df = (
    data_df
    .dropna(how='all')
    .assign(text=lambda frame: frame['text'].astype(str))
    .dropna(subset='predict_1')
    .assign(
        truncated_text=lambda frame: frame['text'].str[:200],
        id=lambda frame: frame.index,
    )
)

In [None]:
# Display the first row of the cleaned frame.
data_df.head(1)

In [None]:
# Print the frame summary (columns, non-null counts, dtypes).
data_df.info()

In [9]:
# Class index -> label name; the position in the tuple is the class index.
idx2label = dict(enumerate(("Real", "Fake", "Comments")))

# Model

In [10]:
from transformers import RobertaTokenizer

# Load the tokenizer from the local saved-model directory; anything that has
# to be downloaded is cached under DATA_CACHE.
tokenizer: RobertaTokenizer = RobertaTokenizer.from_pretrained(
    DATA_PATH_SAVE_MODELS / MODEL_NAME,
    cache_dir=DATA_CACHE,
)

In [None]:
from transformers import RobertaModel

# Load the encoder from the local saved-model directory, move it to the
# selected device and switch it to inference mode in one chain
# (both `.to()` and `.eval()` return the module itself).
model = (
    RobertaModel
    .from_pretrained(DATA_PATH_SAVE_MODELS / MODEL_NAME, cache_dir=DATA_CACHE)
    .to(DEVICE)
    .eval()
)

# Cell output: the device the model was placed on.
DEVICE

## Embeddings

In [12]:
import torch
import numpy as np
from typing import List
from tqdm import tqdm

def add_text_embeddings(
    texts: List[str], 
    strategy: str = "cls",
    max_length: int = 128, 
    batch_size: int = 64
) -> np.ndarray:
    """
    Generate one embedding vector per input text.

    Texts are tokenized with the module-level ``tokenizer`` and run through the
    module-level ``model`` on ``DEVICE`` in batches; the last hidden state of
    every text is then pooled into a single vector.

    Parameters:
    - texts: list of texts to embed.
    - strategy: pooling method. "cls" takes the hidden state of the first
      token; "mean", "max" and "sum" pool over the sequence dimension using
      only real tokens (padding positions are excluded via the attention mask).
    - max_length: maximum number of tokens per text (longer texts are truncated).
    - batch_size: number of texts per forward pass.

    Returns:
    - NumPy array with the embeddings (shape: [num_texts, embedding_dim]).
      For an empty ``texts`` list the result has shape [0, hidden_size].

    Raises:
    - ValueError: if ``strategy`` is unknown or ``batch_size`` is not positive.
    """
    # Validate arguments before doing any expensive tokenizer/model work.
    if strategy not in ("mean", "cls", "max", "sum"):
        raise ValueError("Некорректная стратегия. Выберите из ['mean', 'cls', 'max', 'sum'].")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    # np.vstack cannot stack an empty list, so handle "nothing to embed" here.
    if len(texts) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []

    for start in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
        inputs = tokenizer(
            texts[start:start + batch_size], padding=True, truncation=True,
            max_length=max_length, return_tensors="pt"
        ).to(DEVICE)

        with torch.no_grad():
            hidden_states = model(**inputs).last_hidden_state

        if strategy == "cls":
            batch_embeddings = hidden_states[:, 0, :]
        else:
            # Batches are padded to the longest text, so shorter texts carry
            # padding positions; mask them out so they do not skew the pooling.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
            if strategy == "mean":
                token_counts = mask.sum(dim=1).clamp(min=1)
                batch_embeddings = (hidden_states * mask).sum(dim=1) / token_counts
            elif strategy == "sum":
                batch_embeddings = (hidden_states * mask).sum(dim=1)
            else:  # strategy == "max"
                masked_states = hidden_states.masked_fill(mask == 0, float("-inf"))
                batch_embeddings = masked_states.max(dim=1).values
        embeddings.append(batch_embeddings.cpu().numpy())
    return np.vstack(embeddings)


In [None]:
import pickle

# Embed every text (first-token pooling) and persist the resulting array so
# the expensive forward passes do not have to be repeated.
embeddings = add_text_embeddings(
    data_df['text'].to_list(),
    strategy="cls",
    max_length=MAX_LENGTH,
    batch_size=BATCH_SIZE,
)

with (DATA_PATH / 'facebook_embeddings.pkl').open('wb') as f:
    pickle.dump(embeddings, f)

In [14]:
import pickle

# Reload the cached embeddings from disk.
# NOTE: unpickling executes code from the file, so only load pickles that
# were produced by this notebook.
with (DATA_PATH / 'facebook_embeddings.pkl').open('rb') as f:
    embeddings = pickle.load(f)

## Dimensionality Reduction

In [15]:
import numpy as np
import pandas as pd
import umap
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from typing import Literal, Optional
from scipy.spatial import cKDTree
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D


# Dictionary to map class indices to label names
idx2label = {
    0: "Real",
    1: "Fake",
    2: "Comments"
}

# Dictionary to map label names back to class indices
label2idx = {v: k for k, v in idx2label.items()}

class EmbeddingVisualizer:
    """
    Reduces embeddings to 2 or 3 dimensions and plots them with Plotly.

    The instance keeps its own copy of ``data_df``; every call to
    ``reduce_dimensionality`` adds ``<method>_1 .. <method>_n`` coordinate
    columns to that copy, and ``visualize`` plots those columns, one trace per
    class in the module-level ``idx2label``.
    """

    # Columns attached to every point, in the order the hover template uses them.
    _HOVER_COLUMNS = [
        "truncated_text", "id", "predict_1",
        "entropy_1", "entropy_threshold_1", "passed_threshold_1",
    ]
    _HOVER_TEMPLATE = (
        "<b>Text:</b> %{customdata[0]}<br>"
        "<b>ID:</b> %{customdata[1]}<br>"
        "<b>Predict:</b> %{customdata[2]}<br>"
        "<b>Entropy:</b> %{customdata[3]}<br>"
        "<b>Entropy threshold:</b> %{customdata[4]}<br>"
        "<b>Passed threshold:</b> %{customdata[5]}"
    )
    # Marker colour per class index.
    _COLOR_MAPPING = {0: "blue", 1: "red", 2: "green"}

    def __init__(self, embeddings: np.ndarray, data_df: pd.DataFrame):
        """
        Stores the embeddings together with a private copy of the metadata.

        :param embeddings: Array of embeddings (shape: [num_samples, embedding_dim])
        :param data_df: DataFrame with one row per embedding (e.g., text, predict_1, id)
        :raises ValueError: If the number of embeddings and rows differ
        """
        # Fail early: a mismatch would otherwise only surface when the reduced
        # coordinates are written into the DataFrame.
        if len(embeddings) != len(data_df):
            raise ValueError(
                f"embeddings has {len(embeddings)} rows but data_df has {len(data_df)} rows."
            )
        self.embeddings = embeddings
        self.data_df = data_df.copy()
        # Output of the most recent reduce_dimensionality() call (any method).
        self.reduced_embeddings = None

    def reduce_dimensionality(self, method: Literal["pca", "tsne", "umap"], n_components: int = 2):
        """
        Reduces the dimensionality of the embeddings using the specified method.

        The result is stored in ``self.reduced_embeddings`` and written to the
        columns ``<method>_1 .. <method>_<n_components>`` of ``self.data_df``.

        :param method: Dimensionality reduction method ('pca', 'tsne', 'umap')
        :param n_components: Number of target dimensions (2 or 3 for visualization)
        :raises ValueError: If ``method`` is not supported
        """
        if method == "pca":
            reducer = PCA(n_components=n_components)
        elif method == "tsne":
            reducer = TSNE(n_components=n_components, perplexity=30, random_state=42)
        elif method == "umap":
            reducer = umap.UMAP(n_components=n_components, random_state=42)
        else:
            raise ValueError("Unsupported method. Use 'pca', 'tsne', or 'umap'.")

        self.reduced_embeddings = reducer.fit_transform(self.embeddings)
        for i in range(n_components):
            self.data_df[f"{method}_{i+1}"] = self.reduced_embeddings[:, i]

    def compute_opacity(self, points: np.ndarray, radius: float = 0.1) -> np.ndarray:
        """
        Computes the opacity of points based on their local density.
        Points in denser regions are more opaque, points in sparse regions
        are more transparent.

        :param points: Array of coordinates (Nx2 or Nx3)
        :param radius: Radius of the neighbourhood used to count nearby points
        :return: Array of opacity values in [0.3, 1.0]; all 1.0 when every
                 point has the same density, empty when ``points`` is empty
        """
        points = np.asarray(points)
        if len(points) == 0:
            return np.empty(0, dtype=float)

        tree = cKDTree(points)
        # Neighbour count within `radius` (each point counts itself).
        densities = np.array([len(tree.query_ball_point(p, radius)) for p in points])
        min_density = densities.min()
        density_range = densities.max() - min_density
        if density_range == 0:
            # Uniform density (e.g. radius much smaller than the point spacing):
            # the min-max scaling below would divide by zero and yield NaN.
            return np.ones(len(points), dtype=float)

        # Min-max scale the densities into the [0.3, 1.0] opacity range.
        opacities = 0.3 + (densities - min_density) / density_range * 0.7
        return np.clip(opacities, 0.3, 1.0)

    @staticmethod
    def _axis_style(axis_title: str, zeroline: bool = False) -> dict:
        """Builds the shared axis styling; ``zeroline`` adds the black zero line."""
        style = dict(
            title=axis_title,
            showline=True,  # Border line along the axis
            linewidth=2,
            linecolor="black",
            mirror=True,  # Border around the entire plot area
            gridcolor="lightgray",
            gridwidth=0.5,  # Thin grid
        )
        if zeroline:
            style.update(zeroline=True, zerolinecolor="black", zerolinewidth=1.2)
        return style

    def visualize(self, method: Literal["pca", "tsne", "umap"], n_components: int = 2, title: str = "Embedding Visualization", label2idx=label2idx):
        """
        Visualizes the reduced embeddings with interactive points in Plotly.

        :param method: Method used for dimensionality reduction ('pca', 'tsne', 'umap')
        :param n_components: Visualization dimensionality (2 or 3)
        :param title: Plot title
        :param label2idx: Mapping from the values in ``predict_1`` to class indices
        :raises ValueError: If ``n_components`` is not 2 or 3, or if the
                            coordinates for ``method`` have not been computed yet
        """
        if n_components not in (2, 3):
            raise ValueError("n_components must be 2 or 3 for visualization.")

        coord_columns = [f"{method}_{i + 1}" for i in range(n_components)]
        if self.reduced_embeddings is None or any(
            column not in self.data_df.columns for column in coord_columns
        ):
            raise ValueError("Call reduce_dimensionality() first.")

        # Map prediction labels to indices; labels missing from label2idx
        # become NaN and therefore match none of the traces below.
        self.data_df["predict_idx"] = self.data_df["predict_1"].map(label2idx)

        opacities = None
        if n_components == 2:
            # Use the stored coordinates of the requested method rather than
            # self.reduced_embeddings, which may belong to a different method
            # if reduce_dimensionality() was called more than once.
            coords = self.data_df[coord_columns].to_numpy()
            opacities = self.compute_opacity(coords)

        fig = go.Figure()

        for idx, label in idx2label.items():
            mask = (self.data_df["predict_idx"] == idx).to_numpy()
            subset = self.data_df[mask]  # Rows predicted as this class

            # Arguments shared by the 2D and 3D traces.
            trace_kwargs = dict(
                x=subset[f"{method}_1"],
                y=subset[f"{method}_2"],
                mode="markers",
                name=f"{label}",
                customdata=subset[self._HOVER_COLUMNS],
                hovertemplate=self._HOVER_TEMPLATE,
            )

            if n_components == 2:
                fig.add_trace(go.Scatter(
                    marker=dict(
                        size=8,
                        color=self._COLOR_MAPPING[idx],
                        opacity=opacities[mask],  # Per-point, density-based
                    ),
                    **trace_kwargs,
                ))
            else:
                fig.add_trace(go.Scatter3d(
                    z=subset[f"{method}_3"],
                    marker=dict(
                        size=5,
                        color=self._COLOR_MAPPING[idx]
                    ),
                    **trace_kwargs,
                ))

        axis_name = method.upper()

        # General layout plus the 2D axes
        fig.update_layout(
            title=title,
            xaxis=self._axis_style(f"{axis_name} 1 →", zeroline=True),
            yaxis=self._axis_style(f"{axis_name} 2 →", zeroline=True),
            template="plotly_white",
            width=1400,
            height=1000,
            legend_title="Class Labels"
        )

        # 3D plots take their axes from the scene instead
        if n_components == 3:
            fig.update_layout(scene=dict(
                xaxis=self._axis_style(f"{axis_name} 1 →"),
                yaxis=self._axis_style(f"{axis_name} 2 →"),
                zaxis=self._axis_style(f"{axis_name} 3 →"),
            ))

        # Display the plot
        fig.show()


In [16]:
# Bind the cached embeddings to their metadata rows for plotting.
visualizer = EmbeddingVisualizer(embeddings=embeddings, data_df=data_df)


### PCA

In [None]:

# Reduce to two PCA components, then plot them.
visualizer.reduce_dimensionality(method="pca", n_components=2)
visualizer.visualize(method="pca", n_components=2, title="PCA 2D Visualization")


In [18]:
# visualizer.reduce_dimensionality("pca", n_components=3)
# visualizer.visualize("pca", n_components=3, title="PCA 3D Visualization")

### TSNE


In [None]:

# Reduce to two t-SNE components, then plot them.
visualizer.reduce_dimensionality(method="tsne", n_components=2)
visualizer.visualize(method="tsne", n_components=2, title="t-SNE 2D Visualization")


In [20]:
# visualizer.reduce_dimensionality("tsne", n_components=3)
# visualizer.visualize("tsne", n_components=3, title="t-SNE 3D Visualization")

### UMAP

In [None]:
# Reduce to two UMAP components, then plot them.
visualizer.reduce_dimensionality(method="umap", n_components=2)
visualizer.visualize(method="umap", n_components=2, title="UMAP 2D Visualization")

In [22]:
# visualizer.reduce_dimensionality("umap", n_components=3)
# visualizer.visualize("umap", n_components=3, title="UMAP 3D Visualization")