In [14]:
import math

import torch
import pandas as pd
import numpy as np

In [15]:
DEVICE = 'cpu'
if torch.backends.mps.is_available():
    DEVICE = 'mps'
if torch.cuda.is_available():
    DEVICE = 'cuda'

In [16]:
from pathlib import Path

DATA_PATH = Path('data/')
DATA_PATH.mkdir(parents=True, exist_ok=True)

DATA_CACHE = Path('data/cache_dir/')
DATA_CACHE.mkdir(parents=True, exist_ok=True)

DATA_PATH_SAVE_MODELS = Path('data/models/')
DATA_PATH_SAVE_MODELS.mkdir(parents=True, exist_ok=True)

pd.set_option('display.max_colwidth', 500) 

In [17]:
MODEL_NAME = "roberta-base"
TEST_DF_NAME = "facebook_data_to_model.xlsx"

MAX_LENGTH = 128
BATCH_SIZE = 16

# Dataset

In [18]:
data_df = pd.read_excel(DATA_PATH / TEST_DF_NAME).reset_index(drop=True)

In [19]:
data_df = data_df.dropna(how='all')
data_df['text'] = data_df['text'].astype(str)
data_df = data_df[~data_df['text'].isin([None, 'none', 'nan']) & data_df['text'].notna() & (data_df['text'] != '')]
data_df["truncated_text"] = data_df["text"].str[:200]
data_df["id"] = data_df.index

In [20]:
# data_df = data_df.sample(100)

In [21]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4892 entries, 0 to 6858
Data columns (total 44 columns):
 #   Column                                                                                                              Non-Null Count  Dtype  
---  ------                                                                                                              --------------  -----  
 0   Group Name                                                                                                          4892 non-null   object 
 1   User Name                                                                                                           0 non-null      float64
 2   Facebook Id                                                                                                         4892 non-null   int64  
 3   Page Category                                                                                                       4892 non-null   object 
 4   Page Admin Top Country 

# Model

In [22]:
from transformers import RobertaTokenizer

tokenizer: RobertaTokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME, cache_dir=DATA_CACHE)



In [23]:
from transformers import RobertaModel

model = RobertaModel.from_pretrained(
    MODEL_NAME, cache_dir=DATA_CACHE)

model.to(DEVICE)
model.eval()
DEVICE

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'cpu'

## Получение эмбедингов

In [24]:
import torch
import numpy as np
from typing import List
from tqdm import tqdm

def add_text_embeddings(
    texts: List[str], 
    strategy: str = "cls",
    max_length: int = 128, 
    batch_size: int = 64
) -> np.ndarray:
    """
    Generates embeddings for a list of texts.

    Parameters:
    - texts: list of texts to process.
    - strategy: token averaging method ("mean", "cls", "max", "sum").
    - max_length: maximum token length.
    - batch_size: batch size for processing.

    Returns:
    - NumPy array of embeddings (shape: [num_texts, embedding_dim]).
    """
    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
        inputs = tokenizer(
            texts[i:i + batch_size], padding=True, truncation=True, 
            max_length=max_length, return_tensors="pt"
        )
        inputs.to(DEVICE)

        with torch.no_grad():
            outputs = model(**inputs)
        
        hidden_states = outputs.last_hidden_state

        if strategy == "mean":
            batch_embeddings = hidden_states.mean(dim=1)
        elif strategy == "cls":
            batch_embeddings = hidden_states[:, 0, :]
        elif strategy == "max":
            batch_embeddings, _ = hidden_states.max(dim=1)
        elif strategy == "sum":
            batch_embeddings = hidden_states.sum(dim=1)
        else:
            raise ValueError("Invalid strategy. Choose from ['mean', 'cls', 'max', 'sum'].")
        embeddings.append(batch_embeddings.cpu().numpy())
    return np.vstack(embeddings)


In [25]:
embeddings = add_text_embeddings(data_df['text'].to_list(), strategy="cls", max_length=MAX_LENGTH, batch_size=BATCH_SIZE)
import pickle

with open(DATA_PATH / 'facebook_embeddings_roberta.pkl', 'wb') as f:
    pickle.dump(embeddings, f)

Processing batches: 100%|██████████| 306/306 [05:21<00:00,  1.05s/it]


In [26]:
import pickle

with open(DATA_PATH / 'facebook_embeddings_roberta.pkl', 'rb') as f:
    embeddings = pickle.load(f)

## Dimensionality Reduction

In [43]:
import numpy as np
import pandas as pd
import umap
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering
from typing import Literal, Optional, Dict, Any
from scipy.spatial import cKDTree


class EmbeddingVisualizer:
    def __init__(self, embeddings: np.ndarray, data_df: pd.DataFrame):
        """
        A universal class for dimensionality reduction of embeddings and visualization.

        :param embeddings: Embedding array (shape: [num_samples, embedding_dim])
        :param data_df: DataFrame with additional data (e.g., text, predict_1, id)
        """
        self.embeddings = embeddings
        self.data_df = data_df.copy()
        self.reduced_embeddings = None
        self.cluster_labels = None
    
    def reduce_dimensionality(self, method: Literal["pca", "tsne", "umap"], n_components: int = 2):
        """
        Reduces the dimensionality of the embeddings using the specified method.
        
        :param method: Dimensionality reduction method ('pca', 'tsne', 'umap')
        :param n_components: Number of target dimensions (2 or 3 for visualization)
        """
        if method == "pca":
            reducer = PCA(n_components=n_components)
        elif method == "tsne":
            reducer = TSNE(n_components=n_components, perplexity=30, random_state=42)
        elif method == "umap":
            reducer = umap.UMAP(n_components=n_components, random_state=42)
        else:
            raise ValueError("Unsupported method. Use 'pca', 'tsne', or 'umap'.")
        
        self.reduced_embeddings = reducer.fit_transform(self.embeddings)
        for i in range(n_components):
            self.data_df[f"{method}_{i+1}"] = self.reduced_embeddings[:, i]
    
    def cluster_data(self, method: Literal["kmeans", "dbscan", "agglomerative", "spectral"], **kwargs: Dict[str, Any]):
        """
        Applies clustering to the reduced embeddings.

        :param method: Clustering method ('kmeans', 'dbscan', 'agglomerative', 'spectral')
        :param kwargs: Additional parameters for clustering algorithms
        """
        if self.reduced_embeddings is None:
            raise ValueError("Run reduce_dimensionality() before clustering.")

        if method == "kmeans":
            model = KMeans(n_clusters=kwargs.get("n_clusters", 3), random_state=42)
        elif method == "dbscan":
            model = DBSCAN(eps=kwargs.get("eps", 0.5), min_samples=kwargs.get("min_samples", 5))
        elif method == "agglomerative":
            model = AgglomerativeClustering(n_clusters=kwargs.get("n_clusters", 3))
        elif method == "spectral":
            model = SpectralClustering(n_clusters=kwargs.get("n_clusters", 3), random_state=42, assign_labels="discretize")
        else:
            raise ValueError("Unsupported clustering method.")

        self.cluster_labels = model.fit_predict(self.reduced_embeddings)
        self.data_df["cluster"] = self.cluster_labels
    
    def compute_opacity(self, points: np.ndarray, radius: float = 0.1) -> np.ndarray:
        """
        Computes the opacity of points based on their density.
        Now denser points are brighter, and sparse points are dimmer.
        
        :param points: Array of coordinates (Nx2 or Nx3)
        :param radius: Radius for density calculation of points
        :return: Array of opacity values (0.3 - 1.0)
        """
        tree = cKDTree(points)
        densities = np.array([len(tree.query_ball_point(p, radius)) for p in points])
        min_density = np.min(densities)
        max_density = np.max(densities)
        opacities = 0.3 + (densities - min_density) / (max_density - min_density) * 0.7  # Inverts opacity
        return np.clip(opacities, 0.3, 1.0)
    
    def get_data_df(self) -> pd.DataFrame:
        return self.data_df
    
    def visualize(self, method: Literal["pca", "tsne", "umap"], n_components: int = 2, use_clusters: bool = False, use_opacity: bool = True, title: str = "Embedding Visualization"):
        """
        Visualizes the reduced embeddings with interactive points in Plotly.

        :param method: Dimensionality reduction method used ('pca', 'tsne', 'umap')
        :param n_components: Number of dimensions for visualization (2D or 3D)
        :param use_clusters: If True, colors points by cluster labels
        :param use_opacity: If True, applies density-based opacity
        :param title: Plot title
        """

        if self.reduced_embeddings is None or f"{method}_1" not in self.data_df.columns:
            raise ValueError("Run reduce_dimensionality() first.")

        coords = self.reduced_embeddings[:, :n_components]
        opacities = self.compute_opacity(coords) if use_opacity else 1.0

        fig = go.Figure()

        if n_components == 2:
            if use_clusters and self.cluster_labels is not None:
                unique_clusters = np.unique(self.cluster_labels)

                for cluster in unique_clusters:
                    cluster_mask = self.cluster_labels == cluster
                    fig.add_trace(go.Scatter(
                        x=self.data_df.loc[cluster_mask, f"{method}_1"],
                        y=self.data_df.loc[cluster_mask, f"{method}_2"],
                        mode="markers",
                        marker=dict(
                            size=8,
                            opacity=opacities[cluster_mask] if use_opacity else 1.0
                        ),
                        name=f"Cluster {cluster}",
                        customdata=self.data_df.loc[cluster_mask, ["truncated_text", "id"]],
                        hovertemplate=(
                            "<b>Text:</b> %{customdata[0]}<br>"
                            "<b>ID:</b> %{customdata[1]}"
                        )
                    ))
            else:
                # Обычная визуализация (без кластеров)
                fig.add_trace(go.Scatter(
                    x=self.data_df[f"{method}_1"],
                    y=self.data_df[f"{method}_2"],
                    mode="markers",
                    marker=dict(
                        size=8,
                        opacity=opacities if use_opacity else 1.0,
                        color="blue"
                    ),
                    name="Post's text",
                    customdata=self.data_df[["truncated_text", "id"]],
                    hovertemplate=(
                        "<b>Text:</b> %{customdata[0]}<br>"
                        "<b>ID:</b> %{customdata[1]}"
                    )
                ))
        else:
            if use_clusters and self.cluster_labels is not None:
                unique_clusters = np.unique(self.cluster_labels)

                for cluster in unique_clusters:
                    cluster_mask = self.cluster_labels == cluster
                    fig.add_trace(go.Scatter3d(
                        x=self.data_df.loc[cluster_mask, f"{method}_1"],
                        y=self.data_df.loc[cluster_mask, f"{method}_2"],
                        z=self.data_df.loc[cluster_mask, f"{method}_3"],
                        mode="markers",
                        marker=dict(
                            size=5,
                            # opacity=opacities[cluster_mask] if use_opacity else 1.0
                        ),
                        name=f"Cluster {cluster}",
                        customdata=self.data_df.loc[cluster_mask, ["truncated_text", "id"]],
                        hovertemplate=(
                            "<b>Text:</b> %{customdata[0]}<br>"
                            "<b>ID:</b> %{customdata[1]}"
                        )
                    ))
            else:
                fig.add_trace(go.Scatter3d(
                    x=self.data_df[f"{method}_1"],
                    y=self.data_df[f"{method}_2"],
                    z=self.data_df[f"{method}_3"],
                    mode="markers",
                    marker=dict(
                        size=5,
                        # opacity=opacities if use_opacity else 1.0,
                        color="blue"
                    ),
                    name="Post's text",
                    customdata=self.data_df[["truncated_text", "id"]],
                    hovertemplate=(
                        "<b>Text:</b> %{customdata[0]}<br>"
                        "<b>ID:</b> %{customdata[1]}"
                    )
                ))


        fig.update_layout(
            title=title,
            xaxis=dict(
                title=f"{method.upper()} 1 →",
                showline=True,
                linewidth=2,
                linecolor="black",
                mirror=True,
                gridcolor="lightgray",
                gridwidth=0.5,
                zeroline=True,
                zerolinecolor="black",
                zerolinewidth=1.2
            ),
            yaxis=dict(
                title=f"{method.upper()} 2 →",
                showline=True,
                linewidth=2,
                linecolor="black",
                mirror=True,
                gridcolor="lightgray",
                gridwidth=0.5,
                zeroline=True,
                zerolinecolor="black",
                zerolinewidth=1.2
            ),
            template="plotly_white",
            width=1400,
            height=1000,
            legend_title="Clusters" if use_clusters else "Embedding Visualization"
        )

        if n_components == 3:
            fig.update_layout(scene=dict(
                xaxis_title=f"{method.upper()} 1",
                yaxis_title=f"{method.upper()} 2",
                zaxis_title=f"{method.upper()} 3"
            ))

        fig.show()


In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec


def generate_wordcloud(data_df: pd.DataFrame, cluster: int, method: Literal["bow", "word2vec"] = "bow", word2vec_model: Optional[Word2Vec] = None):
    """
    Генерирует облако слов для заданного кластера.

    :param data_df: DataFrame с колонками ["text", "cluster"]
    :param cluster: Номер кластера для фильтрации
    :param method: "bow" (мешок слов) или "word2vec" (поиск ближайших слов)
    :param word2vec_model: Предобученная модель Word2Vec (если метод "word2vec")
    """
    # Фильтруем тексты по кластеру
    cluster_texts = data_df[data_df["cluster"] == cluster]["text"].dropna().tolist()

    if not cluster_texts:
        print(f"Нет текстов в кластере {cluster}.")
        return

    if method == "bow":
        # Используем мешок слов для частотного анализа
        vectorizer = CountVectorizer(stop_words="english")
        word_counts = vectorizer.fit_transform(cluster_texts)
        word_freq = dict(zip(vectorizer.get_feature_names_out(), word_counts.toarray().sum(axis=0)))
    
    elif method == "word2vec":
        if word2vec_model is None:
            raise ValueError("Для метода 'word2vec' необходимо передать обученную модель Word2Vec.")

        # Разделяем тексты на слова
        words = [word for text in cluster_texts for word in text.split()]

        # Получаем список похожих слов с частотами
        word_freq = Counter()
        for word in words:
            if word in word2vec_model.wv:
                similar_words = word2vec_model.wv.most_similar(word, topn=5)
                for sim_word, score in similar_words:
                    word_freq[sim_word] += score

    else:
        raise ValueError("Метод должен быть 'bow' или 'word2vec'.")

    # Создаем облако слов
    wordcloud = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(word_freq)

    # Визуализируем
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Облако слов для кластера {cluster} ({method})")
    plt.show()


## **Параметры `visualize()`**

- **`method: Literal["pca", "tsne", "umap"]`**  
  Метод снижения размерности, который был использован перед визуализацией.  
  - `"pca"` – Метод главных компонент (быстрый, но линейный).  
  - `"tsne"` – t-SNE (хорошо для кластеризации, но медленный).  
  - `"umap"` – UMAP (быстрее t-SNE, но сохраняет больше глобальной структуры).  

- **`n_components: int = 2`**  
  Количество измерений визуализации:  
  - `2` – Отображение в 2D.  
  - `3` – Отображение в 3D.  

- **`use_clusters: bool = False`**  
  Определяет, будут ли точки окрашены по кластерам или нет.  
  - `True` – Цвета точек соответствуют кластерам.  
  - `False` – Цвета точек соответствуют синему цвету.  

- **`use_opacity: bool = True`**  
  Управляет прозрачностью точек на основе плотности распределения.  
  - `True` – Плотные кластеры ярче, разреженные тусклее.  
  - `False` – Все точки одинаково непрозрачны.  


- **`title: str = "Embedding Visualization"`**  
  Заголовок графика. Можно передавать собственное название для различных экспериментов.

## **Параметры `cluster_data()`**

- **`method: Literal["kmeans", "dbscan", "agglomerative", "spectral"]`**  
  Метод кластеризации, который будет применен к уменьшенным эмбеддингам.  
  - `"kmeans"` – K-средних (хорош для плотных, сферических кластеров).  
  - `"dbscan"` – DBSCAN (может находить кластеры разной формы, работает с шумными данными).  
  - `"agglomerative"` – Иерархическая агломеративная кластеризация (не требует указания количества кластеров).  
  - `"spectral"` – Спектральная кластеризация (основана на графах, подходит для сложных структур).  

- **`n_clusters: int = 3`** *(только для `"kmeans"`, `"agglomerative"`, `"spectral"`)*  
  Количество кластеров, которые нужно найти (если метод поддерживает фиксированное число кластеров).  
  - Используется в **`kmeans`**, **`agglomerative`**, **`spectral`**.  
  - Игнорируется в **`dbscan`**, так как он определяет кластеры автоматически.  

- **`eps: float = 0.5`** *(только для `"dbscan"`)*  
  Радиус окрестности для поиска точек в `DBSCAN`. Чем больше `eps`, тем больше точек попадает в кластеры.  
  - Оптимальный `eps` зависит от плотности данных.  

- **`min_samples: int = 5`** *(только для `"dbscan"`)*  
  Минимальное количество точек в группе для формирования кластера в `DBSCAN`.  

- **`random_state: int = 42`** *(по умолчанию, для `"kmeans"`, `"spectral"`, `"agglomerative"`)*  
  Фиксирует случайное состояние для воспроизводимости результатов.  

In [44]:
visualizer = EmbeddingVisualizer(embeddings, data_df)


### PCA

In [45]:
visualizer.reduce_dimensionality("pca", n_components=2)
visualizer.visualize("pca", n_components=2, title="PCA 2D Visualization no clusters", use_clusters=False, use_opacity=True)

In [46]:
visualizer.cluster_data("kmeans", n_clusters=50)
visualizer.visualize("pca", title="PCA 2D Visualization with clusters", use_clusters=True, use_opacity=True)

### TSNE


In [47]:
visualizer.reduce_dimensionality("tsne", n_components=2)
visualizer.visualize("tsne", n_components=2, title="t-SNE 2D Visualization no clusters", use_clusters=False, use_opacity=True)

In [48]:
visualizer.cluster_data("kmeans", n_clusters=50)
visualizer.visualize("tsne", title="t-SNE 2D Visualization with clusters", use_clusters=True, use_opacity=True)

### UMAP

In [49]:
visualizer.reduce_dimensionality("umap", n_components=2)
visualizer.visualize("umap", n_components=2, title="UMAP 2D Visualization no clusters", use_clusters=False, use_opacity=True)


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [50]:
visualizer.cluster_data("kmeans", n_clusters=50)
visualizer.visualize("umap", title="UMAP 2D Visualization with clusters", use_clusters=True, use_opacity=True)