In [1]:
import math

import torch
import pandas as pd
import numpy as np

In [2]:
# Choose the compute backend. Both probes run (MPS first, then CUDA);
# CUDA wins when both are present and CPU is the fallback.
_mps_available = torch.backends.mps.is_available()
_cuda_available = torch.cuda.is_available()
DEVICE = 'cuda' if _cuda_available else ('mps' if _mps_available else 'cpu')

In [3]:
from pathlib import Path

# Folder layout used by the notebook: raw data, download cache, saved models.
DATA_PATH = Path('data/')
DATA_CACHE = Path('data/cache_dir/')
DATA_PATH_SAVE_MODELS = Path('data/models/')

# Create every folder up front; existing folders are left untouched.
for _folder in (DATA_PATH, DATA_CACHE, DATA_PATH_SAVE_MODELS):
    _folder.mkdir(parents=True, exist_ok=True)

# Show up to 500 characters per cell when a DataFrame is displayed.
pd.set_option('display.max_colwidth', 500)

In [4]:
# Checkpoint name passed to the tokenizer/model `from_pretrained` calls below.
MODEL_NAME = "roberta-base"
# Excel workbook, located inside DATA_PATH, that is read into `data_df`.
TEST_DF_NAME = "facebook_data_to_model.xlsx"

# NOTE(review): MAX_LENGTH and BATCH_SIZE are not referenced by any cell visible
# in this notebook - presumably tokenization/batching settings; confirm they are still needed.
MAX_LENGTH = 128
BATCH_SIZE = 16

# Dataset

In [5]:
# Read the workbook and renumber the rows 0..n-1, discarding the index from the file read.
_workbook_path = DATA_PATH / TEST_DF_NAME
data_df = pd.read_excel(_workbook_path)
data_df = data_df.reset_index(drop=True)

In [6]:
# Remove rows in which every column is empty.
data_df = data_df.dropna(how='all')

# Decide which rows carry real text BEFORE the column is cast to str: after
# astype(str) a missing value is just the string 'nan'/'None', so notna() can no
# longer detect it. Placeholders are matched case-insensitively and ignoring
# surrounding whitespace, so 'None', 'NaN' and blank strings are dropped as well.
_text_as_str = data_df['text'].astype(str)
_is_placeholder = _text_as_str.str.strip().str.lower().isin(['', 'none', 'nan'])
_has_text = data_df['text'].notna() & ~_is_placeholder

# .copy() yields an independent frame, so the column assignments below do not
# write into a filtered selection (avoids SettingWithCopyWarning).
data_df = data_df.loc[_has_text].copy()
data_df['text'] = data_df['text'].astype(str)

# Short preview used for hover labels, and a stable id taken from the row index.
data_df["truncated_text"] = data_df["text"].str[:200]
data_df["id"] = data_df.index

In [7]:
# data_df = data_df.sample(100)

In [8]:
# Print a summary of the cleaned frame: row count, column names, non-null counts and dtypes.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4966 entries, 0 to 6858
Data columns (total 44 columns):
 #   Column                                                                                                              Non-Null Count  Dtype  
---  ------                                                                                                              --------------  -----  
 0   Group Name                                                                                                          4966 non-null   object 
 1   User Name                                                                                                           0 non-null      float64
 2   Facebook Id                                                                                                         4966 non-null   int64  
 3   Page Category                                                                                                       4966 non-null   object 
 4   Page Admin Top Country 

# Model

In [9]:
from transformers import RobertaTokenizer

# Load the tokenizer for MODEL_NAME, using DATA_CACHE as the cache directory.
# NOTE(review): no later cell visible in this notebook uses `tokenizer` - confirm it is still needed.
tokenizer: RobertaTokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME, cache_dir=DATA_CACHE)



In [10]:
from transformers import RobertaModel

# Load the pretrained RoBERTa encoder, using DATA_CACHE as the cache directory.
model = RobertaModel.from_pretrained(MODEL_NAME, cache_dir=DATA_CACHE)

# Move the weights to the selected device and switch to evaluation mode.
model = model.to(DEVICE).eval()

# Display which device was selected.
DEVICE

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'cpu'

## Embeddings (Word2Vec)

In [21]:
from gensim.models import KeyedVectors
import numpy as np
from typing import List

def add_word2vec_embeddings(
    texts: List[str], 
    model: "KeyedVectors", 
    strategy: str = "mean"
) -> np.ndarray:
    """
    Generates embeddings for a list of texts using a pre-trained Word2Vec model.

    Each text is split on whitespace; words that are not in the model's
    vocabulary are skipped. A text with no known words gets an all-zero vector.

    Parameters:
    - texts: List of texts to process.
    - model: Pre-trained Word2Vec model (KeyedVectors). Only membership tests
      (`word in model`), item lookup (`model[word]`) and `model.vector_size`
      are used.
    - strategy: Token aggregation strategy ("mean", "max", "sum").

    Returns:
    - NumPy array of embeddings (shape: [num_texts, embedding_dim]); an empty
      input yields shape [0, embedding_dim].

    Raises:
    - ValueError: If `strategy` is not one of the supported names. This is
      checked up front, so an invalid strategy is reported even when no text
      contains a known word.
    """
    aggregators = {"mean": np.mean, "max": np.max, "sum": np.sum}
    if strategy not in aggregators:
        raise ValueError("Invalid strategy. Choose from ['mean', 'max', 'sum'].")
    aggregate = aggregators[strategy]

    embedding_dim = model.vector_size
    embeddings = []
    for text in texts:
        word_vectors = [model[word] for word in text.split() if word in model]

        if word_vectors:
            embeddings.append(aggregate(np.array(word_vectors), axis=0))
        else:
            # No known words: fall back to a zero vector of the right width.
            embeddings.append(np.zeros(embedding_dim))

    if not embeddings:
        # np.array([]) would be 1-D; keep the documented 2-D shape for empty input.
        return np.empty((0, embedding_dim))

    return np.array(embeddings)


In [22]:
import pickle

import gensim.downloader as api
from gensim.models import KeyedVectors

# Ask the gensim downloader for the location of the pretrained vectors
# (return_path=True yields a file path instead of a loaded model).
save_path = api.load("word2vec-google-news-300", return_path=True)
print("Model path:", save_path)

# Load the binary word2vec file into a KeyedVectors lookup.
w2v_model = KeyedVectors.load_word2vec_format(save_path, binary=True)

# One mean-pooled vector per post text.
_post_texts = data_df['text'].to_list()
embeddings = add_word2vec_embeddings(_post_texts, w2v_model, strategy="mean")

# Persist the embedding matrix next to the input data.
_embeddings_file = DATA_PATH / 'facebook_embeddings_word2vec.pkl'
with _embeddings_file.open('wb') as f:
    pickle.dump(embeddings, f)


Model path: C:\Users\anton/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


## Dimensionality Reduction

In [24]:
import numpy as np
import pandas as pd
import umap
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from typing import Literal
from scipy.spatial import cKDTree



class EmbeddingVisualizer:
    """
    Reduces embeddings to a few dimensions and plots them interactively with Plotly.

    Reduced coordinates are stored on a private copy of ``data_df`` in columns
    named ``"<method>_<k>"`` (for example ``pca_1``, ``pca_2``), so the results
    of different methods can coexist and be plotted independently.
    """

    def __init__(self, embeddings: np.ndarray, data_df: pd.DataFrame):
        """
        A universal class for dimensionality reduction of embeddings and visualization.

        :param embeddings: Embedding array (shape: [num_samples, embedding_dim])
        :param data_df: DataFrame with one row per embedding; ``visualize`` reads
            its ``truncated_text`` and ``id`` columns for the hover labels
        :raises ValueError: If the number of embeddings and DataFrame rows differ
        """
        if len(embeddings) != len(data_df):
            raise ValueError(
                f"embeddings has {len(embeddings)} rows but data_df has {len(data_df)} rows."
            )
        self.embeddings = embeddings
        # Work on a copy so the caller's frame never receives the coordinate columns.
        self.data_df = data_df.copy()
        # Output of the most recent reduce_dimensionality() call (None until then).
        self.reduced_embeddings = None
    
    def reduce_dimensionality(self, method: Literal["pca", "tsne", "umap"], n_components: int = 2):
        """
        Reduces the dimensionality of the embeddings using the specified method.

        The result is kept in ``self.reduced_embeddings`` and written to the
        ``"<method>_1" .. "<method>_<n_components>"`` columns of ``self.data_df``.
        
        :param method: Dimensionality reduction method ('pca', 'tsne', 'umap')
        :param n_components: Number of target dimensions (2 or 3 for visualization)
        :raises ValueError: If ``method`` is not supported
        """
        if method == "pca":
            reducer = PCA(n_components=n_components)
        elif method == "tsne":
            # scikit-learn requires perplexity < n_samples; keep the usual 30 and
            # only shrink it for very small inputs.
            perplexity = min(30, max(len(self.embeddings) - 1, 1))
            reducer = TSNE(n_components=n_components, perplexity=perplexity, random_state=42)
        elif method == "umap":
            reducer = umap.UMAP(n_components=n_components, random_state=42)
        else:
            raise ValueError("Unsupported method. Use 'pca', 'tsne', or 'umap'.")
        
        self.reduced_embeddings = reducer.fit_transform(self.embeddings)

        # Remove coordinates left over from an earlier run of the same method so
        # that, e.g., a stale "<method>_3" cannot be mixed with fresh 2D output.
        prefix = f"{method}_"
        stale_columns = [
            column for column in self.data_df.columns
            if isinstance(column, str)
            and column.startswith(prefix)
            and column[len(prefix):].isdigit()
        ]
        self.data_df = self.data_df.drop(columns=stale_columns)

        for i in range(n_components):
            self.data_df[f"{method}_{i+1}"] = self.reduced_embeddings[:, i]
    
    def compute_opacity(self, points: np.ndarray, radius: float = 0.1) -> np.ndarray:
        """
        Computes the opacity of points based on their density.
        Denser points are more opaque, and sparse points are more transparent.

        Density is the number of points (the point itself included) that lie
        within ``radius`` of each point; it is rescaled linearly to 0.3 - 1.0.
        When every point has the same density the rescaling is undefined, so
        all points get full opacity.
        
        :param points: Array of coordinates (Nx2 or Nx3)
        :param radius: Radius for density calculation of points
        :return: Array of opacity values (0.3 - 1.0)
        """
        points = np.asarray(points)
        if len(points) == 0:
            return np.empty(0)

        tree = cKDTree(points)
        # return_length=True returns the neighbour counts directly.
        densities = tree.query_ball_point(points, radius, return_length=True)
        min_density = np.min(densities)
        max_density = np.max(densities)

        if max_density == min_density:
            # Avoid 0/0 -> NaN opacities.
            return np.ones(len(points))

        opacities = 0.3 + (densities - min_density) / (max_density - min_density) * 0.7
        return np.clip(opacities, 0.3, 1.0)

    @staticmethod
    def _axis_style(title: str, with_zeroline: bool = False) -> dict:
        """Builds the shared axis styling; optionally adds a visible zero line."""
        style = dict(
            title=title,
            showline=True,  # Adds border
            linewidth=2,
            linecolor="black",
            mirror=True,  # Border around the whole plot area
            gridcolor="lightgray",
            gridwidth=0.5,  # Makes grid thinner
        )
        if with_zeroline:
            style.update(zeroline=True, zerolinecolor="black", zerolinewidth=1.2)
        return style
    
    def visualize(self, method: Literal["pca", "tsne", "umap"], n_components: int = 2, title: str = "Embedding Visualization"):
        """
        Visualizes the reduced embeddings with interactive points in Plotly.

        Coordinates are read from the ``"<method>_<k>"`` columns, so the plot
        always shows the requested method even if another reduction ran later.
        
        :param method: Method used for dimensionality reduction ('pca', 'tsne', 'umap')
        :param n_components: Dimensionality for visualization (2D or 3D); any value
            other than 2 is drawn as a 3D scatter
        :param title: Plot title
        :raises ValueError: If the coordinate columns needed for the plot are missing
        """
        is_2d = n_components == 2
        axis_columns = [f"{method}_{k}" for k in range(1, 3 if is_2d else 4)]
        missing = [column for column in axis_columns if column not in self.data_df.columns]
        if self.reduced_embeddings is None or missing:
            raise ValueError(f"Call reduce_dimensionality() first. Missing columns: {missing}")

        # Settings shared by the 2D and 3D traces.
        trace_common = dict(
            mode="markers",
            name="Post's text",
            customdata=self.data_df[["truncated_text", "id"]],
            hovertemplate=(
                "<b>Text:</b> %{customdata[0]}<br>" 
                "<b>ID:</b> %{customdata[1]}"
            ),
        )
                
        fig = go.Figure()
        
        if is_2d:
            # Density-based opacity is only used by the 2D trace, so compute it here,
            # from the coordinates of the method that is actually being plotted.
            coords = self.data_df[axis_columns].to_numpy()
            opacities = self.compute_opacity(coords)

            fig.add_trace(go.Scatter(
                x=self.data_df[axis_columns[0]],
                y=self.data_df[axis_columns[1]],
                marker=dict(
                    size=8,
                    opacity=opacities,
                ),
                **trace_common
            ))
        else:
            fig.add_trace(go.Scatter3d(
                x=self.data_df[axis_columns[0]],
                y=self.data_df[axis_columns[1]],
                z=self.data_df[axis_columns[2]],
                marker=dict(
                    size=5,
                ),
                **trace_common
            ))

        label = method.upper()
        fig.update_layout(
            title=title,
            xaxis=self._axis_style(f"{label} 1 →", with_zeroline=True),
            yaxis=self._axis_style(f"{label} 2 →", with_zeroline=True),
            template="plotly_white",
            width=1400,
            height=1000,
            legend_title="Class Labels"
        )

        if n_components == 3:
            # Scene axes carry plain titles (no arrow): this is what the previous
            # pair of consecutive layout updates effectively produced.
            fig.update_layout(scene=dict(
                xaxis=self._axis_style(f"{label} 1"),
                yaxis=self._axis_style(f"{label} 2"),
                zaxis=self._axis_style(f"{label} 3"),
            ))
                
        fig.show()


In [25]:
# Pair the Word2Vec embeddings with the post metadata; the visualizer keeps its own copy of data_df.
visualizer = EmbeddingVisualizer(embeddings, data_df)


### PCA

In [26]:
# Project the embeddings onto 2 PCA components, then draw the interactive scatter plot.
visualizer.reduce_dimensionality("pca", n_components=2)
visualizer.visualize("pca", n_components=2, title="PCA 2D Visualization")

In [27]:
# visualizer.reduce_dimensionality("pca", n_components=3)
# visualizer.visualize("pca", n_components=3, title="PCA 3D Visualization")

### t-SNE


In [28]:
# Compute a 2D t-SNE layout (the class fixes random_state=42), then draw the interactive scatter plot.
visualizer.reduce_dimensionality("tsne", n_components=2)
visualizer.visualize("tsne", n_components=2, title="t-SNE 2D Visualization")

In [29]:
# visualizer.reduce_dimensionality("tsne", n_components=3)
# visualizer.visualize("tsne", n_components=3, title="t-SNE 3D Visualization")

### UMAP

In [30]:
# Compute a 2D UMAP layout (the class fixes random_state=42; see the n_jobs
# notice printed below), then draw the interactive scatter plot.
visualizer.reduce_dimensionality("umap", n_components=2)
visualizer.visualize("umap", n_components=2, title="UMAP 2D Visualization")


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [31]:
# visualizer.reduce_dimensionality("umap", n_components=3)
# visualizer.visualize("umap", n_components=3, title="UMAP 3D Visualization")