In [4]:
!pip install transformers datasets sentence-transformers scikit-learn

Collecting transformers
  Using cached transformers-4.57.0-py3-none-any.whl.metadata (41 kB)
Collecting datasets
  Using cached datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.9.18-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4

In [14]:
#Aparentemente melhora performance
!pip install huggingface_hub[hf_xet]



In [15]:
# Numérico e dados
import numpy as np
import pandas as pd
import torch

# Transformers / Hugging Face
from transformers import AutoTokenizer, AutoModel

# Scikit-learn
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

# Visualização
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go



seed = 42

In [16]:
def get_device():
    """Return the torch device string to use: "cuda" when available, else "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"

def mean_pooling(last_hidden_state, attention_mask):
    """Average token vectors over the sequence axis, ignoring masked positions.

    Args:
        last_hidden_state: tensor of shape [B, T, H].
        attention_mask: tensor of shape [B, T]; positions with 0 are excluded.

    Returns:
        Tensor of shape [B, H] with the masked mean of each sequence.
    """
    # Broadcast the mask to the hidden-state shape so it can weight every feature.
    weights = attention_mask[..., None].expand(last_hidden_state.size()).float()
    weighted_sum = (last_hidden_state * weights).sum(dim=1)
    # Clamp the denominator so a fully masked row divides by 1e-9 instead of 0.
    denom = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / denom

@torch.no_grad()
def encode_sentences(model, tokenizer, sentences, pooling="cls", batch_size=32):
    """Embed sentences with an already-loaded model and tokenizer.

    Args:
        model: loaded model; it is called as ``model(**inputs)`` and its output
            must expose ``last_hidden_state``. The device is taken from its
            first parameter.
        tokenizer: callable that tokenizes a list of strings into tensors.
        sentences: sequence of strings to embed.
        pooling: "cls" takes the vector at position 0 of each sequence;
            "mean" averages token vectors using the attention mask.
        batch_size: number of sentences per forward pass (must be >= 1).

    Returns:
        np.ndarray with one row per sentence. For empty input an array with
        zero rows is returned.

    Raises:
        ValueError: if ``pooling`` is not "cls"/"mean" or ``batch_size`` < 1.
    """
    # Validate up front so a bad argument fails before any forward pass is run.
    if pooling not in ("cls", "mean"):
        raise ValueError("pooling deve ser 'cls' ou 'mean'")
    if batch_size < 1:
        raise ValueError("batch_size deve ser >= 1")

    # Accept any sequence (tuple, Series, ...) while still handing the
    # tokenizer plain lists.
    sentences = list(sentences)
    if not sentences:
        # np.vstack([]) would raise; return an empty matrix instead.
        # Assumes the model exposes config.hidden_size (falls back to 0 columns).
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    device = next(model.parameters()).device  # run on whatever device the model lives on

    chunks = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        hidden = model(**inputs).last_hidden_state

        if pooling == "cls":
            vecs = hidden[:, 0, :]
        else:
            vecs = mean_pooling(hidden, inputs["attention_mask"])
        chunks.append(vecs.cpu().numpy())

    return np.vstack(chunks)

def reduce_2d(X, method="pca", random_state=seed):
    """Project X onto 2 dimensions with PCA.

    NOTE: a later cell in this notebook defines ``reduce_2d`` again (with an
    additional "umap" option); once that cell runs it shadows this definition.

    Args:
        X: data matrix passed to ``PCA.fit_transform``.
        method: only "pca" (case-insensitive) is accepted here.
        random_state: forwarded to ``PCA``.

    Returns:
        (Z, ("PCA", explained_variance_ratio)) where Z is the 2D projection.

    Raises:
        ValueError: if ``method`` is not "pca".
    """
    if method.lower() != "pca":
        raise ValueError("method deve ser 'pca'")

    pca = PCA(n_components=2, random_state=random_state)
    projected = pca.fit_transform(X)
    return projected, ("PCA", pca.explained_variance_ratio_)

In [29]:
# Models to compare: display name -> Hugging Face model id.
# The display name is used verbatim in plot titles and facet labels, so it must
# not carry stray punctuation (the previous key was "BERTUncased:").
modelos = {
    "mBERT": "bert-base-multilingual-cased",
    "BERTUncased": "google-bert/bert-base-uncased",
}

In [30]:
import json

# Load the dwarf race description (nested JSON) that the sentences come from.
with open('JsonSoup/JsonRacas/dwarf.json', mode='r', encoding='utf-8') as json_file:
    data = json.load(json_file)

def extract_text(obj):
    """Recursively collect the usable strings from a nested list/dict structure.

    Strings are stripped; a string is kept only if it is longer than one
    character after stripping and is not one of the placeholder/modifier
    tokens '-', '+2', '+3', '+4', '+5', '+6'. Lists and dict values are
    traversed in order; any other type (numbers, None, ...) is ignored.

    Each call returns a fresh list, so calling the function repeatedly (or
    re-running the cell) never accumulates duplicates in shared state.

    Args:
        obj: a str, list, dict, or any nesting of them.

    Returns:
        list[str]: the collected strings in traversal order.
    """
    skipped_tokens = ('-', '+2', '+3', '+4', '+5', '+6')
    collected = []

    def _walk(node):
        if isinstance(node, str):
            text = node.strip()
            # len > 1 also rejects empty strings and single characters.
            if len(text) > 1 and text not in skipped_tokens:
                collected.append(text)
        elif isinstance(node, list):
            for item in node:
                _walk(item)
        elif isinstance(node, dict):
            for value in node.values():
                _walk(value)

    _walk(obj)
    return collected

# Flatten the loaded JSON into the list of sentences used by every later cell,
# then print the list for inspection.
sentencas = extract_text(data)
print(sentencas)

["Player's Handbook", 'Kingdoms rich in ancient grandeur, halls carved into the roots of mountains, the echoing of picks and hammers in deep mines and blazing forges, a commitment to clan and tradition, and a burning hatred of goblins and orcs –\xa0these common threads unite all dwarves.', 'Ability Score Increase. Your Constitution score increases by 2.', "Age. Dwarves mature at the same rate as humans, but they're considered young until they reach the age of 50. On average, they live about 350 years.", 'Alignment. Most dwarves are lawful, believing firmly in the benefits of a well-ordered society. They tend toward good as well, with a strong sense of fair play and a belief that everyone deserves to share in the benefits of a just order.', 'Size. Dwarves stand between 4 and 5 feet tall and average about 150 pounds. Your size is Medium.', 'Speed. Your base walking speed is 25 feet. Your speed is not reduced by wearing heavy armor.', "Darkvision. Accustomed to life underground, you have 

In [31]:
# Maps (display name, pooling label) -> embedding matrix with one row per sentence.
embeds = {}

# The target device does not change between models, so resolve it once.
device = get_device()

for nome_visivel, nome_hf in modelos.items():
    print(f"🔄 Processing {nome_visivel}...")

    # Load tokenizer and model once per checkpoint and switch to inference mode.
    tokenizer = AutoTokenizer.from_pretrained(nome_hf)
    model = AutoModel.from_pretrained(nome_hf).to(device)
    model.eval()

    # Encode the same sentences with both pooling strategies.
    X_cls = encode_sentences(model, tokenizer, sentencas, pooling="cls")
    X_mean = encode_sentences(model, tokenizer, sentencas, pooling="mean")

    embeds.update({
        (nome_visivel, "CLS"): X_cls,
        (nome_visivel, "MEAN"): X_mean,
    })

    print(f"✅ {nome_visivel} done! CLS: {X_cls.shape}, MEAN: {X_mean.shape}")

    # Drop the references before the next checkpoint is loaded and release
    # cached GPU memory when running on CUDA.
    del model, tokenizer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print("\n📊 Final shapes:")
print({key: matrix.shape for key, matrix in embeds.items()})

🔄 Processing mBERT...
✅ mBERT done! CLS: (21, 768), MEAN: (21, 768)
🔄 Processing BERTUncased:...
✅ BERTUncased: done! CLS: (21, 768), MEAN: (21, 768)

📊 Final shapes:
{('mBERT', 'CLS'): (21, 768), ('mBERT', 'MEAN'): (21, 768), ('BERTUncased:', 'CLS'): (21, 768), ('BERTUncased:', 'MEAN'): (21, 768)}


In [32]:

def plot_similarity_heatmap_px(
    X,
    sentences=None,          # list of sentences shown on hover
    title="Similaridade (cosseno)",
    cmap="Blues",
    mask_upper=False,        # True: show only the lower triangle
    vmin=0.0, vmax=1.0,
    cbar_label="Similaridade",
    width=700, height=600,
    fmt=".2f",               # format spec for the in-cell label
    text_font_size=11,
    text_font_color="black",
    xgap=1, ygap=1           # gap between cells (acts as grid lines)
):
    """Show a Plotly heatmap of pairwise cosine similarity between the rows of X.

    Rows are labelled S1..Sn. Each visible cell displays its similarity value
    formatted with ``fmt``; the hover box shows the two labels together with
    the corresponding entries of ``sentences`` (or the labels again when
    ``sentences`` is None).

    Args:
        X: matrix passed to ``cosine_similarity`` (one row per item).
        sentences: optional sequence indexed like the rows of X.
        title, cmap, vmin, vmax, cbar_label, width, height: figure appearance.
        mask_upper: when True the strict upper triangle is set to NaN and left
            without label or hover text.
        fmt, text_font_size, text_font_color: in-cell label formatting.
        xgap, ygap: spacing between heatmap cells.

    Returns:
        (S, fig): the full (unmasked) similarity matrix and the figure.
        The figure is also displayed via ``fig.show()``.
    """
    # 1) Similarity matrix and short row/column labels
    S = cosine_similarity(X)
    n = S.shape[0]
    labels = [f"S{idx + 1}" for idx in range(n)]

    # 2) Working copy for display; optionally blank out the upper triangle
    Z = np.array(S, dtype=float)
    if mask_upper:
        Z[np.triu_indices(n, k=1)] = np.nan

    # 3) In-cell labels and 4) hover text, filled in a single pass.
    #    Masked (NaN) cells keep the empty string in both matrices.
    hover_names = labels if sentences is None else sentences
    text_matrix = np.full((n, n), "", dtype=object)
    customdata = np.full((n, n), "", dtype=object)
    for row in range(n):
        for col in range(n):
            value = Z[row, col]
            if np.isnan(value):
                continue
            text_matrix[row, col] = f"{value:{fmt}}"
            customdata[row, col] = (
                f"<b>{labels[row]}</b>: {hover_names[row]}"
                f"<br><b>{labels[col]}</b>: {hover_names[col]}"
            )

    # 5) Heatmap trace
    heatmap = go.Heatmap(
        z=Z,
        x=labels,
        y=labels,
        zmin=vmin, zmax=vmax,
        colorscale=cmap,
        colorbar=dict(title=cbar_label),
        # in-cell labels
        text=text_matrix,
        texttemplate="%{text}",
        textfont=dict(color=text_font_color, size=text_font_size),
        # hover shows only the sentences
        customdata=customdata,
        hovertemplate="%{customdata}<extra></extra>",
        # gaps between cells
        xgap=xgap, ygap=ygap
    )
    fig = go.Figure(data=heatmap)

    # Layout
    fig.update_layout(
        title=title,
        width=width, height=height,
        template="plotly_white",
        margin=dict(l=70, r=40, t=70, b=70),
    )
    # Square cells, with the first row at the top
    fig.update_yaxes(autorange="reversed", scaleanchor="x", scaleratio=1)

    fig.show()
    return S, fig


def run_kmeans(X, n_clusters=3, random_state=seed):
    """Cluster the rows of X with KMeans and return one cluster label per row.

    Args:
        X: data matrix passed to ``KMeans.fit_predict``.
        n_clusters: number of clusters.
        random_state: forwarded to ``KMeans``.

    Returns:
        The array of cluster labels produced by ``fit_predict``.
    """
    clusterer = KMeans(n_clusters=n_clusters, n_init='auto', random_state=random_state)
    return clusterer.fit_predict(X)


# ===== Usage =====
# For every (model, pooling) embedding: draw the similarity heatmap and cluster
# the sentences, collecting one tidy frame per embedding.
heatmap_opts = dict(
    cmap="Blues",
    mask_upper=False,     # set True to show only the lower triangle
    vmin=0.0, vmax=1.0,
    cbar_label="Similaridade",
    fmt=".2f",
    xgap=1, ygap=1,
)

resultados = []
for (modelo, pooling), X in embeds.items():
    # Similarity heatmap; sentences are passed so they appear on hover.
    S, _ = plot_similarity_heatmap_px(
        X,
        sentences=sentencas,
        title=f"Similaridade ({modelo}, {pooling})",
        **heatmap_opts,
    )

    # Clustering
    labels = run_kmeans(X, n_clusters=3, random_state=seed)
    frame = pd.DataFrame({
        "modelo": modelo,
        "pooling": pooling,
        "sentenca": sentencas,
        "cluster": labels
    })
    resultados.append(frame)

clusters_df = pd.concat(resultados, ignore_index=True)

In [33]:
# Projeção 2D + scatter


def reduce_2d(X, method="pca", random_state=seed, n_neighbors=15, min_dist=0.1):
    """Project X onto 2 dimensions with PCA or UMAP.

    Args:
        X: data matrix passed to the reducer's ``fit_transform``.
        method: "pca" or "umap" (case-insensitive).
        random_state: forwarded to the reducer.
        n_neighbors: UMAP only.
        min_dist: UMAP only.

    Returns:
        (Z, meta): Z is the 2D projection; meta is
        ("PCA", explained_variance_ratio) or ("UMAP", None).

    Raises:
        ValueError: if ``method`` is neither "pca" nor "umap".
        ImportError: if ``method`` is "umap" and the package is not installed.
    """
    method = method.lower()

    if method == "pca":
        reducer = PCA(n_components=2, random_state=random_state)
        Z = reducer.fit_transform(X)
        return Z, ("PCA", reducer.explained_variance_ratio_)

    if method == "umap":
        # The notebook never imports umap at the top, so this branch used to
        # fail with NameError. Import it lazily so the PCA path does not
        # require the optional dependency.
        try:
            import umap
        except ImportError as exc:
            raise ImportError(
                "method='umap' requer o pacote 'umap-learn' (pip install umap-learn)"
            ) from exc

        reducer = umap.UMAP(
            n_components=2,
            random_state=random_state,
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            metric="euclidean",
        )
        Z = reducer.fit_transform(X)
        return Z, ("UMAP", None)

    raise ValueError("method deve ser 'pca' ou 'umap'")

def plot_scatter_embeddings(embeds, sentences, method="pca", random_state=seed):
    """Build one DataFrame with the 2D projection of every (model, pooling) embedding.

    Despite its name, this function does not draw anything; it only prepares
    the data for plotting.

    Args:
        embeds: dict[(modelo, pooling)] -> np.ndarray [N, H].
        sentences: list of N sentences, in the same order as the rows.
        method: projection method forwarded to ``reduce_2d``.
        random_state: forwarded to ``reduce_2d``.

    Returns:
        DataFrame with columns x, y, modelo, pooling, sentenca, containing the
        rows of all embeddings concatenated in dict order.
    """
    frames = []
    for (modelo, pooling), X in embeds.items():
        # coords: [N, 2]; the projection metadata is not needed here.
        coords, _ = reduce_2d(X, method=method, random_state=random_state)
        frames.append(pd.DataFrame({
            "x": coords[:, 0],
            "y": coords[:, 1],
            "modelo": modelo,
            "pooling": pooling,
            "sentenca": sentences,   # same column name as in clusters_df
        }))

    return pd.concat(frames, ignore_index=True)

# --- 1) Prepare the 2D data (PCA or UMAP) ---
df2d = plot_scatter_embeddings(embeds, sentences=sentencas, method="pca")

# Attach the cluster labels by sentence POSITION inside each (modelo, pooling)
# group rather than by sentence text: merging on the text duplicates rows when
# the same sentence appears more than once. This relies on clusters_df and df2d
# both listing the sentences in the order of `sentencas` within each group.
group_cols = ["modelo", "pooling"]
df2d["pos"] = df2d.groupby(group_cols).cumcount()
clusters_keyed = clusters_df.assign(pos=clusters_df.groupby(group_cols).cumcount())
df2d = df2d.merge(
    clusters_keyed[group_cols + ["pos", "cluster"]],
    on=group_cols + ["pos"],
    how="left",
)

# Short id for each sentence (S1, S2, ...), derived from its position so that
# repeated sentences keep distinct ids (and without an O(n) list lookup per row).
df2d["sid"] = "S" + (df2d["pos"] + 1).astype(str)
df2d = df2d.drop(columns="pos")

# --- 2) Make 'cluster' categorical (guarantees a discrete legend) ---
df2d["cluster"] = df2d["cluster"].astype(str)

# --- 3) Explicit facet order ---
# Every model present in the data, in order of first appearance.
model_order = df2d["modelo"].unique().tolist()
pool_order  = sorted(df2d["pooling"].unique())
cluster_order = sorted(df2d["cluster"].unique())

# --- 4) Discrete palette for the clusters ---
# A sequential Blues palette was tried first; fixed high-contrast colours are
# used instead so the clusters are easier to tell apart.
color_seq = ["#FFD700", "#00C853", "#D32F2F"]

# --- 5) Interactive scatter with facets and detailed hover ---
fig = px.scatter(
    df2d,
    x="x", y="y",
    color="cluster",
    color_discrete_sequence=color_seq,
    facet_col="modelo",
    facet_row="pooling",
    facet_col_spacing=0.08,
    facet_row_spacing=0.10,
    category_orders={
        "modelo": model_order,
        "pooling": pool_order,
        "cluster": cluster_order
    },
    # The order of these keys defines the customdata indices used in the
    # hovertemplate below.
    hover_data={
        "sid": True,
        "sentenca": True,
        "modelo": True,
        "pooling": True,
        "cluster": True,
        "x": ':.3f',
        "y": ':.3f'
    },
    title="Embeddings 2D por Modelo (colunas) e Pooling (linhas) — PCA"
)

fig.update_layout(
    template="plotly_white",
    legend_title_text="Cluster",
    margin=dict(l=40, r=20, t=60, b=40),
)

fig.update_traces(
    marker=dict(size=10, line=dict(width=0)),
    opacity=0.9,
    hovertemplate=(
        "<b>%{customdata[0]}</b><br>"     # sid
        "Sentença: %{customdata[1]}<br>"
        "Modelo: %{customdata[2]} | Pooling: %{customdata[3]}<br>"
        "Cluster: %{customdata[4]}<br>"
        "x: %{x:.3f} | y: %{y:.3f}<extra></extra>"
    )
)

# Frames around each facet (optional)
for xaxis_name in [k for k in fig.layout if k.startswith("xaxis")]:
    xdom = getattr(fig.layout, xaxis_name).domain
    suffix = xaxis_name[5:]  # "" or "2", "3", ...
    yaxis_name = "yaxis" + suffix
    if hasattr(fig.layout, yaxis_name):
        ydom = getattr(fig.layout, yaxis_name).domain
        fig.add_shape(
            type="rect",
            xref="paper", yref="paper",
            x0=xdom[0], x1=xdom[1], y0=ydom[0], y1=ydom[1],
            line=dict(color="rgba(0,0,0,0.28)", width=1),
            fillcolor="rgba(0,0,0,0)",
            layer="below"
        )

fig.show()

Como as frases utilizadas estão em inglês, analisamo-las com dois modelos: mBert (multilíngue) e BertUncased (treinado apenas em inglês).

---

### mBert com CLS

Dos 3 clusters, um tem apenas uma sentença, um tem apenas duas e o outro tem o resto.
Como a distribuição está muito irregular, não aparenta ser um bom embedding.

---

### mBert com MEAN

Um dos clusters possui apenas 3 sentenças:
- "Player's handbook"
- "Hill dwarf"
- "Mountain dwarf"

Por serem as mais curtas, são outliers na maioria dos embeddings, mas nem sempre ficam todas no mesmo cluster.
Os outros dois são similares em número de sentenças e próximos no gráfico.

---

### BERTUncased com CLS

Há um cluster com apenas 3 sentenças.
Essas são todas sentenças de "Ability score increase", então faz sentido que fiquem próximas.
Os outros dois são similares em quantidade.
Ambos os outliers citados anteriormente estão no cluster 2, mas não tão próximos quanto no mBert com MEAN.

---

### BERTUncased com MEAN

O cluster 2 contém apenas "Player's handbook".
O cluster 1 tem apenas 4 sentenças e o restante está no 0.
Os outliers "Hill dwarf" e "Mountain dwarf" estão no cluster 0, mas longe do resto.

---

### Conclusão

No geral, o modelo mais equilibrado para o conjunto de sentenças analisado foi o mBert com MEAN.
