In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import plotly.graph_objects as go


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DATA_DIR = Path("data")
CHUNKS_PATH = DATA_DIR / "chunks_with_clusters.parquet"
PCA_EMB_PATH = DATA_DIR / "chunk_embeddings_pca.npy"

df = pd.read_parquet(CHUNKS_PATH)
chunk_emb_pca = np.load(PCA_EMB_PATH)

print("df shape:", df.shape)
print("embeddings shape:", chunk_emb_pca.shape)
df.head()


df shape: (150, 9)
embeddings shape: (150, 50)


Unnamed: 0,chunk_id,entry_id,chunk_index,chunk_text,timestamp,cluster_hdbscan,cluster_kmeans,umap_x,umap_y
0,1_c0,1,0,"I got frustrated over something small today, b...",2024-01-01,-1,5,3.530389,7.156374
1,2_c0,2,0,My mind kept looping over tiny details that sh...,2024-01-02,-1,2,3.600232,7.700221
2,3_c0,3,0,I enjoyed a quiet moment today that made me fe...,2024-01-03,5,0,1.358921,1.438396
3,4_c0,4,0,"I tried grounding techniques, but the tension ...",2024-01-04,-1,7,0.770253,5.550629
4,5_c0,5,0,I spent some time analyzing my reactions today...,2024-01-05,2,0,4.185555,8.442603


In [3]:
CLUSTER_COL = "cluster_kmeans"  # or "cluster_hdbscan"

# If using HDBSCAN, drop noise cluster
if CLUSTER_COL == "cluster_hdbscan":
    df = df[df[CLUSTER_COL] != -1].copy()

# Ignore very tiny clusters for now
min_cluster_size = 3
cluster_sizes = df[CLUSTER_COL].value_counts()
valid_clusters = cluster_sizes[cluster_sizes >= min_cluster_size].index

df = df[df[CLUSTER_COL].isin(valid_clusters)].copy()
df.reset_index(drop=True, inplace=True)

print("Using clusters:", sorted(valid_clusters.tolist()))
print("Cluster sizes:\n", df[CLUSTER_COL].value_counts())


Using clusters: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Cluster sizes:
 cluster_kmeans
0    32
4    21
6    20
5    20
2    15
1    14
8     9
9     8
3     6
7     5
Name: count, dtype: int64


In [4]:
cluster_info = {}

for cid in sorted(valid_clusters):
    mask = df[CLUSTER_COL] == cid
    idxs = np.where(mask)[0]
    emb_cluster = chunk_emb_pca[idxs]  # (n_cluster, dim)

    # centroid
    centroid = emb_cluster.mean(axis=0, keepdims=True)  # shape (1, dim)

    # cosine similarity with centroid
    sims = cosine_similarity(emb_cluster, centroid).ravel()
    medoid_local_idx = sims.argmax()
    medoid_global_idx = idxs[medoid_local_idx]

    medoid_text = df.loc[medoid_global_idx, "chunk_text"]
    cluster_size = len(idxs)

    cluster_info[cid] = {
        "centroid": centroid.flatten(),
        "medoid_idx": int(medoid_global_idx),
        "medoid_text": medoid_text,
        "size": int(cluster_size),
    }

len(cluster_info), list(cluster_info.keys())[:5]


(10, [0, 1, 2, 3, 4])

In [5]:
cluster_info = {}

for cid in sorted(valid_clusters):
    mask = df[CLUSTER_COL] == cid
    idxs = np.where(mask)[0]
    emb_cluster = chunk_emb_pca[idxs]  # (n_cluster, dim)

    # centroid
    centroid = emb_cluster.mean(axis=0, keepdims=True)  # (1, dim)

    # cosine similarity with centroid to find medoid
    sims = cosine_similarity(emb_cluster, centroid).ravel()
    medoid_local_idx = sims.argmax()
    medoid_global_idx = idxs[medoid_local_idx]

    medoid_text = df.loc[medoid_global_idx, "chunk_text"]
    cluster_size = len(idxs)

    cluster_info[cid] = {
        "centroid": centroid.flatten(),
        "medoid_idx": int(medoid_global_idx),
        "medoid_text": medoid_text,
        "size": int(cluster_size),
    }

len(cluster_info), list(cluster_info.keys())[:5]


(10, [0, 1, 2, 3, 4])

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Emotion model device:", device)

emo_model_name = "SamLowe/roberta-base-go_emotions"
emo_tok = AutoTokenizer.from_pretrained(emo_model_name)
emo_model = AutoModelForSequenceClassification.from_pretrained(emo_model_name).to(device)
emo_model.eval()

id2label = emo_model.config.id2label


Emotion model device: cuda


In [7]:
# Very rough mapping; good enough for MVP
valence_map = {
    # negative
    "sadness": -0.8, "disappointment": -0.7, "grief": -0.9,
    "anger": -0.7, "annoyance": -0.4, "disgust": -0.8,
    "fear": -0.8, "nervousness": -0.7, "embarrassment": -0.6,
    "remorse": -0.7, "guilt": -0.7, "confusion": -0.4,

    # neutral-ish
    "neutral": 0.0, "curiosity": 0.1,

    # positive
    "joy": 0.9, "love": 0.8, "admiration": 0.7,
    "amusement": 0.6, "relief": 0.5, "optimism": 0.7,
    "pride": 0.6, "gratitude": 0.8, "excitement": 0.7,
}

arousal_map = {
    # low arousal
    "sadness": 0.3, "disappointment": 0.3, "grief": 0.4,
    "neutral": 0.2, "remorse": 0.4, "guilt": 0.4,

    # medium
    "confusion": 0.5, "curiosity": 0.5, "love": 0.5,
    "admiration": 0.5, "gratitude": 0.5, "relief": 0.4,

    # high
    "anger": 0.8, "annoyance": 0.7, "disgust": 0.8,
    "fear": 0.9, "nervousness": 0.8, "embarrassment": 0.7,
    "joy": 0.7, "amusement": 0.6, "optimism": 0.6,
    "pride": 0.6, "excitement": 0.9,
}

def text_to_valence_arousal(text: str):
    inputs = emo_tok(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=256
    ).to(device)

    with torch.no_grad():
        logits = emo_model(**inputs).logits
        probs = F.softmax(logits, dim=-1).squeeze()

    top_idx = probs.argmax().item()
    label = id2label[top_idx].lower()

    val = valence_map.get(label, 0.0)
    aro = arousal_map.get(label, 0.5)
    return float(val), float(aro), label


In [8]:
for cid, info in cluster_info.items():
    medoid_text = info["medoid_text"]
    v, a, lab = text_to_valence_arousal(medoid_text)
    info["valence"] = v
    info["arousal"] = a
    info["emotion_label"] = lab

cluster_info


{0: {'centroid': array([ 3.95499796e-01, -9.94526781e-03,  1.12402998e-01, -8.31348449e-02,
         -3.82216871e-02, -6.13275403e-03,  3.59770395e-02,  6.62027160e-03,
          2.85410993e-02,  1.36743197e-02,  7.91338831e-03, -5.48537541e-03,
          1.93765499e-02, -1.02183912e-02, -2.16120947e-02,  1.17468955e-02,
         -2.92335986e-03,  2.16428004e-03,  2.49690446e-03, -7.26087717e-03,
          1.63651537e-02,  1.04333498e-02,  1.14873797e-03,  2.31229141e-03,
         -1.93835935e-03, -4.86374367e-03,  9.18735936e-03, -3.39695159e-03,
         -1.17680719e-02,  1.05004222e-03, -4.56707878e-03,  3.95916868e-03,
          7.31407572e-03, -3.01450700e-03,  1.22487638e-03, -5.93567220e-03,
          2.83268839e-03,  5.66841988e-03,  3.72733199e-03, -4.50586760e-03,
         -2.15561595e-05,  5.56077342e-03, -3.89980269e-04, -7.18748430e-04,
          1.38310622e-03, -5.54287253e-05, -3.76416574e-04, -3.10756685e-03,
         -1.14325143e-03, -1.34931714e-03], dtype=float32),
 

In [9]:
rows = []
for cid, info in cluster_info.items():
    full_text = info["medoid_text"].replace("\n", " ")
    short_text = full_text
    if len(short_text) > 80:
        short_text = short_text[:77] + "..."

    rows.append({
        "cluster_id": cid,
        "size": info["size"],
        "valence": info["valence"],
        "arousal": info["arousal"],
        "emotion_label": info["emotion_label"],
        "medoid_text_full": full_text,
        "medoid_text_short": short_text,
    })

df_clusters = pd.DataFrame(rows)
df_clusters


Unnamed: 0,cluster_id,size,valence,arousal,emotion_label,medoid_text_full,medoid_text_short
0,0,32,0.9,0.7,joy,I enjoyed a quiet moment today that made me fe...,I enjoyed a quiet moment today that made me fe...
1,1,14,0.0,0.5,realization,I had a moment where the future didn’t feel so...,I had a moment where the future didn’t feel so...
2,2,15,0.0,0.2,neutral,My mind felt cluttered with responsibilities. ...,My mind felt cluttered with responsibilities. ...
3,3,6,-0.8,0.3,sadness,I felt nostalgic thinking about old friends an...,I felt nostalgic thinking about old friends an...
4,4,21,0.0,0.5,realization,A familiar smell brought back memories I hadn’...,A familiar smell brought back memories I hadn’...
5,5,20,0.0,0.2,neutral,Trying to keep up with everything drained me m...,Trying to keep up with everything drained me m...
6,6,20,-0.7,0.3,disappointment,It felt like I was dragging myself through the...,It felt like I was dragging myself through the...
7,7,5,0.7,0.6,optimism,"I tried grounding techniques, but the tension ...","I tried grounding techniques, but the tension ..."
8,8,9,0.0,0.2,neutral,I’ve been thinking a lot about why I respond t...,I’ve been thinking a lot about why I respond t...
9,9,8,-0.7,0.8,nervousness,"I felt anxious today, like something bad was a...","I felt anxious today, like something bad was a..."


In [10]:
# matrix of centroids in the same order as df_clusters
ordered_ids = df_clusters["cluster_id"].tolist()
centroids = np.vstack([cluster_info[cid]["centroid"] for cid in ordered_ids])

sim_matrix = cosine_similarity(centroids, centroids)

top_k = 3  # connect each cluster to its top 3 neighbors
edges = []

for i, cid in enumerate(ordered_ids):
    sims = sim_matrix[i]
    neighbor_idx = sims.argsort()[::-1]  # descending
    count = 0
    for j in neighbor_idx:
        if j == i:
            continue  # skip self
        if count >= top_k:
            break
        neighbor_cid = ordered_ids[j]
        weight = sims[j]
        edges.append((cid, neighbor_cid, float(weight)))
        count += 1

len(edges), edges[:5]


(30,
 [(0, 3, -0.0017505058785900474),
  (0, 7, -0.04863563925027847),
  (0, 6, -0.07834035903215408),
  (1, 9, 0.09183979034423828),
  (1, 2, 0.03525632247328758)])

In [11]:
def show_cluster_examples(cluster_id, n_examples=10):
    """
    Show up to n_examples chunks belonging to the given cluster_id.
    """
    mask = df[CLUSTER_COL] == cluster_id
    subset = df.loc[mask, ["entry_id", "chunk_text"]].head(n_examples)

    if subset.empty:
        print(f"No chunks for cluster {cluster_id}")
        return

    for i, row in subset.iterrows():
        print(f"Entry ID: {row['entry_id']}")
        print(row["chunk_text"])
        print("-" * 80)


In [12]:
# map cluster_id → (valence, arousal)
pos = {
    row["cluster_id"]: (row["valence"], row["arousal"])
    for _, row in df_clusters.iterrows()
}

edge_x = []
edge_y = []

for src, dst, weight in edges:
    x0, y0 = pos[src]
    x1, y1 = pos[dst]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    mode="lines",
    line=dict(
        width=0.8,
        color="rgba(0, 0, 150, 0.3)",
    ),
    hoverinfo="none",
)


In [13]:
sizes = df_clusters["size"]
size_scaled = 10 + 5 * np.log1p(sizes)

node_trace = go.Scatter(
    x=df_clusters["valence"],
    y=df_clusters["arousal"],
    mode="markers",
    marker=dict(
        size=size_scaled,
        color=df_clusters["valence"],
        colorscale="RdBu",
        reversescale=True,
        showscale=True,
        colorbar=dict(
            title="Valence",
            tickmode="linear",
        ),
    ),
    hovertemplate=(
        "<b>Cluster %{customdata[0]}</b><br>"
        "Size: %{customdata[1]}<br>"
        "Emotion: %{customdata[2]}<br>"
        "Valence: %{x:.2f}<br>"
        "Arousal: %{y:.2f}<br><br>"
        "%{customdata[3]}<extra></extra>"
    ),
    customdata=np.stack(
        [
            df_clusters["cluster_id"],
            df_clusters["size"],
            df_clusters["emotion_label"],
            df_clusters["medoid_text_full"],
        ],
        axis=1,
    ),
)

fig = go.Figure(data=[edge_trace, node_trace])
fig.update_layout(
    title="Emotional Mind Map of Clusters",
    xaxis=dict(
        title="Valence (Negative ⟵⟵ Positive)",
        range=[-1.05, 1.05],
    ),
    yaxis=dict(
        title="Arousal (Calm ⟵⟵ Intense)",
        range=[0.0, 1.0],
    ),
    showlegend=False,
    width=900,
    height=700,
)

fig.show()


In [14]:
def show_cluster_examples(cluster_id, n_examples=10):
    """
    Show up to n_examples chunks belonging to the given cluster_id.
    """
    mask = df[CLUSTER_COL] == cluster_id
    subset = df.loc[mask, ["entry_id", "chunk_text"]].head(n_examples)

    if subset.empty:
        print(f"No chunks for cluster {cluster_id}")
        return

    for _, row in subset.iterrows():
        print(f"Entry ID: {row['entry_id']}")
        print(row["chunk_text"])
        print("-" * 80)

show_cluster_examples(cluster_id=0, n_examples=5)


Entry ID: 3
I enjoyed a quiet moment today that made me feel genuinely content. I wish those moments lasted longer.
--------------------------------------------------------------------------------
Entry ID: 5
I spent some time analyzing my reactions today. I realized I still repeat patterns I thought I had abandoned. I had a surprisingly peaceful day.
--------------------------------------------------------------------------------
Entry ID: 6
Today made me aware of some habits I want to break. I’m tired of repeating the same cycles. I got frustrated over something small today, but it spiraled into anger.
--------------------------------------------------------------------------------
Entry ID: 7
I had a surprisingly peaceful day. Nothing extraordinary happened, but it felt calm.
--------------------------------------------------------------------------------
Entry ID: 16
Today gave me a little spark of hope. I don’t know why, but it felt nice to breathe without heaviness for once.
----