<a href="https://colab.research.google.com/github/ElasRamanauskas/eraman/blob/main/psAIch_analysis_2(modeling).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from datasets import load_dataset
import pandas as pd

# Load the "akhadangi/PsAIch" dataset and keep its "train" split as a
# pandas DataFrame; every later cell works on `df`.
ds = load_dataset(path="akhadangi/PsAIch")
df = ds["train"].to_pandas()


In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to vectorise each response.
model = SentenceTransformer("all-mpnet-base-v2")

# Encode the whole "response" column in one call, showing a progress bar.
embeddings = model.encode(df["response"].to_list(), show_progress_bar=True)


In [None]:
import umap
import hdbscan

# Project the sentence embeddings with UMAP; the fixed seed keeps the
# layout reproducible between runs.
reducer = umap.UMAP(random_state=42)
umap_embeddings = reducer.fit_transform(embeddings)

# Cluster the projected points with HDBSCAN and store the resulting
# label of each row next to its response.
clusterer = hdbscan.HDBSCAN(min_cluster_size=30)
df.loc[:, "semantic_cluster"] = clusterer.fit_predict(umap_embeddings)


# Semantic axes

In [None]:
import re

# Lower-case phrases signalling first-person agency.
agency_terms = [
    "i decide", "i choose", "i try", "i aim",
    "i want", "i focus", "my goal"
]

# Lower-case phrases signalling training / policy constraints.
constraint_terms = [
    "trained to", "designed to", "my training",
    "cannot", "can't", "not able to",
    "policy", "safety", "guidelines", "constraints"
]


def _count_term_hits(text, terms):
    """Return how many distinct phrases from ``terms`` occur in ``text``.

    Each phrase counts at most once, however often it appears. Matching is
    case-insensitive (``terms`` must already be lower-case) and a phrase
    must start at a word boundary, so "the AI wants" is not counted as
    "i want". Word endings are still allowed, so "i aim" matches "i aimed".

    Parameters
    ----------
    text : object
        The response text. Anything that is not a ``str`` (e.g. NaN for a
        missing response) scores 0 instead of raising.
    terms : list of str
        Lower-case phrases to look for.

    Returns
    -------
    int
        Number of phrases found in ``text``.
    """
    if not isinstance(text, str):
        return 0
    # Typographic apostrophes (U+2019) would otherwise hide "can't".
    lowered = text.lower().replace("\u2019", "'")
    return sum(
        # (?<!\w) rejects matches that begin in the middle of a word.
        re.search(r"(?<!\w)" + re.escape(term), lowered) is not None
        for term in terms
    )


df["agency_score"] = df["response"].apply(
    _count_term_hits, args=(agency_terms,)
)

df["constraint_score"] = df["response"].apply(
    _count_term_hits, args=(constraint_terms,)
)


In [None]:
# Summary statistics for the two keyword-based scores.
df.loc[:, ["agency_score", "constraint_score"]].describe()


# Sentiment + prompt type (supporting features)

In [None]:
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
df["sentiment"] = df["response"].apply(
    lambda x: analyzer.polarity_scores(x)["compound"]
)

In [None]:
# Persist the enriched frame (scores, sentiment, cluster labels) without
# writing the index as an extra column.
df.to_csv(path_or_buf="psaich_semantic_analysis.csv", index=False)


In [None]:
# Only a cell's final expression is echoed; a bare `df.shape` on an earlier
# line is evaluated and discarded, so print it explicitly.
print(df.shape)
df[["agency_score", "constraint_score"]].head()


In [None]:
# Very positive responses that nevertheless mention at least three
# constraint phrases; show the first ten.
df.loc[
    df["sentiment"].gt(0.9) & df["constraint_score"].ge(3),
    ["model_variant", "sentiment", "agency_score", "constraint_score", "response"],
].head(10)


In [None]:
# Responses with no constraint phrases and at most one agency phrase;
# show the first ten together with their prompts.
df.loc[
    df["constraint_score"].eq(0) & df["agency_score"].le(1),
    [
        "model_variant", "sentiment", "agency_score",
        "constraint_score", "prompt", "response",
    ],
].head(10)


# Model Contrast Under the Same Prompt

When constraint framing drops out, models diverge sharply in tone and narrative behavior—even under the same prompt.

This demonstrates that model alignment, not prompt structure, drives behavior.

In [None]:
# Rows whose prompt mentions coping, stress, pressure or self-criticism
# (case-insensitive); rows with a missing prompt are excluded.
psych = df.loc[
    df["prompt"].str.contains(
        "coping|stress|pressure|self-crit", case=False, na=False
    )
]

# The three responses with the lowest sentiment in that subset.
psych.sort_values(by="sentiment").loc[
    :,
    ["model_variant", "sentiment", "agency_score", "constraint_score", "prompt", "response"],
].head(3)


In [None]:
# The three responses with the highest sentiment in the `psych` subset.
psych.sort_values(by="sentiment", ascending=False).loc[
    :,
    ["model_variant", "sentiment", "agency_score", "constraint_score", "prompt", "response"],
].head(3)
