In [3]:
# ─── 1. Install dependencies ───────────────────────────────────────────
!pip install transformers accelerate torch pandas sentence-transformers scipy

# ─── 2. Import libraries ──────────────────────────────────────────────
import re, time
import pandas as pd
import torch
from scipy.stats import ttest_rel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer, util

# ─── 3. Load & Quantize the Mistral-7B Model ──────────────────────────
print("🔄 Loading & quantizing Mistral-7B model…")
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=-1,  # CPU
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
)
print("✅ Mistral-7B model ready.")

# ─── 4. Load SBERT and Prepare Empathy Embedding ─────────────────────
print("🔄 Loading SBERT and preparing empathy prototype…")
sbert = SentenceTransformer("all-MiniLM-L6-v2")
proto_text = "A story that shows deep emotional empathy and human connection."
proto_emb = sbert.encode(proto_text, convert_to_tensor=True)
print("✅ SBERT ready.")

# ─── 5. Load HANNA Dataset ────────────────────────────────────────────
df = pd.read_csv("hanna_stories_annotations.csv")
gpt2_df = df[df["Model"].str.contains("GPT-2")].dropna(subset=["Prompt", "Story"])
sample_size = 50
gpt2_sample = gpt2_df.sample(n=sample_size, random_state=42).reset_index(drop=True)
print(f"✔️ Sampled {sample_size} GPT‑2 prompts from HANNA dataset.")

# ─── 6. Define Helper Functions ───────────────────────────────────────
def generate_story(prompt: str) -> str:
    instr = f"Write a fictional short story (~400 words) based on this prompt:\n\n{prompt}\n\nStory:"
    return generator(instr)[0]["generated_text"].strip()

def compare_with_sbert(story_a: str, story_b: str, threshold: float = 0.02) -> (str, float, float):
    emb_a = sbert.encode(story_a, convert_to_tensor=True)
    emb_b = sbert.encode(story_b, convert_to_tensor=True)
    sim_a = util.cos_sim(emb_a, proto_emb).item()
    sim_b = util.cos_sim(emb_b, proto_emb).item()
    if abs(sim_a - sim_b) < threshold:
        return "Tie", sim_a, sim_b
    return ("A" if sim_a > sim_b else "B"), sim_a, sim_b

# ─── 7. Evaluation Loop + Collect Examples ─────────────────────────────
results = {"A (GPT-2)": 0, "B (Mistral-7B)": 0, "Tie": 0}
similarity_scores = []
examples = []  # To collect prompts + stories + winner

start = time.time()

for idx, row in gpt2_sample.iterrows():
    prompt = row["Prompt"]
    story_a = row["Story"]  # GPT-2
    story_b = generate_story(prompt)  # Mistral-7B
    
    winner, sim_a, sim_b = compare_with_sbert(story_a, story_b)
    similarity_scores.append((sim_a, sim_b))
    
    key = "A (GPT-2)" if winner == "A" else ("B (Mistral-7B)" if winner == "B" else "Tie")
    results[key] += 1

    # Store detailed comparison
    examples.append({
        "Prompt": prompt,
        "GPT-2 Story": story_a,
        "Mistral-7B Story": story_b,
        "Winner": winner,
        "GPT-2 Score": sim_a,
        "Mistral Score": sim_b
    })

    print(f"[{idx+1}/{sample_size}] Winner: {winner}")

print(f"\n⏱️ Total runtime: {(time.time() - start)/60:.1f} minutes")

# ─── 8. Results Table ────────────────────────────────────────────────
res_df = pd.DataFrame.from_dict(results, orient="index", columns=["Count"])
res_df["Win Rate (%)"] = (res_df["Count"] / sample_size * 100).round(1)
res_df = res_df.rename_axis("Result").reset_index()

print("\n📊 Empathy Win Rate:")
print(res_df)

# ─── 9. Paired T-Test on Similarity Scores ───────────────────────────
gpt2_sims = [a for a, b in similarity_scores]
mistral_sims = [b for a, b in similarity_scores]

t_stat, p_value = ttest_rel(mistral_sims, gpt2_sims)

print("\n📈 Paired T-Test Results (Mistral-7B vs GPT-2):")
print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")
if p_value < 0.05:
    print("✅ Statistically significant: Mistral-7B performs significantly better in empathy.")
else:
    print("❌ Not statistically significant: No strong evidence of a difference.")

# ─── 10. Save Examples and Show Top Wins ─────────────────────────────
examples_df = pd.DataFrame(examples)
examples_df.to_csv("story_comparisons.csv", index=False)

# Display 3 examples where Mistral-7B won
print("\n📝 Example cases where Mistral-7B won:")
top_examples = examples_df[examples_df["Winner"] == "B"].head(3)
for _, row in top_examples.iterrows():
    print("\n--- Prompt ---\n", row["Prompt"])
    print("\n--- GPT-2 Story ---\n", row["GPT-2 Story"])
    print("\n--- Mistral-7B Story ---\n", row["Mistral-7B Story"])
    print("\nWinner:", row["Winner"])
    print("-" * 80)


🔄 Loading & quantizing Mistral-7B model…


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cpu


✅ Mistral-7B model ready.
🔄 Loading SBERT and preparing empathy prototype…
✅ SBERT ready.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


✔️ Sampled 50 GPT‑2 prompts from HANNA dataset.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1/50] Winner: Tie


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[2/50] Winner: A


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[3/50] Winner: Tie


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[4/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[5/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[6/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[7/50] Winner: A


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[8/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[9/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[10/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[11/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[12/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[13/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[14/50] Winner: A


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[15/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[16/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[17/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[18/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[19/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[20/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[21/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[22/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[23/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[24/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[25/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[26/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[27/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[28/50] Winner: Tie


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[29/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[30/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[31/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[32/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[33/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[34/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[35/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[36/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[37/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[38/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[39/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[40/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[41/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[42/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[43/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[44/50] Winner: Tie


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[45/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[46/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[47/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[48/50] Winner: B


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[49/50] Winner: B
[50/50] Winner: B

⏱️ Total runtime: 37.9 minutes

📊 Empathy Win Rate:
           Result  Count  Win Rate (%)
0       A (GPT-2)      3           6.0
1  B (Mistral-7B)     43          86.0
2             Tie      4           8.0

📈 Paired T-Test Results (Mistral-7B vs GPT-2):
T-statistic: 8.3648
P-value: 0.0000
✅ Statistically significant: Mistral-7B performs significantly better in empathy.

📝 Example cases where Mistral-7B won:

--- Prompt ---
 A girl meets a boy that changes her life forever. Her first kill.

--- GPT-2 Story ---
 She crosses a red light before the iron pommel of his bayonet hits her temple. It sends a cold chill down her spine as she shakes from the pain. Her tears stream down her face as she slowly moves to stand upright. She has practiced. A naked boy with a canvas covered face and a gun rests on his right hand. A neat and comfortable man in a business suit covers his face. One second she's seeing him looking past her. The next, she sees him quietl