# LLM Value Alignment Assessment: Mistral-7B Profile

**Project Goal:** Empirically validate the consistency of Large Language Models (LLMs) when anchored to psychological value frameworks. This analysis uses the Schwartz Theory of Basic Values to create 10 unique personas (one per basic value) and measures the semantic similarity between each persona's behavior (its response to a dilemma) and the value it was intended to express.

**Model Tested:** Mistral-7B-Instruct-v0.2 (Self-Hosted on GPU via Hugging Face)

**Key Finding:** The LLM consistently prioritized the intended value in all 10 scenarios, providing quantifiable evidence of successful **Value Anchoring** via prompt engineering.

In [None]:
# 1. Setup and Library Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import pi
import os
import sys

# Ensure the project root is in the path for module imports.
# NOTE(review): '__file__' is a quoted string literal here, so dirname() yields ''
# and this resolves to the parent of the current working directory -- presumably
# intentional because notebooks do not define __file__; confirm.
_project_root = os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))
if _project_root not in sys.path:
    # Guard against piling up duplicate entries when the cell is re-run.
    sys.path.append(_project_root)

# --- CONFIGURATION ---
INPUT_SCORES_FILE = "../data/llm_value_scores.csv"  # Relative path from notebooks/ folder
MODEL_NAME = "Mistral-7B-Instruct-v0.2"

# 2. Load the Scored Data
if not os.path.exists(INPUT_SCORES_FILE):
    # Fallback if running from root
    INPUT_SCORES_FILE = "data/llm_value_scores.csv"

try:
    df = pd.read_csv(INPUT_SCORES_FILE)
except FileNotFoundError:
    print("Error: Scores file not found. Ensure Phase 2 script (analyze_results.py) was run.")
    # Re-raise: without `df` every later step would fail with an unrelated
    # NameError, hiding the real cause.
    raise
else:
    print(f"Successfully loaded {len(df)} records for analysis.")

# 3. Prepare Data for Radar Chart
# Each value category becomes one axis of the radar chart; the quantity plotted
# on that axis is the 'max_score' stored for that category.

# Get the value categories (these will be the axes of the radar chart)
categories = df['value_category'].tolist()
N = len(categories)
if N == 0:
    raise ValueError("No rows in the scores data; nothing to plot.")

# For every category, read 'max_score' from the first row of that category.
# NOTE: despite its name, avg_scores holds max scores, not averages; the name is
# kept so that any other cell referring to it keeps working.
avg_scores = [
    df.loc[df['value_category'] == cat, 'max_score'].values[0]
    for cat in categories
]

# Min-max normalize for presentation: the lowest score maps to 0 and the highest
# to 1, so the chart shows relative (not absolute) strength.
max_val = max(avg_scores)
min_val = min(avg_scores)
score_range = max_val - min_val
if score_range == 0:
    # All scores are identical (or there is a single row): the formula below
    # would divide by zero, so every point is drawn at the outer ring instead.
    normalized_scores = [1.0 for _ in avg_scores]
else:
    normalized_scores = [(x - min_val) / score_range for x in avg_scores]

# Add the first score to the end to close the circle on the radar chart
values = normalized_scores + normalized_scores[:1]
angles = [n / float(N) * 2 * pi for n in range(N)]
angles += angles[:1]

# 4. Create the Radar Chart Visualization

fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={'polar': True})

# Orientation: shift the zero angle by a quarter turn and reverse the direction
ax.set_theta_offset(pi / 2)
ax.set_theta_direction(-1)

# One labelled spoke per category (the last angle is skipped when labelling)
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, color='grey', size=12)

# Radial rings: six evenly spaced levels between 0 and 1, labelled as percentages
ring_levels = np.linspace(0, 1, 6)
ring_labels = [f'{i/5:.0%}' for i in range(6)]
ax.set_rlabel_position(0)
ax.set_yticks(ring_levels)
ax.set_yticklabels(ring_labels, color="grey", size=8)
ax.set_ylim(0, 1)  # Normalized scores go from 0 to 1

# Outline of the profile, then a translucent fill in the same colour
profile_color = '#1f77b4'
ax.plot(angles, values, linewidth=2, linestyle='solid', label=MODEL_NAME, color=profile_color)
ax.fill(angles, values, profile_color, alpha=0.25)

# Title, lifted above the axes so it clears the outer labels
ax.set_title(f'Value Profile Consistency for {MODEL_NAME}', size=16, y=1.1)

# Legend: its upper-right corner is anchored at axes coordinates (0.1, 0.1)
ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

# Render the figure
plt.show()