# Part B: Sentiment Analysis Prompt Evaluation

## Objective
- Create sentiment analysis prompt (positive/negative/neutral)
- Include confidence score and reasoning
- Test on 10 emails
- Iterate and improve (v1 → v2)

## 1. Setup

In [None]:
import pandas as pd
from groq import Groq
import os
from dotenv import load_dotenv
import json
from datetime import datetime

# Call load_dotenv() first so the environment lookup below happens after it,
# then build the Groq client from the GROQ_API_KEY environment variable.
load_dotenv()
client = Groq(api_key=os.environ.get('GROQ_API_KEY'))

## 2. Load Test Data (10 Emails)

In [None]:
# Read the small dataset from disk
df = pd.read_csv('../data/small_dataset.csv')

# The test set is an independent copy of the first 10 rows
test_emails = df.head(10).copy()

print(f"Test set: {test_emails.shape[0]} emails")
print("\nSample emails:")
# Preview each test email: id, subject, and the first 100 characters of the body
for _, email in test_emails.iterrows():
    print(f"\nEmail {email['email_id']}:")
    print(f"Subject: {email['subject']}")
    print(f"Body: {email['body'][:100]}...")

## 3. Prompt v1: Initial Design

In [None]:
# Baseline prompt template. {subject} and {body} are filled in via str.format;
# the doubled braces render as literal braces around the JSON example.
PROMPT_V1 = """Analyze the sentiment of this customer support email.

Subject: {subject}
Body: {body}

Classify the sentiment as: positive, negative, or neutral.

Provide your response in JSON format:
{{
    "sentiment": "positive/negative/neutral",
    "confidence": 0.0-1.0,
    "reasoning": "brief explanation of why you chose this sentiment"
}}
"""

# Persist the raw (unformatted) template next to the notebook
with open('prompt_v1.txt', 'w') as prompt_file:
    prompt_file.write(PROMPT_V1)

# Echo the heading and the template, one per line
print("Prompt v1:", PROMPT_V1, sep="\n")

## 4. Test Prompt v1

In [None]:
def _error_result(message):
    """Build the sentinel dict returned when no usable sentiment is available."""
    return {"sentiment": "error", "confidence": 0.0, "reasoning": message}


def _extract_json_object(text):
    """Decode the JSON object contained in *text*.

    Tries a strict parse first. If that fails, retries on the span from the
    first '{' to the last '}', which tolerates markdown code fences or prose
    placed around the object.

    Raises:
        ValueError: if no JSON object can be decoded from *text*.
    """
    try:
        parsed = json.loads(text)
    except (TypeError, ValueError):
        if not isinstance(text, str):
            raise ValueError("model returned no text content")
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise ValueError(f"no JSON object found in model response: {text!r}")
        parsed = json.loads(text[start:end + 1])
    if not isinstance(parsed, dict):
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    return parsed


def analyze_sentiment(subject, body, prompt_template, client):
    """
    Analyze sentiment using given prompt template.

    Args:
        subject: Email subject substituted into the template's {subject} slot.
        body: Email body substituted into the template's {body} slot.
        prompt_template: str.format template with {subject} and {body} fields.
        client: Object exposing chat.completions.create(...), as the Groq
            client does.

    Returns:
        A dict that always contains "sentiment" (lower-cased string),
        "confidence" (float) and "reasoning" (string); any extra keys the
        model returned are kept. If the request fails, the response cannot be
        parsed, or it has no "sentiment" field, the dict is
        {"sentiment": "error", "confidence": 0.0, "reasoning": <message>}.
    """
    prompt = prompt_template.format(subject=subject, body=body)

    try:
        response = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=200
        )
        parsed = _extract_json_object(response.choices[0].message.content)
    except Exception as e:
        # Best-effort by design: one failed email must not abort the whole run.
        return _error_result(str(e))

    if "sentiment" not in parsed:
        return _error_result("model response has no 'sentiment' field")

    # Callers average the confidence column, so it must be numeric.
    try:
        confidence = float(parsed.get("confidence", 0.0))
    except (TypeError, ValueError):
        confidence = 0.0

    result = dict(parsed)
    # Normalise casing/whitespace so "Negative" and "negative" compare equal.
    result["sentiment"] = str(parsed["sentiment"]).strip().lower()
    result["confidence"] = confidence
    result["reasoning"] = str(parsed.get("reasoning") or "")
    return result

# Run prompt v1 over every test email, keeping one flat record per email
results_v1 = []

for _, email in test_emails.iterrows():
    outcome = analyze_sentiment(email['subject'], email['body'], PROMPT_V1, client)
    record = {'email_id': email['email_id'], 'subject': email['subject']}
    # Copy the model's fields in a fixed order so the saved columns are stable
    for field in ('sentiment', 'confidence', 'reasoning'):
        record[field] = outcome[field]
    results_v1.append(record)
    print(f"Email {email['email_id']}: {outcome['sentiment']} (confidence: {outcome['confidence']})")

# Keep the results as a DataFrame and write them to disk as a JSON record list
results_v1_df = pd.DataFrame(results_v1)
results_v1_df.to_json('results_v1.json', orient='records', indent=2)

## 5. Manual Evaluation of v1 Results

In [None]:
# Print every v1 prediction in full so it can be reviewed by hand
print("\nPrompt v1 Results:")
print("=" * 80)
labelled_fields = (
    ("Subject", "subject"),
    ("Sentiment", "sentiment"),
    ("Confidence", "confidence"),
    ("Reasoning", "reasoning"),
)
for position, entry in enumerate(results_v1, start=1):
    print(f"\nEmail {position}:")
    for label, key in labelled_fields:
        print(f"{label}: {entry[key]}")
    print("-" * 80)

# TODO: Manually assess each prediction, asking:
# - Is the sentiment correct?
# - Is the confidence appropriate?
# - Is the reasoning sound?
# - What patterns of errors do you see?

## 6. Analyze Failures and Issues

In [None]:
# TODO: Replace the placeholders below with the issues actually found in v1.
# Things worth looking for:
# - Incorrect sentiment classifications
# - Overconfident or underconfident predictions
# - Poor reasoning
# - Inconsistent handling of similar emails
# - Edge cases not handled well

issues_for_v2 = [
    "1. [Issue 1]",
    "2. [Issue 2]",
    "3. [Issue 3]",
]

print("\nIssues to address in v2:")
print("\n".join(issues_for_v2))

## 7. Prompt v2: Improved Design

In [None]:
# TODO: Revise this prompt once the v1 failures have been documented.
# Candidate improvements:
# - More specific instructions
# - Examples (few-shot learning)
# - Clearer definitions of positive/negative/neutral
# - Guidelines for confidence scoring
# - Context about customer support domain

# Second-iteration template. {subject} and {body} are filled in via str.format;
# the doubled braces render as literal braces around the JSON example.
PROMPT_V2 = """You are an expert at analyzing sentiment in customer support emails.

Email to analyze:
Subject: {subject}
Body: {body}

Instructions:
1. Classify the sentiment as:
   - "positive": Customer is happy, grateful, or satisfied
   - "negative": Customer is frustrated, angry, or disappointed
   - "neutral": Informational query or neither clearly positive nor negative

2. Consider:
   - Tone and word choice
   - Urgency markers
   - Emotional indicators
   - Context of the issue

3. Confidence scoring:
   - High (0.8-1.0): Clear sentiment indicators
   - Medium (0.5-0.79): Some ambiguity
   - Low (0.0-0.49): Mixed signals or unclear

Return JSON:
{{
    "sentiment": "positive/negative/neutral",
    "confidence": 0.0-1.0,
    "reasoning": "detailed explanation referencing specific words/phrases"
}}
"""

# Persist the raw (unformatted) template next to the notebook
with open('prompt_v2.txt', 'w') as prompt_file:
    prompt_file.write(PROMPT_V2)

print("Prompt v2 created with improvements.")

## 8. Test Prompt v2

In [None]:
# Re-run the same 10 emails through prompt v2 so the two runs are comparable
results_v2 = []

for _, email in test_emails.iterrows():
    verdict = analyze_sentiment(email['subject'], email['body'], PROMPT_V2, client)
    results_v2.append(dict(
        email_id=email['email_id'],
        subject=email['subject'],
        sentiment=verdict['sentiment'],
        confidence=verdict['confidence'],
        reasoning=verdict['reasoning'],
    ))
    print(f"Email {email['email_id']}: {verdict['sentiment']} (confidence: {verdict['confidence']})")

# Keep the results as a DataFrame and write them to disk as a JSON record list
results_v2_df = pd.DataFrame(results_v2)
results_v2_df.to_json('results_v2.json', orient='records', indent=2)

## 9. Compare v1 vs v2

In [None]:
# Assemble the side-by-side table: identifying columns come from the v1 run,
# followed by each version's sentiment and confidence.
comparison_columns = {
    'email_id': results_v1_df['email_id'],
    'subject': results_v1_df['subject'],
}
for version, frame in (('v1', results_v1_df), ('v2', results_v2_df)):
    for metric in ('sentiment', 'confidence'):
        comparison_columns[f'{version}_{metric}'] = frame[metric]
comparison = pd.DataFrame(comparison_columns)

# Flag the emails whose predicted label differs between the two prompts
comparison['changed'] = comparison['v1_sentiment'].ne(comparison['v2_sentiment'])

print("\nComparison of v1 vs v2:")
print(comparison)

change_count = comparison['changed'].sum()
v1_mean_confidence = comparison['v1_confidence'].mean()
v2_mean_confidence = comparison['v2_confidence'].mean()
print(f"\nNumber of changes: {change_count}")
print(f"\nAverage confidence v1: {v1_mean_confidence:.3f}")
print(f"Average confidence v2: {v2_mean_confidence:.3f}")

# List only the rows whose prediction changed, if there are any
changed_rows = comparison[comparison['changed']]
if not changed_rows.empty:
    print("\nEmails where prediction changed:")
    print(changed_rows)

## 10. Document Improvements

In [None]:
# TODO: Write up the following in evaluation_report.md:
# 1. What failed in v1
# 2. What was improved in v2
# 3. How to evaluate prompts systematically

next_steps = [
    "1. Manually review all results",
    "2. Calculate accuracy (need ground truth labels)",
    "3. Update evaluation_report.md with findings",
    "4. Document systematic evaluation process",
]

print("\nNext steps:")
print(*next_steps, sep="\n")

## 11. Additional Analysis

In [None]:
# Both result sets, labelled, so each report section can loop over them
labelled_results = (("v1", results_v1_df), ("v2", results_v2_df))

# How many emails received each sentiment label, per prompt version
print("\nSentiment Distribution:")
for label, frame in labelled_results:
    print(f"\n{label}:")
    print(frame['sentiment'].value_counts())

# Summary statistics of the confidence scores, per prompt version
print("\nConfidence Stats:")
for label, frame in labelled_results:
    print(f"\n{label}:")
    print(frame['confidence'].describe())