# Week 12 Lab: Ethics & Fairness in NLP

## Learning Objectives
- Understand and measure bias in NLP models
- Implement fairness metrics
- Explore toxicity detection
- Practice responsible AI deployment

## Prerequisites
```bash
pip install transformers torch numpy matplotlib
```

In [None]:
import re
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline

# Setup: show the lab banner, then prefer a CUDA device when one is available
print('Week 12: Ethics & Fairness in NLP', '=' * 50, sep='\n')
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

## Part 1: Understanding Bias in Language Models

Language models can encode societal biases present in their training data.

In [None]:
# Checkpoint shared by every cell below: tokenizer + masked-LM head
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# eval() switches the module to inference mode and returns the module itself
model = AutoModelForMaskedLM.from_pretrained(model_name).eval()

# Fill-mask pipeline that returns the five most likely tokens per [MASK]
fill_mask = pipeline(
    task='fill-mask',
    model=model,
    tokenizer=tokenizer,
    top_k=5,
)

print(f"Model: {model_name}")
print(f"Vocabulary size: {tokenizer.vocab_size:,}")

In [None]:
# Test for gender bias in occupations
def test_occupation_bias(occupation):
    """Estimate how strongly the masked LM links an occupation to each gender.

    Two templates with a single [MASK] slot are filled by the module-level
    ``fill_mask`` pipeline. The scores of predictions that are male pronouns
    and of predictions that are female pronouns are summed separately, and
    the two sums are normalised to add up to 1 when at least one gendered
    pronoun was predicted.

    Args:
        occupation: Occupation noun inserted into the templates.

    Returns:
        Dict with keys ``'male'`` and ``'female'``. Both values stay 0 when
        no gendered pronoun appears among the returned predictions.
    """
    male_words = {'he', 'his', 'him'}
    female_words = {'she', 'her', 'hers'}
    templates = [
        f"The {occupation} said [MASK] would finish the work.",
        f"The {occupation} finished [MASK] work.",
    ]

    male_mass = 0
    female_mass = 0
    for template in templates:
        for pred in fill_mask(template):
            token = pred['token_str'].lower()
            if token in male_words:
                male_mass += pred['score']
            elif token in female_words:
                female_mass += pred['score']

    # Only normalise when some gendered mass was observed (avoids 0/0)
    total = male_mass + female_mass
    if total > 0:
        male_mass /= total
        female_mass /= total

    return {'male': male_mass, 'female': female_mass}

# Occupations to probe with the masked-LM templates
occupations = [
    'doctor',
    'nurse',
    'engineer',
    'teacher',
    'CEO',
    'secretary',
    'scientist',
    'receptionist',
]

print("Gender Bias in Occupation Associations:", "-" * 50, sep="\n")
# Keep each per-occupation result: the plotting cell reads bias_results
bias_results = {}
for occ in occupations:
    bias_results[occ] = result = test_occupation_bias(occ)
    print(f"{occ:15} Male: {result['male']:.1%}  Female: {result['female']:.1%}")

In [None]:
# Visualize occupation bias
# Grouped bar chart: one male/female bar pair per occupation, read from the
# bias_results dict filled in by the previous cell.
fig, ax = plt.subplots(figsize=(12, 5))

# One x position per occupation; scores are listed in the same order
x = np.arange(len(occupations))
male_scores = [bias_results[occ]['male'] for occ in occupations]
female_scores = [bias_results[occ]['female'] for occ in occupations]

# Shift each bar by half its width so the pair straddles the tick position
width = 0.35
bars1 = ax.bar(x - width/2, male_scores, width, label='Male', color='#3333B2')
bars2 = ax.bar(x + width/2, female_scores, width, label='Female', color='#FF7F0E')

# Reference line at 0.5: the value both scores take when they are equal
ax.axhline(0.5, color='gray', linestyle='--', alpha=0.5, label='Equal (50%)')
ax.set_xlabel('Occupation', fontsize=12)
ax.set_ylabel('Gender Association Score', fontsize=12)
ax.set_title('Gender Bias in BERT Occupation Associations', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(occupations, rotation=45, ha='right')
ax.legend()
# Scores are normalised shares, so the axis is fixed to [0, 1]
ax.set_ylim(0, 1)

plt.tight_layout()
plt.show()

## Part 2: Measuring Bias with WEAT

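The Word Embedding Association Test (WEAT) quantifies bias in word embeddings by comparing how strongly two sets of target words are associated with two sets of attribute words.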

In [None]:
def get_word_embedding(word, tokenizer, model):
    """Return a fixed-size vector for ``word`` from the model's encoder.

    The word is tokenized on its own, passed through the encoder that sits
    underneath the task head, and the last-layer hidden state at sequence
    position 0 is returned (the [CLS] position for BERT-style tokenizers).

    Args:
        word: Text to embed.
        tokenizer: Hugging Face tokenizer matching ``model``.
        model: Hugging Face model whose ``base_model`` is the bare encoder
            (for ``BertForMaskedLM`` this is the same module as ``model.bert``).

    Returns:
        1-D NumPy array holding the hidden state at position 0.
    """
    # base_model resolves the encoder for any architecture, not only BERT
    encoder = model.base_model
    # Inputs must live on the same device as the model weights
    inputs = tokenizer(word, return_tensors='pt').to(model.device)
    with torch.no_grad():
        outputs = encoder(**inputs)
    # Use [CLS] token embedding; .cpu() is required before .numpy() when the
    # model runs on a GPU
    return outputs.last_hidden_state[0, 0].detach().cpu().numpy()

def cosine_similarity(a, b):
    """Compute the cosine similarity between two vectors.

    Args:
        a: 1-D array-like of numbers.
        b: 1-D array-like of numbers with the same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``, or ``0.0`` when either vector has
        zero norm (the similarity is undefined there; returning 0.0 avoids
        propagating NaN into downstream means).
    """
    a = np.asarray(a)
    b = np.asarray(b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

def weat_score(target1, target2, attribute1, attribute2, tokenizer, model):
    """
    Simplified WEAT statistic: difference of mean associations.

    For each target word, its association is the mean cosine similarity to
    the attribute1 words minus the mean cosine similarity to the attribute2
    words. The score is the mean association of target1 minus the mean
    association of target2 (no normalisation by a standard deviation).

    Positive score = target1 leans towards attribute1 more than target2 does
    Negative score = target1 leans towards attribute2 more than target2 does
    """
    def embed_all(words):
        """Embed every word in the list with the given tokenizer/model."""
        return [get_word_embedding(w, tokenizer, model) for w in words]

    first_targets = embed_all(target1)
    second_targets = embed_all(target2)
    first_attrs = embed_all(attribute1)
    second_attrs = embed_all(attribute2)

    def mean_similarity(vector, others):
        """Average cosine similarity between one vector and a list of vectors."""
        return np.mean([cosine_similarity(vector, other) for other in others])

    def set_association(target_vectors):
        """Mean over targets of (similarity to attr1 - similarity to attr2)."""
        return np.mean([
            mean_similarity(vec, first_attrs) - mean_similarity(vec, second_attrs)
            for vec in target_vectors
        ])

    return set_association(first_targets) - set_association(second_targets)

# Target sets (names) and attribute sets (career vs. family vocabulary)
male_names = ['john', 'paul', 'mike', 'kevin', 'steve']
female_names = ['mary', 'susan', 'lisa', 'sarah', 'emma']
career_words = ['career', 'professional', 'office', 'business', 'salary']
family_words = ['family', 'home', 'children', 'parents', 'marriage']

score = weat_score(
    target1=male_names,
    target2=female_names,
    attribute1=career_words,
    attribute2=family_words,
    tokenizer=tokenizer,
    model=model,
)
print(f"WEAT Score (Gender-Career): {score:.4f}")
# Only a strictly positive score is read as a male/career association
if score > 0:
    interpretation = 'Male names more associated with career'
else:
    interpretation = 'Female names more associated with career'
print(f"Interpretation: {interpretation}")

## Part 3: Fairness Metrics

Let's implement common fairness metrics for classification models.

In [None]:
def demographic_parity(predictions, sensitive_attribute):
    """
    Measure demographic parity (equal positive rate across groups).

    Perfect fairness = ratio of 1.0

    Args:
        predictions: Array-like of binary (0/1) predictions.
        sensitive_attribute: Array-like of group labels, aligned with
            ``predictions``.

    Returns:
        Tuple ``(ratio, positive_rates)`` where ``positive_rates`` maps each
        group label to its mean prediction and ``ratio`` is the smallest
        rate divided by the largest (1.0 when every rate is 0).

    Raises:
        ValueError: If the inputs are empty (there is no group to compare).
    """
    # Boolean-mask indexing below needs ndarrays; coerce so lists work too
    predictions = np.asarray(predictions)
    sensitive_attribute = np.asarray(sensitive_attribute)

    positive_rates = {
        group: np.mean(predictions[sensitive_attribute == group])
        for group in np.unique(sensitive_attribute)
    }

    # Return ratio of min to max positive rate
    lowest = min(positive_rates.values())
    highest = max(positive_rates.values())
    ratio = lowest / highest if highest > 0 else 1.0
    return ratio, positive_rates

def equalized_odds(predictions, labels, sensitive_attribute):
    """
    Measure equalized odds (equal TPR and FPR across groups).

    Args:
        predictions: Array-like of binary (0/1) predictions.
        labels: Array-like of binary (0/1) ground-truth labels.
        sensitive_attribute: Array-like of group labels, aligned with the
            other two inputs.

    Returns:
        Tuple ``(tpr, fpr)`` of dicts keyed by group label. A rate is the
        mean prediction over the group's positive (TPR) or negative (FPR)
        examples, and is reported as 0 when the group has no such examples.
    """
    # Boolean-mask indexing below needs ndarrays; coerce so lists work too
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    sensitive_attribute = np.asarray(sensitive_attribute)

    tpr = {}
    fpr = {}

    for group in np.unique(sensitive_attribute):
        mask = sensitive_attribute == group
        group_pred = predictions[mask]
        group_label = labels[mask]

        # True Positive Rate: share of actual positives predicted positive
        pos_mask = group_label == 1
        tpr[group] = np.mean(group_pred[pos_mask]) if pos_mask.any() else 0

        # False Positive Rate: share of actual negatives predicted positive
        neg_mask = group_label == 0
        fpr[group] = np.mean(group_pred[neg_mask]) if neg_mask.any() else 0

    return tpr, fpr

# Simulate predictions
# Fixed seed: every draw below is reproducible, and the values depend on the
# order in which the np.random calls are made.
np.random.seed(42)
n_samples = 1000

# Generate sensitive attribute (e.g., gender)
sensitive = np.random.choice(['A', 'B'], n_samples)

# Generate true labels
# Labels are drawn independently of both the group and the predictions.
labels = np.random.randint(0, 2, n_samples)

# Generate biased predictions (group A gets higher positive rate)
# Bernoulli draws with p=0.7 for group A and p=0.4 for group B.
predictions = np.zeros(n_samples)
predictions[sensitive == 'A'] = np.random.binomial(1, 0.7, (sensitive == 'A').sum())
predictions[sensitive == 'B'] = np.random.binomial(1, 0.4, (sensitive == 'B').sum())

# Calculate fairness metrics
dp_ratio, positive_rates = demographic_parity(predictions, sensitive)
tpr, fpr = equalized_odds(predictions, labels, sensitive)

print("Fairness Metrics:")
print("-" * 50)
print(f"\nDemographic Parity:")
print(f"  Positive rate (Group A): {positive_rates['A']:.1%}")
print(f"  Positive rate (Group B): {positive_rates['B']:.1%}")
print(f"  DP Ratio: {dp_ratio:.3f} (1.0 = perfect parity)")

print(f"\nEqualized Odds:")
print(f"  TPR (Group A): {tpr['A']:.1%}, TPR (Group B): {tpr['B']:.1%}")
print(f"  FPR (Group A): {fpr['A']:.1%}, FPR (Group B): {fpr['B']:.1%}")

In [None]:
# Visualize fairness metrics
# Three side-by-side bar charts comparing groups A and B: positive
# prediction rate, true positive rate and false positive rate.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Demographic Parity
ax = axes[0]
bars = ax.bar(['Group A', 'Group B'], [positive_rates['A'], positive_rates['B']], 
              color=['#3333B2', '#FF7F0E'])
# Dashed line: mean prediction over all samples, regardless of group
ax.axhline(np.mean(predictions), color='gray', linestyle='--', label='Average')
ax.set_ylabel('Positive Prediction Rate')
ax.set_title('Demographic Parity', fontsize=12, fontweight='bold')
ax.set_ylim(0, 1)
ax.legend()

# TPR Comparison
ax = axes[1]
ax.bar(['Group A', 'Group B'], [tpr['A'], tpr['B']], color=['#3333B2', '#FF7F0E'])
ax.set_ylabel('True Positive Rate')
ax.set_title('TPR by Group', fontsize=12, fontweight='bold')
ax.set_ylim(0, 1)

# FPR Comparison
ax = axes[2]
ax.bar(['Group A', 'Group B'], [fpr['A'], fpr['B']], color=['#3333B2', '#FF7F0E'])
ax.set_ylabel('False Positive Rate')
ax.set_title('FPR by Group', fontsize=12, fontweight='bold')
ax.set_ylim(0, 1)

# Figure-level title is added before tight_layout recomputes the spacing
plt.suptitle('Fairness Metrics Comparison', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## Part 4: Toxicity Detection

Let's explore how to detect and measure toxic content.

In [None]:
# Simple toxicity word list approach (for demonstration)
# In practice, use trained models like Perspective API or Detoxify

# Keyword -> toxicity weight lookup used by simple_toxicity_score
toxic_keywords = {
    'hate': 0.8,
    'stupid': 0.5,
    'idiot': 0.7,
    'terrible': 0.3,
    'awful': 0.3,
    'worst': 0.4,
    'horrible': 0.4,
}

# Runs of word characters; punctuation and whitespace act as separators
_WORD_PATTERN = re.compile(r"\w+")


def simple_toxicity_score(text):
    """Calculate a simple toxicity score based on keyword matching.

    The text is lower-cased and split into words with punctuation removed,
    so "idiot." and "idiot?" both match the keyword "idiot".

    Args:
        text: Input string to score.

    Returns:
        The highest weight in ``toxic_keywords`` among the words of ``text``,
        or 0 when no word is a keyword or the text contains no words.
    """
    words = _WORD_PATTERN.findall(text.lower())
    return max((toxic_keywords.get(word, 0) for word in words), default=0)

# Test sentences
test_sentences = [
    "This is a great product!",
    "I hate this terrible service.",
    "The weather is nice today.",
    "You are such an idiot.",
    "This movie was the worst ever.",
]

print("Toxicity Detection Results:")
print("-" * 60)
for sentence in test_sentences:
    score = simple_toxicity_score(sentence)
    # Bands: HIGH above 0.5, MEDIUM from 0.3 to 0.5 inclusive, LOW below 0.3.
    # The 0.3 boundary is inclusive so a score of exactly 0.3 lands in the
    # same band as in the 'Medium (0.3-0.5)' category of the corpus analysis.
    if score > 0.5:
        level = 'HIGH'
    elif score >= 0.3:
        level = 'MEDIUM'
    else:
        level = 'LOW'
    print(f"[{level:6}] {score:.2f} | {sentence}")

In [None]:
# Analyze toxicity distribution in a corpus
# Ten hand-written sentences, repeated ten times to give 100 items to plot.
sample_corpus = [
    "Great job on the project!",
    "This is absolutely terrible work.",
    "I really appreciate your help.",
    "What a stupid idea.",
    "The presentation was excellent.",
    "I hate having to deal with this.",
    "Thank you for your patience.",
    "This is the worst experience ever.",
    "Looking forward to our next meeting.",
    "How can you be such an idiot?",
] * 10  # Replicate for visualization

# Calculate toxicity scores
toxicity_scores = [simple_toxicity_score(text) for text in sample_corpus]

# Visualize distribution
# Left panel: histogram of scores; right panel: share of each category.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Histogram
ax1.hist(toxicity_scores, bins=20, color='#3333B2', alpha=0.7, edgecolor='black')
# Dashed vertical line marks the mean score; its value is shown in the legend
ax1.axvline(np.mean(toxicity_scores), color='red', linestyle='--', label=f'Mean: {np.mean(toxicity_scores):.2f}')
ax1.set_xlabel('Toxicity Score')
ax1.set_ylabel('Frequency')
ax1.set_title('Toxicity Score Distribution', fontsize=12, fontweight='bold')
ax1.legend()

# Category breakdown
# The three conditions are mutually exclusive and cover every score;
# both 0.3 and 0.5 fall in the Medium bucket.
categories = ['Low (<0.3)', 'Medium (0.3-0.5)', 'High (>0.5)']
counts = [
    sum(1 for s in toxicity_scores if s < 0.3),
    sum(1 for s in toxicity_scores if 0.3 <= s <= 0.5),
    sum(1 for s in toxicity_scores if s > 0.5)
]
colors = ['#2CA02C', '#FF7F0E', '#D62728']
ax2.pie(counts, labels=categories, colors=colors, autopct='%1.1f%%', startangle=90)
ax2.set_title('Toxicity Categories', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

## Part 5: Bias Mitigation Strategies

Let's explore techniques to reduce bias in models.

In [None]:
def _match_case(matched, replacement):
    """Return ``replacement`` cased like ``matched`` (UPPER, Capitalized or as given)."""
    if len(matched) > 1 and matched.isupper():
        return replacement.upper()
    if matched[:1].isupper():
        return replacement.capitalize()
    return replacement


def counterfactual_augmentation(text, word_pairs):
    """
    Create counterfactual examples by swapping identity terms.

    This helps balance training data.

    Each pair is applied on its own and in both directions: one variant
    replaces every occurrence of the first word with the second, another
    replaces every occurrence of the second word with the first. Only whole
    words are replaced (so "he" does not touch "the", and "man" does not
    touch "woman"), matching is case-insensitive, and the replacement takes
    the casing of the word it replaces.

    Args:
        text: Original sentence.
        word_pairs: Iterable of ``(word1, word2)`` tuples to swap.

    Returns:
        List starting with the original ``text`` followed by the distinct
        counterfactual variants, in the order the pairs produced them.
    """
    augmented = [text]

    for word1, word2 in word_pairs:
        for source, target in ((word1, word2), (word2, word1)):
            if not source:
                # An empty pattern would match at every word boundary
                continue
            pattern = re.compile(rf"\b{re.escape(source)}\b", re.IGNORECASE)
            new_text, replaced = pattern.subn(
                lambda match, target=target: _match_case(match.group(0), target),
                text,
            )
            # Keep only real, previously unseen variants
            if replaced and new_text not in augmented:
                augmented.append(new_text)

    return augmented

# Gender swap pairs, built from two aligned tuples (masculine, feminine)
gender_pairs = list(zip(
    ('he', 'him', 'his', 'man', 'boy', 'father'),
    ('she', 'her', 'her', 'woman', 'girl', 'mother'),
))

# Run the augmentation on a single example sentence
test_text = "The man said he would finish his project."
augmented = counterfactual_augmentation(test_text, gender_pairs)

print("Counterfactual Augmentation:", "-" * 50, sep="\n")
print(f"Original: {test_text}")
print(f"Augmented versions:")
for text in augmented:
    # The returned list also contains the unmodified sentence; skip it
    if text == test_text:
        continue
    print(f"  -> {text}")

In [None]:
def threshold_calibration(predictions, sensitive_attribute, target_rate=None):
    """
    Calibrate decision thresholds for each group to achieve demographic parity.

    For every group the threshold is the ``100 * (1 - target_rate)``-th
    percentile of that group's scores, so roughly ``target_rate`` of the
    group ends up above it.

    Args:
        predictions: Array-like of continuous scores.
        sensitive_attribute: Array-like of group labels, aligned with
            ``predictions``.
        target_rate: Desired positive rate in [0, 1] for every group.
            Defaults to the mean of ``predictions``.

    Returns:
        Tuple ``(calibrated, thresholds)``: ``calibrated`` is an array of
        0/1 decisions with the dtype of ``predictions``, and ``thresholds``
        maps each group label to the threshold applied to it.
    """
    # Boolean-mask indexing below needs ndarrays; coerce so lists work too
    predictions = np.asarray(predictions)
    sensitive_attribute = np.asarray(sensitive_attribute)

    if target_rate is None:
        target_rate = np.mean(predictions)

    thresholds = {}
    calibrated = np.zeros_like(predictions)

    for group in np.unique(sensitive_attribute):
        mask = sensitive_attribute == group
        group_preds = predictions[mask]
        # Find threshold that gives target positive rate
        thresholds[group] = np.percentile(group_preds, 100 * (1 - target_rate))
        # Strict '>' means scores tied with the threshold are classed negative
        calibrated[mask] = (group_preds > thresholds[group]).astype(int)

    return calibrated, thresholds

# Generate continuous predictions
# Re-seeding makes this cell reproducible on its own; n_samples and
# sensitive are rebound here and reused by the audit cell further down.
np.random.seed(42)
n_samples = 1000
sensitive = np.random.choice(['A', 'B'], n_samples)

# Biased continuous predictions
# Beta(3, 2) has mean 0.6 and Beta(2, 3) has mean 0.4, so group A's scores
# are drawn from a distribution centred higher than group B's.
predictions_cont = np.zeros(n_samples)
predictions_cont[sensitive == 'A'] = np.random.beta(3, 2, (sensitive == 'A').sum())
predictions_cont[sensitive == 'B'] = np.random.beta(2, 3, (sensitive == 'B').sum())

# Original binary predictions (threshold = 0.5)
original_binary = (predictions_cont > 0.5).astype(int)

# Calibrated predictions
# No target_rate is passed, so the function falls back to the mean score.
calibrated_binary, thresholds = threshold_calibration(predictions_cont, sensitive)

# Compare results
print("Threshold Calibration Results:")
print("-" * 50)
print(f"\nOriginal (threshold=0.5 for all):")
for group in ['A', 'B']:
    rate = np.mean(original_binary[sensitive == group])
    print(f"  Group {group}: {rate:.1%} positive")

print(f"\nCalibrated (group-specific thresholds):")
for group in ['A', 'B']:
    rate = np.mean(calibrated_binary[sensitive == group])
    print(f"  Group {group}: {rate:.1%} positive (threshold={thresholds[group]:.3f})")

## Exercises

1. **Bias Detection**: Test BERT for racial or religious bias using appropriate word sets
2. **Fairness**: Implement additional fairness metrics (predictive parity, calibration)
3. **Mitigation**: Apply debiasing techniques to a sentiment classifier
4. **Audit**: Create a model audit report for a text classification system

In [None]:
# Exercise starter: Model audit template
def model_audit_report(model_name, predictions, labels, sensitive_attr, groups):
    """
    Generate a fairness audit report for a model.

    Args:
        model_name: Name stored in the report under ``'model'``.
        predictions: Array-like of binary (0/1) predictions.
        labels: Array-like of binary (0/1) ground-truth labels.
        sensitive_attr: Array-like of group labels, aligned with the
            predictions and labels.
        groups: Iterable of group labels to report on.

    Returns:
        Dict with keys ``'model'``, ``'total_samples'``,
        ``'overall_accuracy'`` and ``'group_metrics'``. The latter maps each
        requested group to its ``'size'``, ``'accuracy'``,
        ``'positive_rate'`` (mean prediction) and ``'base_rate'`` (mean
        label). Rates are NaN for a group with no samples.
    """
    # Element-wise comparison and mask indexing need ndarrays; coerce so
    # plain lists work too
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    sensitive_attr = np.asarray(sensitive_attr)

    report = {
        'model': model_name,
        'total_samples': len(predictions),
        'overall_accuracy': np.mean(predictions == labels),
        'group_metrics': {}
    }

    for group in groups:
        mask = sensitive_attr == group
        size = int(mask.sum())

        if size == 0:
            # No samples: report NaN explicitly instead of averaging nothing
            accuracy = positive_rate = base_rate = float('nan')
        else:
            group_preds = predictions[mask]
            group_labels = labels[mask]
            accuracy = np.mean(group_preds == group_labels)
            positive_rate = np.mean(group_preds)
            base_rate = np.mean(group_labels)

        report['group_metrics'][group] = {
            'size': size,
            'accuracy': accuracy,
            'positive_rate': positive_rate,
            'base_rate': base_rate,
        }

    return report

# Random labels for the demo; the predictions come from the calibration cell
labels = np.random.randint(0, 2, n_samples)
report = model_audit_report(
    model_name='SampleModel',
    predictions=calibrated_binary,
    labels=labels,
    sensitive_attr=sensitive,
    groups=['A', 'B'],
)

print("Model Audit Report", "=" * 50, sep="\n")
print(f"Model: {report['model']}")
print(f"Total Samples: {report['total_samples']}")
print(f"Overall Accuracy: {report['overall_accuracy']:.1%}")
print(f"\nGroup Metrics:")
for group, metrics in report['group_metrics'].items():
    print(f"  Group {group}:")
    for metric, value in metrics.items():
        # Floats are rates and shown as percentages; counts are shown as-is
        if not isinstance(value, float):
            print(f"    {metric}: {value}")
            continue
        print(f"    {metric}: {value:.1%}")

## Summary

In this lab, we explored:

1. **Bias detection**: Using MLM probing and WEAT to identify biases
2. **Fairness metrics**: Demographic parity and equalized odds
3. **Toxicity detection**: Simple keyword-based approaches
4. **Bias mitigation**: Counterfactual augmentation and threshold calibration
5. **Model auditing**: Creating fairness reports

**Key Takeaways**:
- Language models encode societal biases from training data
- Fairness requires careful metric selection based on context
- Multiple mitigation strategies exist, each with tradeoffs
- Regular auditing is essential for responsible AI deployment