# Multilingual Bias Audit - Model Evaluation
## Notebook 01: Run toxicity detection models on EN/BN pairs

**Author**: Farahnaz Reza  
**Project**: Evaluating language bias in toxicity detection  
**Dataset**: 30 paired prompts (English + Bangla)

---

## Step 1: Mount Drive and Load Dataset

In [None]:
# Connect Google Drive, then read the paired-prompt CSV into `df`.
from google.colab import drive
import pandas as pd

# The CSV lives on Drive, so Drive has to be mounted before it can be read.
drive.mount('/content/drive')

# `df` is the raw dataset that the later cells iterate over.
df = pd.read_csv('/content/drive/MyDrive/multilingual-bias-audit/data/pairs.csv')

# Sanity report: row count, then how many rows fall under each language/category.
print("✓ Dataset loaded successfully!")
print(f"\nTotal samples: {len(df)}")
for heading, column in (("Languages", 'language'), ("Categories", 'category')):
    print(f"{heading}: {df[column].value_counts().to_dict()}")

# Six rows are printed under the "3 pairs" heading.
# NOTE(review): this presumes each pair occupies two adjacent rows — confirm.
print("\nFirst 3 pairs:")
print(df.head(6))

## Step 2: Install Required Libraries

In [None]:
# Install toxicity detection libraries.
# The leading "!" is notebook shell-escape syntax, not Python, so this line
# only works when the cell is executed by a notebook kernel.
!pip install -q transformers detoxify
# NOTE(review): the message below is printed unconditionally; it does not
# check whether the pip command above actually succeeded.
print("✓ Libraries installed successfully!")

## Step 3: Initialize Models

In [None]:
from transformers import pipeline
from detoxify import Detoxify
import warnings

# Silence library warnings so the cell output stays readable.
# NOTE(review): this filter applies to every warning category for the rest of
# the session, not just to model loading.
warnings.filterwarnings('ignore')

print("Initializing toxicity detection models...")
print("=" * 50)

# Model 1: Unitary Toxic-BERT.
# NOTE(review): the Hugging Face model card describes this checkpoint as
# trained on the English Jigsaw Toxic Comment data, so it is presumably NOT a
# multilingual model — confirm before interpreting its Bangla scores.
# Loading is best-effort: on failure the name is bound to None, which
# classify_with_toxic_bert checks before running inference.
try:
    toxic_classifier = pipeline(
        "text-classification",
        model="unitary/toxic-bert",
        tokenizer="unitary/toxic-bert"
    )
    print("✓ Loaded: unitary/toxic-bert")
except Exception as e:
    print(f"✗ Error loading toxic-bert: {e}")
    toxic_classifier = None

# Model 2: Detoxify, 'multilingual' checkpoint.
# NOTE(review): check whether Bangla is among the languages this checkpoint
# was trained on; the audit's conclusions depend on it.
# Same best-effort contract as above: None means "model unavailable".
try:
    detoxify_model = Detoxify('multilingual')
    print("✓ Loaded: Detoxify (multilingual)")
except Exception as e:
    print(f"✗ Error loading Detoxify: {e}")
    detoxify_model = None

# Report the real outcome instead of always claiming success: with a failed
# load, every prediction from that model will be None in the later cells.
loaded_models = [
    name
    for name, model in (
        ("unitary/toxic-bert", toxic_classifier),
        ("Detoxify (multilingual)", detoxify_model),
    )
    if model is not None
]
if len(loaded_models) == 2:
    print("\n✓ Model initialization complete!")
elif loaded_models:
    print(f"\n⚠️ Model initialization finished with errors; loaded only: {', '.join(loaded_models)}")
else:
    print("\n✗ Model initialization failed: no models were loaded.")

## Step 4: Run Inference on All Texts

In [None]:
def classify_with_toxic_bert(text, threshold=0.5):
    """Classify ``text`` with the module-level ``toxic_classifier`` pipeline.

    The pipeline's top result is reduced to a binary label using the same
    'toxic' / 'non-toxic' vocabulary as ``classify_with_detoxify``, so the two
    models' label columns can be compared with each other.

    NOTE(review): per the unitary/toxic-bert model card the raw pipeline label
    is a category name (e.g. 'toxic', 'obscene', 'insult') and the top entry is
    returned even when its score is close to zero — confirm against the
    installed transformers version. That is why the score, not the raw label,
    decides the returned label.

    Args:
        text: The text to classify.
        threshold: Score strictly above which the text is labelled 'toxic'.

    Returns:
        ``(label, score)`` where ``label`` is 'toxic' or 'non-toxic' and
        ``score`` is the top category's score, or ``(None, None)`` when the
        model is unavailable or inference fails.
    """
    if toxic_classifier is None:
        return None, None

    try:
        top_result = toxic_classifier(text)[0]
        score = float(top_result['score'])
    except Exception as e:
        # str() keeps the handler itself from raising when text is not a
        # string (for example a NaN produced by an empty CSV cell).
        print(f"Error classifying: {str(text)[:50]}... | {e}")
        return None, None

    label = 'toxic' if score > threshold else 'non-toxic'
    return label, score

def classify_with_detoxify(text, threshold=0.5):
    """Classify ``text`` with the module-level ``detoxify_model``.

    Args:
        text: The text to classify.
        threshold: 'toxicity' score strictly above which the text is
            labelled 'toxic'.

    Returns:
        ``(label, toxicity_score)`` where ``label`` is 'toxic' or 'non-toxic',
        or ``(None, None)`` when the model is unavailable or inference fails.
    """
    if detoxify_model is None:
        return None, None

    try:
        scores = detoxify_model.predict(text)
        toxicity_score = scores['toxicity']
    except Exception as e:
        # str() keeps the handler itself from raising when text is not a
        # string (for example a NaN produced by an empty CSV cell).
        print(f"Error classifying: {str(text)[:50]}... | {e}")
        return None, None

    label = 'toxic' if toxicity_score > threshold else 'non-toxic'
    return label, toxicity_score

# Score every dataset row with both models and collect one record per row.
print("Running inference on all texts...")
print("=" * 50)

results = []
total_rows = len(df)

for idx, row in df.iterrows():
    # Source columns are copied through unchanged so each prediction stays
    # attached to its pair, language, category and expected label.
    record = {
        'pair_id': row['pair_id'],
        'language': row['language'],
        'text': row['text'],
        'category': row['category'],
        'expected_label': row['expected_label'],
    }

    # Model 1 (Toxic-BERT), then Model 2 (Detoxify); either may yield (None, None).
    record['toxic_bert_label'], record['toxic_bert_score'] = classify_with_toxic_bert(record['text'])
    record['detoxify_label'], record['detoxify_score'] = classify_with_detoxify(record['text'])

    results.append(record)

    # Progress line; idx is the DataFrame index label of the current row.
    print(f"Processed {idx+1}/{total_rows}: pair {record['pair_id']} ({record['language']})")

# One row per input text, with the prediction columns added.
results_df = pd.DataFrame(results)

print("\n✓ Inference complete!")
print("\nResults preview:")
preview_columns = ['pair_id', 'language', 'category', 'detoxify_label', 'detoxify_score']
print(results_df[preview_columns].head(10))

## Step 5: Quick Analysis - Language Bias Detection

In [None]:
# Section banner for the bias report printed by this cell.
section_rule = "=" * 50
for banner_line in (section_rule, "BIAS ANALYSIS", section_rule):
    print(banner_line)

def analyze_language_bias(results_df, model_name, label_col, score_col):
    """Print and return per-language accuracy and EN/BN label agreement.

    Accuracy is computed on a binary toxic / not-toxic basis: a row counts as
    correct when a prediction exists and "prediction is 'toxic'" matches
    "expected_label is 'toxic'". Comparing the raw strings would mark every
    benign row wrong, because the expected labels use 'neutral' while the
    predictions use 'non-toxic'.

    Args:
        results_df: DataFrame with 'pair_id', 'language', 'expected_label'
            and the prediction column named by ``label_col``.
        model_name: Display name printed in the section header.
        label_col: Name of the column holding the model's predicted labels.
        score_col: Name of the model's score column. Currently unused; kept
            so existing calls keep working.

    Returns:
        dict with keys 'accuracy' (language code -> accuracy, only for
        languages that have at least one prediction), 'agreements',
        'total_pairs' and 'agreement_rate'.
    """
    print(f"\n--- {model_name} ---")

    has_labels = label_col in results_df.columns
    accuracy_by_language = {}

    # Overall accuracy by language. Rows whose prediction is missing stay in
    # the denominator and are counted as incorrect.
    for lang in ['en', 'bn']:
        lang_df = results_df[results_df['language'] == lang]
        if not has_labels or not lang_df[label_col].notna().any():
            continue
        predicted_toxic = lang_df[label_col] == 'toxic'
        expected_toxic = lang_df['expected_label'] == 'toxic'
        is_correct = lang_df[label_col].notna() & (predicted_toxic == expected_toxic)
        correct = int(is_correct.sum())
        total = len(lang_df)
        accuracy = correct / total if total > 0 else 0
        accuracy_by_language[lang] = accuracy
        print(f"{lang.upper()} accuracy: {accuracy:.2%} ({correct}/{total})")

    # Label agreement between EN and BN for each pair. Only pairs with exactly
    # two rows are compared; every distinct pair_id stays in the denominator.
    agreements = 0
    total_pairs = results_df['pair_id'].nunique()

    if has_labels:
        # groupby visits each pair once instead of rescanning the frame per pair.
        for _, pair_df in results_df.groupby('pair_id', sort=False):
            if len(pair_df) != 2:
                continue
            # dropna: a failed prediction (None) on both sides is not agreement.
            en_labels = pair_df.loc[pair_df['language'] == 'en', label_col].dropna()
            bn_labels = pair_df.loc[pair_df['language'] == 'bn', label_col].dropna()
            if en_labels.empty or bn_labels.empty:
                continue
            if en_labels.iloc[0] == bn_labels.iloc[0]:
                agreements += 1

    agreement_rate = agreements / total_pairs if total_pairs > 0 else 0
    print(f"Label agreement (EN/BN pairs): {agreement_rate:.2%} ({agreements}/{total_pairs})")

    return {
        'accuracy': accuracy_by_language,
        'agreements': agreements,
        'total_pairs': total_pairs,
        'agreement_rate': agreement_rate,
    }

# Produce the same report for each model whose label column exists in results_df.
for model_name, label_column, score_column in (
    ("Toxic-BERT", 'toxic_bert_label', 'toxic_bert_score'),
    ("Detoxify", 'detoxify_label', 'detoxify_score'),
):
    if label_column in results_df.columns:
        analyze_language_bias(results_df, model_name, label_column, score_column)

## Step 6: Identify Harmful Error Patterns

In [None]:
print("\n" + "="*50)
print("CONCERNING CASES (Harmful Errors)")
print("="*50)

# One dict per pair where Detoxify got the English text right but the Bangla
# text wrong; the save cell reads this list when writing the summary file.
concerning_cases = []

# Using Detoxify results
has_detoxify = 'detoxify_label' in results_df.columns

# groupby visits each pair once (in first-appearance order) instead of
# rescanning the whole frame for every pair_id.
for pair_id, pair_df in results_df.groupby('pair_id', sort=False):
    en_rows = pair_df[pair_df['language'] == 'en']
    bn_rows = pair_df[pair_df['language'] == 'bn']

    # A two-row pair is not guaranteed to hold one row per language (e.g. a
    # duplicated EN row); skip such pairs rather than fail on .iloc[0].
    if not has_detoxify or len(pair_df) != 2 or en_rows.empty or bn_rows.empty:
        continue

    en_row = en_rows.iloc[0]
    bn_row = bn_rows.iloc[0]
    en_pred = en_row['detoxify_label']
    bn_pred = bn_row['detoxify_label']
    # The expected label is read from the EN row only.
    expected = en_row['expected_label']

    if expected == 'toxic' and en_pred == 'toxic' and bn_pred == 'non-toxic':
        # Pattern 1: Toxic content marked safe in BN (false negative)
        pattern = 'False Negative in BN'
        description = 'Toxic content detected in EN but missed in BN'
        harm = 'Users unprotected from harassment'
    elif expected == 'neutral' and en_pred == 'non-toxic' and bn_pred == 'toxic':
        # Pattern 2: Neutral content flagged in BN (false positive)
        pattern = 'False Positive in BN'
        description = 'Neutral content flagged as toxic only in BN'
        harm = 'Legitimate speech censored'
    else:
        continue

    concerning_cases.append({
        'pair_id': pair_id,
        'pattern': pattern,
        'description': description,
        # str() so a non-string text cell (e.g. NaN) cannot break the slice.
        'en_text': str(en_row['text'])[:60],
        'bn_text': str(bn_row['text'])[:60],
        'harm': harm
    })

if concerning_cases:
    print(f"\n⚠️ Found {len(concerning_cases)} concerning patterns:\n")
    for case in concerning_cases:
        print(f"Pair {case['pair_id']}: {case['pattern']}")
        print(f"  Impact: {case['harm']}")
        print(f"  EN: {case['en_text']}...")
        print(f"  BN: {case['bn_text']}...\n")
else:
    print("\n✓ No major concerning patterns detected in this batch.")
    print("(Expand dataset to 30 pairs for fuller analysis)")

## Step 7: Save Results to Drive

In [None]:
# Write the predictions table and a plain-text summary to the Drive results folder.
import os

# The trailing slash lets file names be appended directly to this path.
output_path = '/content/drive/MyDrive/multilingual-bias-audit/results/'

# Make sure the folder exists; exist_ok avoids an error on re-runs.
os.makedirs(output_path, exist_ok=True)

# Full per-text model outputs.
predictions_file = output_path + 'predictions.csv'
results_df.to_csv(predictions_file, index=False)
print(f"✓ Results saved to: {output_path}predictions.csv")

# Assemble the summary text first, then write it in one call.
summary_lines = [
    "MULTILINGUAL BIAS AUDIT - SUMMARY\n",
    "="*50 + "\n\n",
    f"Total pairs analyzed: {results_df['pair_id'].nunique()}\n",
    f"Total samples: {len(results_df)}\n\n",
]

# The per-case section is written only when at least one case was recorded.
if concerning_cases:
    summary_lines.append(f"Concerning patterns found: {len(concerning_cases)}\n\n")
    for case in concerning_cases:
        summary_lines.extend([
            f"Pair {case['pair_id']}: {case['pattern']}\n",
            f"  {case['description']}\n",
            f"  Impact: {case['harm']}\n\n",
        ])

# utf-8 is set explicitly for the summary file.
summary_file = output_path + 'summary.txt'
with open(summary_file, 'w', encoding='utf-8') as f:
    f.writelines(summary_lines)

print(f"✓ Summary saved to: {output_path}summary.txt")

# Closing banner and suggested follow-up work.
closing_rule = "="*50
print("\n" + closing_rule)
print("ANALYSIS COMPLETE!")
print(closing_rule)
print("\nNext steps:")
for next_step in (
    "1. Review predictions.csv for detailed model outputs",
    "2. Expand dataset to 30 pairs for comprehensive analysis",
    "3. Document key findings in your GitHub repository",
    "4. Create visualizations of bias patterns",
):
    print(next_step)