# Inference and Testing

This notebook tests the fine-tuned NLLB model on the 50 test examples.

## Testing Pipeline
1. Load the fine-tuned model
2. Load the test dataset
3. Generate translations for all test examples
4. Display side-by-side comparisons
5. Build a results table with a quick idiom-accuracy check
6. Save predictions for evaluation
7. Review sample successful and unsuccessful translations

In [None]:
# Add the parent directory to the import path so the `src` package below can be
# imported (assumes the notebook is run from a subfolder of the project root — TODO confirm).
import sys
sys.path.append('..')

# Project-local modules.
from src.inference import (
    load_trained_model,
    translate,
    batch_translate,
    translate_with_idiom
)
from src.trainer import load_config
# Standard-library and third-party dependencies.
import json
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm

print("✓ Imports successful")

## 1. Load Configuration and Model

In [None]:
# Read the training configuration; it supplies every path and model name used below.
config = load_config('../config/training_config.yaml')

# Location of the fine-tuned checkpoint, as recorded in the config.
model_path = config['paths']['final_model']
print(f"Loading model from: {model_path}")

# Restore the fine-tuned model and its tokenizer from the checkpoint.
base_model_name = config['model']['base_model']
model, tokenizer = load_trained_model(checkpoint_path=model_path, base_model=base_model_name)

print("\n✓ Model loaded successfully")

## 2. Load Test Dataset

In [None]:
# Load the test set from the JSON file named in the configuration.
test_path = config['data']['test_json']

with open(test_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

print(f"Loaded {len(test_data)} test examples")

# Fail fast with a clear message on an empty file: indexing `test_data[0]`
# below would otherwise raise a bare IndexError with no hint about the cause.
if not test_data:
    raise ValueError(f"No test examples found in {test_path}")

print("\nSample test example:")
# ensure_ascii=False keeps non-ASCII text readable instead of \u escapes.
print(json.dumps(test_data[0], indent=2, ensure_ascii=False))

## 3. Generate Translations

In [None]:
# Number of sentences handed to the model per batch.
BATCH_SIZE = 8

# Pull the English source sentence out of every test example.
source_texts = list(item['source_en'] for item in test_data)

print("Generating translations...")

# Language codes and length limit come from the training configuration.
translation_kwargs = dict(
    src_lang=config['model']['source_lang'],
    tgt_lang=config['model']['target_lang'],
    max_length=config['training']['max_length'],
    batch_size=BATCH_SIZE,
)
predictions = batch_translate(
    texts=source_texts,
    model=model,
    tokenizer=tokenizer,
    **translation_kwargs
)

print(f"\n✓ Generated {len(predictions)} translations")

## 4. Display Side-by-Side Comparisons

In [None]:
# Print the first few test examples next to the model's output.
PREVIEW_COUNT = 10
banner = "=" * 80

print(banner)
print("TRANSLATION COMPARISONS")
print(banner)

for number, example in enumerate(test_data[:PREVIEW_COUNT], start=1):
    # Predictions are positionally aligned with the test examples.
    prediction = predictions[number - 1]

    print(f"\n--- Example {number} ---")
    print(f"English Idiom: {example['idiom_en']}")
    print(f"Sinhala Idiom: {example['idiom_si']}")
    print(f"\nSource (EN): {example['source_en']}")
    print(f"\nExpected (SI): {example['target_si']}")
    print(f"\nPredicted (SI): {prediction}")

    # Plain substring test: does the expected Sinhala idiom appear verbatim?
    if example['idiom_si'] in prediction:
        verdict = '✅ Yes'
    else:
        verdict = '❌ No'
    print(f"\nIdiom Correct: {verdict}")
    print("-" * 80)

## 5. Create Results DataFrame

In [None]:
# Collect sources, references, predictions and the idiom check in one table.
results_df = pd.DataFrame({
    'Idiom_EN': [ex['idiom_en'] for ex in test_data],
    'Idiom_SI': [ex['idiom_si'] for ex in test_data],
    'Source': [ex['source_en'] for ex in test_data],
    'Reference': [ex['target_si'] for ex in test_data],
    'Prediction': predictions,
    # True when the expected Sinhala idiom appears verbatim in the prediction.
    'Idiom_Found': [ex['idiom_si'] in pred for ex, pred in zip(test_data, predictions)]
})

# Compute the counts once, and guard the percentage so an empty result set
# reports 0.0% instead of dividing by zero.
total_examples = len(results_df)
idioms_found = int(results_df['Idiom_Found'].sum())
idiom_accuracy = idioms_found / total_examples * 100 if total_examples else 0.0

# Display summary
print("\n=== Quick Summary ===")
print(f"Total examples: {total_examples}")
print(f"Idiom found in translation: {idioms_found} / {total_examples}")
print(f"Idiom accuracy: {idiom_accuracy:.1f}%")

# Show first few rows (the bare last expression renders as a rich table).
print("\nFirst 5 results:")
results_df.head()

## 6. Save Predictions

In [None]:
# Build one JSON-serialisable record per (test example, prediction) pair.
results = [
    {
        'example_id': example_id,
        'idiom_en': item['idiom_en'],
        'idiom_si': item['idiom_si'],
        'source_en': item['source_en'],
        'reference_si': item['target_si'],
        'prediction_si': translated,
        'idiom_present': item['idiom_si'] in translated,
    }
    for example_id, (item, translated) in enumerate(zip(test_data, predictions))
]

# Write the records to the predictions file named in the configuration,
# creating its parent directory first if it does not exist yet.
output_path = Path(config['outputs']['predictions'])
output_path.parent.mkdir(parents=True, exist_ok=True)

with output_path.open('w', encoding='utf-8') as out_file:
    json.dump(results, out_file, ensure_ascii=False, indent=2)

print(f"\n✓ Results saved to {output_path}")
print(f"  Total predictions: {len(results)}")

## 7. Sample Translations Analysis

In [None]:
def _preview(text, limit=100):
    """Return `text` cut to `limit` characters, adding '...' only if it was cut."""
    return text if len(text) <= limit else f"{text[:limit]}..."


# Show up to three translations where the idiom was found and three where it was not.
# A boolean mask replaces the `== True` / `== False` comparisons.
found_mask = results_df['Idiom_Found'].astype(bool)
successful = results_df[found_mask].head(3)
unsuccessful = results_df[~found_mask].head(3)

print("=== Successful Translations (Idiom Found) ===")
for _, row in successful.iterrows():
    print(f"\nIdiom: {row['Idiom_EN']} → {row['Idiom_SI']}")
    print(f"Prediction: {_preview(row['Prediction'])}")

print("\n\n=== Unsuccessful Translations (Idiom Not Found) ===")
for _, row in unsuccessful.iterrows():
    print(f"\nIdiom: {row['Idiom_EN']} → {row['Idiom_SI']}")
    print(f"Expected: {_preview(row['Reference'])}")
    print(f"Got: {_preview(row['Prediction'])}")

## Summary

Inference and testing completed!

- ✅ Fine-tuned model loaded
- ✅ 50 test examples translated
- ✅ Predictions saved to `outputs/predictions/test_results.json`
- ✅ Initial quality check performed

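**Quick Results** (printed under "Quick Summary" in section 5 when the notebook is run):
- Total examples: the number of entries in the test set
- Idiom accuracy: the share of predictions that contain the expected Sinhala idiom verbatim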

**Next Step**: Run `05_evaluation.ipynb` for comprehensive evaluation with BLEU scores and detailed metrics.