# Data Preparation for Idiom-Aware Translation

This notebook processes the raw Excel dataset containing English-Sinhala idiom pairs and prepares it for model training.

## Steps:
1. Load the Excel dataset
2. Validate data quality
3. Split into train/test sets (first 50 rows for testing)
4. Tag idioms with `<IDIOM>` markers
5. Export to JSON format
6. Display statistics and visualizations

In [None]:
import json
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Add the parent directory to the import path so the `src` package can be imported
sys.path.append('..')

from src.data_processor import (
    load_excel,
    validate_data,
    split_data,
    convert_to_json_format,
    export_to_json,
    process_dataset
)

# Notebook-wide presentation settings (the two calls are independent of each other)
sns.set_style('whitegrid')                   # seaborn grid style for the figures below
pd.set_option('display.max_colwidth', 100)   # allow up to 100 characters per cell when rendering frames

# Confirmation that the import/configuration cell ran to the end
print("✓ Imports successful")

## 1. Load and Explore Raw Data

In [None]:
# Read the raw idiom spreadsheet through the project loader.
# `excel_path` and `df` are reused by later cells, so their names are kept.
excel_path = '../data/raw/idiom_dataset.xlsx'
df = load_excel(excel_path)

# Report the frame's dimensions and column names before previewing it
# (presumably `df` is a pandas DataFrame - confirm against load_excel)
frame_shape = df.shape
column_names = df.columns.tolist()
print(f"\nDataset shape: {frame_shape}")
print(f"Columns: {column_names}")

# Trailing expression: rendered as the cell's rich output
df.head()

## 2. Data Validation

In [None]:
# Run the project's validation helper on the loaded frame.
# The report below reads three keys from its result:
# 'total_rows', 'missing_values' and 'evaluation_counts'.
stats = validate_data(df)

print("\n=== Data Quality Report ===")
print(f"Total rows: {stats['total_rows']}")

# List only the columns that actually have gaps
print("\nMissing values per column:")
for column_name, n_missing in stats['missing_values'].items():
    if not n_missing > 0:
        continue
    print(f"  {column_name}: {n_missing}")

# The evaluation breakdown is optional: skipped when the mapping is empty/falsy
evaluation_counts = stats['evaluation_counts']
if evaluation_counts:
    print("\nEvaluation status:")
    for label, n_rows in evaluation_counts.items():
        print(f"  {label}: {n_rows}")

## 3. Process and Split Dataset

In [None]:
# Destination for the processed files and the size passed as `test_size`
output_dir = '../data/processed'
test_size = 50

# Hand the whole job to the project pipeline; `excel_path` comes from the load cell
pipeline_kwargs = {
    'excel_path': excel_path,
    'output_dir': output_dir,
    'test_size': test_size,
}
processing_stats = process_dataset(**pipeline_kwargs)

# Summarise what the pipeline reported back
print("\n=== Processing Complete ===")
print(f"Training examples: {processing_stats['train_examples']}")
print(f"Test examples: {processing_stats['test_examples']}")
print("\nOutput files:")
written_files = processing_stats['output_files']
for label, file_path in written_files.items():
    print(f"  {label}: {file_path}")

## 4. Inspect Processed Data

In [None]:
# Load the processed test split (expects ../data/processed/test.json to exist on disk)
with open('../data/processed/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# Show one full record; report an empty split instead of raising IndexError
if test_data:
    print("Sample test example:")
    print(json.dumps(test_data[0], indent=2, ensure_ascii=False))
else:
    print("Test set is empty - no sample to display")

# Verify idiom tagging on the first few examples
preview_chars = 100
print("\n=== Idiom Tagging Examples ===")
for i, example in enumerate(test_data[:3], start=1):
    source_text = example['source_en']
    # Append an ellipsis only when the sentence was actually cut off
    if len(source_text) > preview_chars:
        preview = f"{source_text[:preview_chars]}..."
    else:
        preview = source_text
    print(f"\nExample {i}:")
    print(f"Idiom: {example['idiom_en']}")
    print(f"Source: {preview}")
    print(f"Tagged: {'<IDIOM>' in source_text}")

# The preview covers three rows only; also report how many rows in the split carry the marker
tagged_total = sum('<IDIOM>' in row['source_en'] for row in test_data)
print(f"\nTagged test examples: {tagged_total}/{len(test_data)}")

## 5. Statistics and Visualizations

In [None]:
# Load training data for analysis (`test_data` comes from the inspection cell above)
with open('../data/processed/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Combine both splits once instead of re-concatenating for every statistic
all_examples = train_data + test_data
total_examples = len(all_examples)

# Calculate statistics
total_idioms = len({ex['idiom_en'] for ex in all_examples})
validation_yes = sum(1 for ex in all_examples if ex['evaluation'] == 'Yes')
# Guard the percentage so an empty dataset reports 0.0% instead of raising ZeroDivisionError
validation_rate = validation_yes / total_examples * 100 if total_examples else 0.0

print("=== Dataset Statistics ===")
print(f"Total unique idioms: {total_idioms}")
print(f"Training examples: {len(train_data)}")
print(f"Test examples: {len(test_data)}")
print(f"Validated examples: {validation_yes}")
print(f"Validation rate: {validation_rate:.1f}%")

In [None]:
# Visualize data distribution: split sizes (left) and evaluation status shares (right).
# Reads `train_data` and `test_data` from the earlier cells.
stats_figure_path = '../outputs/data_statistics.png'
# savefig does not create missing directories, so make sure the target folder exists
os.makedirs(os.path.dirname(stats_figure_path), exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Dataset split
split_data_viz = pd.DataFrame({
    'Split': ['Train', 'Test'],
    'Count': [len(train_data), len(test_data)]
})
axes[0].bar(split_data_viz['Split'], split_data_viz['Count'], color=['#3498db', '#e74c3c'])
axes[0].set_title('Train/Test Split', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Number of Examples')
# Write each count just above its bar
for i, v in enumerate(split_data_viz['Count']):
    axes[0].text(i, v + 5, str(v), ha='center', va='bottom', fontweight='bold')

# Evaluation status
eval_counts = pd.Series([ex['evaluation'] for ex in train_data + test_data]).value_counts()
# value_counts orders labels by frequency, so choose colours by label rather than by
# position: 'Yes' is always green, every other status cycles through the fallbacks.
other_colors = ['#e67e22', '#9b59b6', '#95a5a6']
pie_colors = []
non_yes_seen = 0
for status in eval_counts.index:
    if status == 'Yes':
        pie_colors.append('#2ecc71')
    else:
        pie_colors.append(other_colors[non_yes_seen % len(other_colors)])
        non_yes_seen += 1
axes[1].pie(eval_counts.values, labels=eval_counts.index, autopct='%1.1f%%',
            colors=pie_colors, startangle=90)
axes[1].set_title('Evaluation Status', fontsize=12, fontweight='bold')

fig.tight_layout()
fig.savefig(stats_figure_path, dpi=150, bbox_inches='tight')
plt.show()

print("✓ Visualization saved to outputs/data_statistics.png")

In [None]:
# Analyze sentence lengths: whitespace-separated token counts of `source_en` per split.
# Reads `train_data` and `test_data` from the earlier cells.
train_lengths = [len(ex['source_en'].split()) for ex in train_data]
test_lengths = [len(ex['source_en'].split()) for ex in test_data]

lengths_figure_path = '../outputs/sentence_length_distribution.png'
# savefig does not create missing directories, so make sure the target folder exists
os.makedirs(os.path.dirname(lengths_figure_path), exist_ok=True)

# One (title prefix, data, colour) entry per panel so both histograms share the same code
length_panels = [
    ('Training Set', train_lengths, '#3498db'),
    ('Test Set', test_lengths, '#e74c3c'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (split_name, lengths, bar_color) in zip(axes, length_panels):
    ax.hist(lengths, bins=20, alpha=0.7, color=bar_color, edgecolor='black')
    ax.set_title(f'{split_name} - Sentence Length Distribution', fontweight='bold')
    ax.set_xlabel('Number of Words')
    ax.set_ylabel('Frequency')
    # Skip the mean marker for an empty split instead of dividing by zero
    if lengths:
        mean_length = sum(lengths) / len(lengths)
        ax.axvline(mean_length, color='red', linestyle='--',
                   label=f'Mean: {mean_length:.1f}')
        ax.legend()

fig.tight_layout()
fig.savefig(lengths_figure_path, dpi=150, bbox_inches='tight')
plt.show()

print("✓ Visualization saved to outputs/sentence_length_distribution.png")

## 6. Summary

Data preparation completed successfully! The dataset has been:
- ✅ Loaded and validated
- ✅ Split into train and test sets
- ✅ Tagged with `<IDIOM>` markers on its idioms
- ✅ Exported to JSON format
- ✅ Summarized with statistics and visualizations

**Next Step**: Run `02_data_augmentation.ipynb` to create augmented training examples.