# Lightweight Amharic NER Training

This notebook is designed for machines with limited resources, or machines that are already under heavy CPU/memory load.
It keeps memory and CPU usage to a minimum to avoid kernel crashes.

In [None]:
# Cell 1: Setup and Memory Management
import os
import sys
import gc
import torch
import warnings
warnings.filterwarnings('ignore')

# Run on CPU only: expose no CUDA devices and keep torch on a single thread.
os.environ['CUDA_VISIBLE_DEVICES'] = ''
torch.set_num_threads(1)

# Put the parent directory on sys.path (once) so that `src.*` can be imported.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

# Run a garbage-collection pass; the CUDA cache is only touched when CUDA is usable.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("✅ Memory management setup complete")

In [None]:
# Cell 2: Import Lightweight Trainer
# The trainer classes come from the project-local `src` package. The cells
# below use both names, so a failed import is reported and then re-raised:
# the run stops here instead of failing later with an unrelated NameError.
try:
    from src.lightweight_trainer import LightweightNERTrainer, LightweightConfig
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("Please ensure all dependencies are installed")
    raise
else:
    print("✅ Lightweight trainer imported successfully")
finally:
    # Clear memory again, whether or not the import succeeded
    gc.collect()

In [None]:
# Cell 3: Check Data Availability
def count_conll_sentences(path):
    """Count the sentences in a CoNLL file without loading it into memory.

    A sentence is a maximal run of non-blank lines. Blank (whitespace-only)
    lines are treated as separators, so consecutive blank lines are not
    counted twice and a final sentence with no trailing blank line is
    still counted.

    Args:
        path: Path of a UTF-8 encoded CoNLL file.

    Returns:
        The number of sentences found (0 for an empty file).
    """
    count = 0
    in_sentence = False
    with open(path, 'r', encoding='utf-8') as f:
        # Iterate the file object directly so only one line is held at a time.
        for line in f:
            if line.strip():
                in_sentence = True
            elif in_sentence:
                count += 1
                in_sentence = False
    # Account for a last sentence that is not followed by a blank line.
    return count + int(in_sentence)


conll_path = '../data/annotated/amharic_ner.conll'
if os.path.exists(conll_path):
    print(f"✅ Found CONLL data: {conll_path}")

    # Count sentences to estimate size
    sentences = count_conll_sentences(conll_path)
    print(f"   Estimated sentences: {sentences}")
else:
    print("❌ CONLL data not found")
    print("Please ensure data/annotated/amharic_ner.conll exists")

In [None]:
# Cell 4: Initialize Lightweight Trainer
print("\n=== Initializing Lightweight Trainer ===\n")

# Minimal-resource settings: a smaller model, a very small batch,
# a single epoch and short sequences.
config = LightweightConfig(
    model_name="distilbert-base-multilingual-cased",
    batch_size=2,
    num_epochs=1,
    max_length=128,
)
trainer = LightweightNERTrainer(config)

# Echo the settings that were just applied, one per line.
print(f"✅ Trainer initialized with model: {config.model_name}")
for caption, attribute in (
    ("Batch size", "batch_size"),
    ("Epochs", "num_epochs"),
    ("Max length", "max_length"),
):
    print(f"   {caption}: {getattr(config, attribute)}")

# Run a garbage-collection pass before moving on.
gc.collect()

In [None]:
# Cell 5: Load Data (Limited Samples)
print("\n=== Loading Data (Limited Samples) ===\n")

# Upper bound passed to the loader as `max_samples`; kept small to save memory.
MAX_SAMPLES = 30

try:
    dataset = trainer.load_conll_data(conll_path, max_samples=MAX_SAMPLES)
    print(f"✅ Data loaded: {len(dataset['train'])} training, {len(dataset['validation'])} validation samples")
except Exception as e:
    print(f"❌ Error loading data: {e}")
    print("This might be due to memory constraints or missing data")
    # Re-raise after the hint: the training cell needs `dataset`, so
    # continuing would only produce a confusing follow-up error.
    raise
finally:
    # Clear memory on both the success and the failure path
    gc.collect()

In [None]:
# Cell 6: Train Model (Lightweight)
print("\n=== Training Model (Lightweight) ===\n")

# Directory passed to trainer.train() as the output location.
output_dir = "../models/lightweight_ner"

try:
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    print("Starting lightweight training...")
    print("This may take a few minutes but uses minimal resources")

    # Train model
    train_result = trainer.train(dataset, output_dir)

    print(f"✅ Training completed! Loss: {train_result.training_loss:.4f}")
except Exception as e:
    print(f"❌ Training error: {e}")
    print("This might be due to insufficient memory or other resource constraints")
    # Re-raise after the hint so a failed training run stops the notebook
    # instead of being followed by prediction and summary output.
    raise
finally:
    # Clear memory on both the success and the failure path
    gc.collect()

In [None]:
# Cell 7: Test Predictions
print("\n=== Testing Model Predictions ===\n")

# Sample product/price snippets used as a smoke test for the trainer.
test_texts = [
    "LCD Writing Tablet ዋጋ 550 ብር",
    "ስልክ ዋጋ 2500 ETB",
    "ኮምፒዩተር ዋጋ 15000 ብር",
]

try:
    for position, text in enumerate(test_texts, start=1):
        predictions = trainer.predict(text)
        print(f"Text {position}: {text}")
        print("Predictions:")
        for token, label in predictions:
            # Skip tokens tagged 'O'; only labelled tokens are shown.
            if label == 'O':
                continue
            print(f"  {token} -> {label}")
        print()

    print("✅ Prediction testing completed!")
except Exception as e:
    print(f"❌ Prediction error: {e}")

In [None]:
# Cell 8: Generate Simple Analytics
def estimate_lending_score(avg_views):
    """Map an average view count to a simple 0-100 lending score.

    The score grows linearly with the average (10 points per 100 views)
    and saturates at 100 once the average reaches 1000 views.

    Args:
        avg_views: Mean number of views per message. May be NaN, which is
            what a mean over no usable values yields.

    Returns:
        A float between 0.0 and 100.0; 0.0 when `avg_views` is NaN or
        negative.
    """
    # NaN is the only value that is not equal to itself. Without this guard
    # min(nan, 100) would return NaN and the report would show "nan%".
    if avg_views != avg_views:
        return 0.0
    return float(min(max(avg_views, 0) / 1000 * 100, 100))


print("\n=== Generating Simple Analytics ===\n")

try:
    import pandas as pd

    # Load processed data if available
    data_path = '../data/processed/telegram_processed.csv'
    if os.path.exists(data_path):
        # Load only first 100 rows to save memory; the figures below
        # therefore describe at most 100 messages.
        df = pd.read_csv(data_path, nrows=100)

        # Compute the mean once and reuse it for the report and the score.
        avg_views = df['views'].mean()

        print("Simple Vendor Analytics:")
        print(f"  Total messages: {len(df)}")
        print(f"  Unique channels: {df['channel'].nunique()}")
        print(f"  Average views: {avg_views:.2f}")

        # Simple lending score calculation
        lending_score = estimate_lending_score(avg_views)

        print(f"  Estimated lending score: {lending_score:.2f}%")

    else:
        print("⚠️  Processed data not found, skipping analytics")

except Exception as e:
    # Analytics are optional, so a failure here is reported but not fatal.
    print(f"❌ Analytics error: {e}")

In [None]:
# Cell 9: Summary and Cleanup
def list_generated_files(directory):
    """Return the entry names of `directory` in sorted order.

    Args:
        directory: Path of the directory to inspect.

    Returns:
        A sorted list of entry names, or an empty list when `directory`
        does not exist or is not a directory.
    """
    if not os.path.isdir(directory):
        return []
    # os.listdir gives no ordering guarantee; sort for stable output.
    return sorted(os.listdir(directory))


print("\n=== Summary ===\n")

# `train_result` is only assigned when the training call returned, so its
# presence in the notebook namespace is used to avoid reporting success
# after a failed or skipped training cell.
training_succeeded = globals().get("train_result") is not None

if training_succeeded:
    print("✅ Lightweight NER Training Completed!")
else:
    print("⚠️  Training did not complete in this session; check the output of the cells above")

print("\nGenerated files:")
models_dir = "../models"
for item in list_generated_files(models_dir):
    print(f"  - {item}")

print("\nKey Features:")
print("  - Minimal memory usage")
print("  - CPU-only training")
print("  - Small batch sizes")
print("  - Limited data samples")
print("  - Single epoch training")

print("\nNext Steps:")
print("  1. If this worked, you can gradually increase batch_size and num_epochs")
print("  2. For better performance, consider using a machine with more RAM")
if training_succeeded:
    # Only point at the saved model when training actually finished.
    print("  3. The trained model is saved in ../models/lightweight_ner/")

# Final memory cleanup
gc.collect()
print("\n✅ Memory cleanup completed")