# SMS Parser Tool - Comprehensive Testing

## Overview
This notebook tests the SMS Parser Tool against all 50 synthetic SMS messages generated in Milestone 2.

**Test Objectives:**
1. Parse all 50 SMS messages
2. Calculate parsing accuracy
3. Validate extracted data against ground truth
4. Analyze parsing performance by transaction type
5. Identify any edge cases or failures

**Author:** Alfred Munga  
**Date:** November 18, 2025  
**Milestone:** 3 - SMS Parser Tool Testing

## 1. Setup and Imports

In [None]:
import sys
import os

# Put the parent directory (the project root) first on the import path so
# the local `tools` package can be imported from this notebook.
project_root = os.path.abspath(os.pardir)
sys.path[:0] = [project_root]

import pandas as pd
import numpy as np
from decimal import Decimal
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Import our SMS parser
from tools.sms_parser_tool import SMSParserTool

# Pandas display options: show every column, cap cell text at 100 characters,
# and let pandas pick the output width.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 100),
    ('display.width', None),
):
    pd.set_option(option_name, option_value)

# Plot defaults shared by every figure in the notebook
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("‚úì Imports successful!")

## 2. Load Synthetic SMS Dataset

In [None]:
# Read the synthetic SMS dataset; later cells use it as the ground truth.
sms_df = pd.read_csv('../data/synthetic/sms.csv')

# Report size, schema and the class balance of the dataset
print(
    f"Loaded {len(sms_df)} SMS messages",
    f"\nColumns: {list(sms_df.columns)}",
    "\nTransaction type distribution:",
    sep="\n",
)
print(sms_df['transaction_type'].value_counts())

# Preview: the bare last expression is rendered as a table by the notebook
print("\nFirst 5 messages:")
sms_df.head(5)

## 3. Initialize SMS Parser

In [None]:
# Build the parser under test and list the transaction types it declares
parser = SMSParserTool()

print("SMS Parser initialized!")
print("Supported transaction types:")
for position, type_name in enumerate(parser.transaction_types):
    # Number the list starting at 1 for readability
    print(f"  {position + 1}. {type_name}")

## 4. Parse All SMS Messages

In [None]:
# Run the bulk parser over every SMS text in the dataset
print("Parsing all SMS messages...\n")

sms_texts = sms_df['sms_text'].to_list()
parsed_results = parser.parse_bulk(sms_texts)
total_parsed = len(parsed_results)

print(f"‚úì Parsed {total_parsed} messages")

# A result counts as failed when it carries an 'error' key
failed = sum(1 for outcome in parsed_results if 'error' in outcome)
successful = total_parsed - failed
success_rate = successful / total_parsed * 100

print("\nüìä Parsing Results:")
print(f"  ‚úì Successful: {successful}")
print(f"  ‚úó Failed: {failed}")
print(f"  üìà Success Rate: {success_rate:.1f}%")

## 5. Analyze Parsing Accuracy by Transaction Type

In [None]:
# Build one row per parse result and attach the ground-truth columns.
# The join is positional (`.values`), so row i of `parsed_df` corresponds to
# row i of `sms_df`; a length mismatch raises here.
parsed_df = pd.DataFrame(parsed_results)

parsed_df['ground_truth_type'] = sms_df['transaction_type'].values
parsed_df['ground_truth_amount'] = sms_df['amount'].values
parsed_df['ground_truth_reference'] = sms_df['reference'].values

# Row-level success flag.
# NOTE: `'error' in row` on a pandas Series tests the *index labels* (i.e. the
# column names), not whether this particular row has an error. As soon as one
# message fails, the 'error' column exists and every row would be treated as
# failed. Check the cell value instead: rows without an error hold NaN/None.
# (Assumes the parser only sets 'error' to a non-null value on failure.)
if 'error' in parsed_df.columns:
    parsed_without_error = parsed_df['error'].isna()
else:
    parsed_without_error = pd.Series(True, index=parsed_df.index)

# The 'transaction_type' column is absent if no message parsed at all.
if 'transaction_type' in parsed_df.columns:
    type_matches = parsed_df['transaction_type'] == parsed_df['ground_truth_type']
else:
    type_matches = pd.Series(False, index=parsed_df.index)

is_correct = parsed_without_error & type_matches

# Calculate accuracy by transaction type
print("\nüìä Accuracy by Transaction Type:\n")

accuracy_by_type = {}
total_correct = 0
for trans_type in sms_df['transaction_type'].unique():
    # Select on the ground-truth column of parsed_df itself rather than on
    # sms_df's index labels, so the positional join above is all we rely on.
    of_this_type = parsed_df['ground_truth_type'] == trans_type

    total = int(of_this_type.sum())
    correct = int((of_this_type & is_correct).sum())
    accuracy = (correct / total * 100) if total > 0 else 0

    accuracy_by_type[trans_type] = accuracy
    total_correct += correct
    print(f"  {trans_type:15s}: {correct:2d}/{total:2d} ({accuracy:5.1f}%)")

# Overall accuracy must use the sum across all types, not the count left over
# from the last loop iteration.
overall_type_accuracy = (total_correct / len(sms_df) * 100) if len(sms_df) else 0.0
print(f"\nüìà Overall Accuracy: {overall_type_accuracy:.1f}%")

## 6. Visualize Parsing Performance

In [None]:
# Two-panel figure: per-type accuracy (left) and dataset composition (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
accuracy_ax, distribution_ax = axes


def _bar_color(acc):
    """Colour for an accuracy bar: green at exactly 100, orange from 80, else red."""
    if acc == 100:
        return 'green'
    if acc >= 80:
        return 'orange'
    return 'red'


# Left panel: horizontal bars, one per transaction type
types = list(accuracy_by_type)
accuracies = [accuracy_by_type[name] for name in types]
colors = [_bar_color(acc) for acc in accuracies]

accuracy_ax.barh(types, accuracies, color=colors, alpha=0.7)
accuracy_ax.set_xlabel('Accuracy (%)', fontsize=12)
accuracy_ax.set_title('Parsing Accuracy by Transaction Type', fontsize=14, fontweight='bold')
accuracy_ax.set_xlim(0, 105)
# Dashed reference line at the 100% mark
accuracy_ax.axvline(x=100, color='green', linestyle='--', linewidth=2, alpha=0.5)
accuracy_ax.grid(axis='x', alpha=0.3)

# Percentage label just to the right of each bar
for bar_position, acc in enumerate(accuracies):
    accuracy_ax.text(acc + 2, bar_position, f'{acc:.0f}%', va='center', fontsize=10)

# Right panel: share of each transaction type in the ground-truth dataset
type_counts = sms_df['transaction_type'].value_counts()
slice_colors = sns.color_palette('Set3', len(type_counts))
distribution_ax.pie(
    type_counts.values,
    labels=type_counts.index,
    autopct='%1.0f%%',
    startangle=90,
    colors=slice_colors,
)
distribution_ax.set_title('Transaction Type Distribution', fontsize=14, fontweight='bold')

# Persist the figure before showing it
plt.tight_layout()
plt.savefig('../docs/sms_parser_accuracy.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Visualization saved to docs/sms_parser_accuracy.png")

## 7. Validate Extracted Data Accuracy

In [None]:
# Compare parsed amounts and references with the ground-truth columns that
# were attached to `parsed_df` when it was built.
print("\nüîç Validating Extracted Data:\n")

amount_matches = 0
reference_matches = 0
total_valid = 0

amount_errors = []

for idx, row in parsed_df.iterrows():
    # `'error' in row` would test the Series index (the column labels), so a
    # single failed parse anywhere would make every row look failed. Inspect
    # this row's value instead; rows without an error hold NaN/None, and
    # `row.get` returns None when the column does not exist at all.
    if pd.notna(row.get('error')):
        continue

    total_valid += 1

    # Check amount accuracy. A missing column or a NaN cell both mean
    # "no amount was extracted".
    raw_amount = row.get('amount')
    parsed_amount = float(raw_amount) if pd.notna(raw_amount) else None
    ground_truth_amount = row['ground_truth_amount']

    # Use `is not None` so a legitimately parsed amount of 0.0 still gets a diff.
    amount_diff = (
        abs(parsed_amount - ground_truth_amount) if parsed_amount is not None else None
    )

    if amount_diff is not None and amount_diff < 0.01:
        amount_matches += 1
    else:
        amount_errors.append({
            'index': idx,
            'parsed': parsed_amount,
            'expected': ground_truth_amount,
            'diff': amount_diff
        })

    # Check reference accuracy (exact match)
    parsed_ref = row.get('reference', '')
    ground_truth_ref = row['ground_truth_reference']

    if parsed_ref == ground_truth_ref:
        reference_matches += 1

# Guard the percentages so the cell still reports when nothing parsed.
amount_pct = (amount_matches / total_valid * 100) if total_valid else 0.0
reference_pct = (reference_matches / total_valid * 100) if total_valid else 0.0

print(f"Amount Extraction:")
print(f"  ‚úì Correct: {amount_matches}/{total_valid} ({amount_pct:.1f}%)")

print(f"\nReference Extraction:")
print(f"  ‚úì Correct: {reference_matches}/{total_valid} ({reference_pct:.1f}%)")

if amount_errors:
    print(f"\n‚ö†Ô∏è  Found {len(amount_errors)} amount mismatches:")
    for err in amount_errors[:5]:  # Show first 5 errors
        print(f"  SMS {err['index']}: Parsed={err['parsed']}, Expected={err['expected']}, Diff={err['diff']}")

## 8. Test Individual Parsers

In [None]:
# Parse the first message of every transaction type and show what was extracted
print("\nüß™ Testing Individual Transaction Types:\n")

section_rule = '=' * 70
# Optional, type-specific fields and the label each is printed under
optional_fields = (
    ('sender', 'Sender'),
    ('recipient', 'Recipient'),
    ('merchant', 'Merchant'),
    ('bank', 'Bank'),
)

for trans_type in sms_df['transaction_type'].unique():
    # First ground-truth message of this type serves as the sample
    sample = sms_df.loc[sms_df['transaction_type'] == trans_type].iloc[0]

    print("\n" + section_rule)
    print(f"Transaction Type: {trans_type.upper()}")
    print(section_rule)

    result = parser.parse_sms(sample['sms_text'])

    # Failure path: an empty result or one carrying an 'error' key
    if not result or 'error' in result:
        print("‚úó Failed to Parse")
        if result:
            print(f"  Error: {result.get('error', 'Unknown')}")
        continue

    print("‚úì Successfully Parsed")
    print("\nOriginal SMS (first 100 chars):")
    print(f"  {sample['sms_text'][:100]}...")

    # Fields read from every successful result
    print("\nExtracted Data:")
    print(f"  Reference: {result['reference']}")
    print(f"  Amount: KES {result['amount']:,.2f}")
    print(f"  Date: {result['date'].strftime('%d/%m/%Y %I:%M %p')}")
    print(f"  Balance: KES {result['balance']:,.2f}")

    # Fields that only some transaction types carry
    for field_key, field_label in optional_fields:
        if field_key in result:
            print(f"  {field_label}: {result[field_key]}")

    print("\nHuman-Readable Summary:")
    print(f"  {parser.get_transaction_summary(result)}")

    # Run the parser's own validation and list any problems it reports
    is_valid, errors = parser.validate_parsed_data(result)
    if not is_valid:
        print("\n‚úó Validation: FAILED")
        for problem in errors:
            print(f"  - {problem}")
    else:
        print("\n‚úì Validation: PASSED")

## 9. Generate Parsing Statistics

In [None]:
# Aggregate statistics reported by the parser over all parse results
stats = parser.get_statistics(parsed_results)

divider = "=" * 70

print("\nüìä Overall Parsing Statistics\n")
print(divider)

# Counts, each with its share of the total number of messages
print("\nTransaction Summary:")
print(f"  Total Messages: {stats['total_transactions']}")
for line_label, stats_key in (
    ("Successfully Parsed", 'successful_parses'),
    ("Failed to Parse", 'failed_parses'),
):
    share_of_total = stats[stats_key] / stats['total_transactions'] * 100
    print(f"  {line_label}: {stats[stats_key]} ({share_of_total:.1f}%)")

print("\nFinancial Summary:")
print(f"  Total Transaction Amount: KES {stats['total_amount']:,.2f}")

# Per-type counts, alphabetical, as a share of the successful parses
print("\nTransaction Type Breakdown:")
for type_name, type_count in sorted(stats['transaction_type_counts'].items()):
    percentage = type_count / stats['successful_parses'] * 100
    print(f"  {type_name:20s}: {type_count:3d} ({percentage:5.1f}%)")

# The date range is only printed when both ends are available
print("\nDate Range:")
date_range = stats['date_range']
if date_range['earliest'] and date_range['latest']:
    earliest, latest = date_range['earliest'], date_range['latest']
    print(f"  Earliest: {earliest.strftime('%d/%m/%Y')}")
    print(f"  Latest: {latest.strftime('%d/%m/%Y')}")
    days_span = (latest - earliest).days
    print(f"  Span: {days_span} days")

print(f"\n{divider}")

## 10. Edge Case Analysis

In [None]:
# Report every failed parse, then count a few notable value ranges
print("\nüîç Edge Case Analysis\n")
print("=" * 70)

failed_parses = [outcome for outcome in parsed_results if 'error' in outcome]

if not failed_parses:
    print("\n‚úì No parsing failures! All messages parsed successfully.")
else:
    print(f"\nFound {len(failed_parses)} failed parse(s):\n")

    for fail in failed_parses:
        # The failure record supplies its own position and original text
        idx = fail['sms_index']
        print(f"\nSMS Index: {idx}")
        print(f"Error: {fail['error']}")
        print(f"Original Text: {fail['original_text']}")

        # Look up what type this message was expected to be
        gt_type = sms_df.iloc[idx]['transaction_type']
        print(f"Expected Type: {gt_type}")


def _results_where(field, predicate):
    """Return the parse results that have `field` and whose float value satisfies `predicate`."""
    return [
        outcome for outcome in parsed_results
        if field in outcome and predicate(float(outcome[field]))
    ]


# Check for edge cases among results that carry the relevant field
print("\n\nEdge Cases in Successful Parses:\n")

large_amounts = _results_where('amount', lambda value: value > 10000)
print(f"  Large Transactions (>10,000): {len(large_amounts)}")

small_amounts = _results_where('amount', lambda value: value < 100)
print(f"  Small Transactions (<100): {len(small_amounts)}")

negative_balances = _results_where('balance', lambda value: value < 0)
print(f"  Negative Balances: {len(negative_balances)}")

print("\n" + "=" * 70)

## 11. Performance Summary & Recommendations

In [None]:
print("\nüéØ SMS Parser Performance Summary\n")
print("=" * 70)

# Calculate overall metrics
overall_accuracy = (stats['successful_parses'] / stats['total_transactions'] * 100)

print(f"\n‚úÖ ACHIEVEMENTS:")
print(f"  ‚Ä¢ Parsing Accuracy: {overall_accuracy:.1f}%")
print(f"  ‚Ä¢ Amount Extraction: {amount_matches/total_valid*100:.1f}%")
print(f"  ‚Ä¢ Reference Extraction: {reference_matches/total_valid*100:.1f}%")
print(f"  ‚Ä¢ Transaction Types Supported: {len(parser.transaction_types)}")
print(f"  ‚Ä¢ Messages Processed: {stats['total_transactions']}")

# Performance rating
if overall_accuracy >= 95:
    rating = "EXCELLENT üåü"
elif overall_accuracy >= 85:
    rating = "GOOD ‚úì"
elif overall_accuracy >= 75:
    rating = "ACCEPTABLE ‚ö†Ô∏è"
else:
    rating = "NEEDS IMPROVEMENT ‚ö†Ô∏è"

print(f"\nüìä Overall Rating: {rating}")

# Recommendations
print(f"\nüí° RECOMMENDATIONS:")

if overall_accuracy < 100:
    print(f"  ‚Ä¢ Investigate failed parses to improve pattern matching")
    print(f"  ‚Ä¢ Add more test cases for edge scenarios")
else:
    print(f"  ‚Ä¢ Perfect parsing rate achieved!")
    print(f"  ‚Ä¢ Ready for production deployment")

if len(accuracy_by_type) < len(parser.transaction_types):
    missing_types = set(parser.transaction_types) - set(accuracy_by_type.keys())
    print(f"  ‚Ä¢ Generate test data for: {', '.join(missing_types)}")

print(f"  ‚Ä¢ Continue monitoring parsing accuracy with real data")
print(f"  ‚Ä¢ Consider adding support for additional bank formats")

print("\n" + "=" * 70)
print("\n‚úÖ Testing Complete!")

## 12. Export Parsed Results

In [None]:
# Write one flat row per SMS to CSV so the results can be analysed elsewhere
output_path = '../data/synthetic/sms_parsed_results.csv'


def _to_export_row(position, outcome):
    """Flatten one parse result into a CSV row keyed by its position in the input."""
    if 'error' in outcome:
        # Failed parses keep only their status and the error message
        return {
            'sms_index': position,
            'parsing_status': 'failed',
            'error': outcome['error']
        }
    # Successful parses: numeric fields become floats, absent text fields become ''
    return {
        'sms_index': position,
        'parsing_status': 'success',
        'transaction_type': outcome.get('transaction_type'),
        'amount': float(outcome.get('amount', 0)),
        'reference': outcome.get('reference'),
        'date': outcome.get('date'),
        'balance': float(outcome.get('balance', 0)),
        'sender': outcome.get('sender', ''),
        'recipient': outcome.get('recipient', ''),
        'merchant': outcome.get('merchant', ''),
        'bank': outcome.get('bank', ''),
    }


export_data = [
    _to_export_row(position, outcome)
    for position, outcome in enumerate(parsed_results)
]

export_df = pd.DataFrame(export_data)
export_df.to_csv(output_path, index=False)

print(f"\n‚úì Parsed results exported to: {output_path}")
print(f"  Total rows: {len(export_df)}")
print(f"  Columns: {list(export_df.columns)}")