# AI-Powered Financial Data Extraction - Demonstration

## Setup and Configuration

```python
# Import required libraries
import os
import pandas as pd
from sqlalchemy import create_engine, text
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('demo')

# Placeholder shown in this document; it is never a usable key.
PLACEHOLDER_API_KEY = "sk-your-actual-key-here"

# Read the key from the OPENAI_API_KEY environment variable so the secret does
# not have to be pasted into (and saved with) this document. When the variable
# is absent the placeholder is used, which triggers the warning below.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", PLACEHOLDER_API_KEY)

# Verify API key is set (an empty value or the untouched placeholder both count as unset)
if not OPENAI_API_KEY or OPENAI_API_KEY == PLACEHOLDER_API_KEY:
    print("⚠️  Please set your OpenAI API key!")
    print("You can get one at: https://platform.openai.com/")
else:
    print("✅ OpenAI API key is configured")
```

## Demo 1: Extract from Text Document

```python
# Quarterly report with three holdings, used as the input for the text demo
sample_text = """
QUARTERLY INVESTMENT REPORT
Date: March 31, 2024

Portfolio Holdings:

1. Apple Inc. (AAPL)
   - Purchase Price: $45,000
   - Current Value: $52,300
   - Last Quarter: $48,900
   - Currency: USD
   - Sector: Technology
   - Risk: Moderate

2. Microsoft Corp (MSFT)
   - Purchase Price: $35,000
   - Current Value: $41,200
   - Last Quarter: $38,500
   - Currency: USD
   - Sector: Technology
   - Risk: Low

3. Treasury Bond 2030
   - Purchase Price: $100,000
   - Current Value: $98,500
   - Last Quarter: $99,000
   - Currency: USD
   - Sector: Government
   - Risk: Low
   - Maturity: 12/31/2030
   - Yield: 4.25%
"""

# Write the report to disk so the extractor can be pointed at a file path
portfolio_path = 'demo_portfolio.txt'
with open(portfolio_path, 'w') as report_file:
    report_file.write(sample_text)

print("✅ Created demo portfolio text file")
```

```python
# Run the extractor against the sample report written in the previous cell
from ai_data_extractor import main_ai_extraction

logger.info("Starting AI extraction demo...")
success = main_ai_extraction('demo_portfolio.txt', OPENAI_API_KEY)

# The extractor's return value is treated as a success flag
print("✅ AI extraction completed successfully!" if success else "❌ AI extraction failed")
```

## Demo 2: Verify Database Results

```python
# Connect to database and check results
from urllib.parse import quote

# NOTE(review): CONFIG is not defined anywhere in this document; it must already
# exist in the session (presumably the extractor's settings) — confirm where it
# comes from before running this cell.
db_config = CONFIG['database']

# Percent-encode the credentials so characters such as '@', ':' or '/' in the
# user name or password cannot be mistaken for URL delimiters. safe='' also
# encodes '/', and quote() never emits '+', so the value decodes unambiguously.
db_user = quote(str(db_config['user']), safe='')
db_password = quote(str(db_config['password']), safe='')

CONNECTION_STRING = (
    f"postgresql://{db_user}:{db_password}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)

engine = create_engine(CONNECTION_STRING)

# Query extracted data (every row currently in the table)
df = pd.read_sql("SELECT * FROM ai_financial_data", engine)
print("\n📊 Database Results:")
print(f"Total records: {len(df)}")
print("\nExtracted data:")
display(df)

# Query statistics
stats_df = pd.read_sql("SELECT * FROM ai_financial_data_stats", engine)
print("\n📈 Statistics:")
display(stats_df)
```

## Demo 3: Excel Output Review

```python
# Read both sheets of the Excel workbook and show them
excel_path = 'ai_extracted_financial_data.xlsx'

excel_data = pd.read_excel(excel_path, sheet_name='Extracted Data')
print("\n📄 Excel - Extracted Data:")
display(excel_data)

excel_stats = pd.read_excel(excel_path, sheet_name='Statistics')
print("\n📊 Excel - Statistics:")
display(excel_stats)
```

## Demo 4: Compare Different Document Formats

```python
import json

# CSV variant of the portfolio: one header row, one row per holding
csv_data = """Security Name,Original Cost,Current Value,Previous Value,Currency
Apple Inc,45000,52300,48900,USD
Microsoft Corp,35000,41200,38500,USD
Treasury Bond 2030,100000,98500,99000,USD
"""

with open('demo_portfolio.csv', 'w') as csv_file:
    csv_file.write(csv_data)

# JSON variant: a report date plus a list of holdings
json_data = dict(
    report_date="2024-03-31",
    holdings=[
        dict(
            name="Apple Inc",
            original_cost=45000,
            current_value=52300,
            previous_value=48900,
            currency="USD",
        ),
        dict(
            name="Microsoft Corp",
            original_cost=35000,
            current_value=41200,
            previous_value=38500,
            currency="USD",
        ),
    ],
)

# Serialize first, then write the resulting text in one call
json_text = json.dumps(json_data, indent=2)
with open('demo_portfolio.json', 'w') as json_file:
    json_file.write(json_text)

print("✅ Created demo files in different formats")
```

```python
# Run the extractor once per alternate format and remember each outcome
formats = ['csv', 'json']
results = {}

for fmt in formats:
    file_path = f'demo_portfolio.{fmt}'
    label = fmt.upper()
    print(f"\n🔄 Processing {label} file...")

    success = main_ai_extraction(file_path, OPENAI_API_KEY)
    results[fmt] = success

    # Pick the icon and wording together so they can never disagree
    icon, outcome = ('✅', 'completed') if success else ('❌', 'failed')
    print(f"{icon} {label} processing {outcome}")

print("\n📋 Summary of results:")
for fmt, success in results.items():
    status = 'Success' if success else 'Failed'
    print(f"  - {fmt.upper()}: {status}")
```

## Demo 5: Field Recognition Accuracy

```python
# The same kind of holdings, described with different labels for each field
varied_text = """
Asset Portfolio Report
Reporting Date: 03/31/2024

Securities:

Stock: Apple Inc.
  • Acquisition Cost: $45,000
  • Market Value: $52,300
  • Previous Period: $48,900
  • Trading Currency: USD
  • Industry: Technology

Bond: US Treasury 2030
  • Initial Investment: $100,000
  • Fair Value: $98,500
  • Prior Quarter Value: $99,000
  • Denomination: USD
  • Category: Government
  • Maturity Date: December 31, 2030
  • Annual Yield: 4.25%
"""

varied_path = 'demo_varied_fields.txt'
with open(varied_path, 'w') as varied_file:
    varied_file.write(varied_text)

print("✅ Created document with varied field names")

# Extract and analyze
success = main_ai_extraction(varied_path, OPENAI_API_KEY)

if success:
    # This query has no filter, so it returns every row currently in the table
    df_varied = pd.read_sql("SELECT * FROM ai_financial_data", engine)
    print(f"\n📊 Extracted {len(df_varied)} records with varied field names")

    # A column counts as extracted when at least one row holds a non-null value
    column_has_value = df_varied.notna().any()
    fields_found = df_varied.columns[column_has_value].tolist()
    print(f"🎯 Fields successfully extracted: {fields_found}")
```

## Demo 6: Error Handling and Edge Cases

```python
# Two securities with deliberately missing or placeholder fields
incomplete_text = """
Investment Report

Security: Unknown Asset
Current Value: $50,000
Currency: [MISSING]
Sector: Technology

Security: Partial Data Asset
Original Cost: $25,000
Currency: USD
[Other fields missing]
"""

incomplete_path = 'demo_incomplete.txt'
with open(incomplete_path, 'w') as incomplete_file:
    incomplete_file.write(incomplete_text)

print("✅ Created document with incomplete data")

# Extract and see how AI handles missing fields
success = main_ai_extraction(incomplete_path, OPENAI_API_KEY)

if success:
    # This query has no filter, so it returns every row currently in the table
    df_incomplete = pd.read_sql("SELECT * FROM ai_financial_data", engine)
    print(f"\n📊 Extracted {len(df_incomplete)} records from incomplete data")

    # Share of rows, per column, that hold a non-null value (as a percentage)
    non_null_counts = df_incomplete.notna().sum()
    completeness = (non_null_counts / len(df_incomplete) * 100).round(2)

    print("\n🔍 Field completeness analysis:")
    for column_name, filled_percent in completeness.items():
        print(f"  - {column_name}: {filled_percent}%")
```

## Demo 7: Batch Processing

```python
# Generate three small portfolio files, remembering each file name
sample_files = []

for i in range(3):
    filename = f'batch_demo_{i+1}.txt'
    # Values and the day of month are derived from the loop index
    content = f"""
Portfolio Batch {i+1}
Date: 2024-03-{(i+1)*10}

Investment A{i+1}: Value ${10000 + i*5000}
Investment B{i+1}: Value ${20000 + i*3000}
Currency: USD
"""
    with open(filename, 'w') as batch_file:
        batch_file.write(content)
    sample_files.append(filename)

print(f"✅ Created {len(sample_files)} files for batch processing")

# Run the extractor on each file in turn, pairing every path with its outcome
batch_results = []
for file_path in sample_files:
    print(f"\n🔄 Processing {file_path}...")
    success = main_ai_extraction(file_path, OPENAI_API_KEY)
    batch_results.append((file_path, success))

print("\n📋 Batch processing summary:")
for processed_path, succeeded in batch_results:
    verdict = '✅ Success' if succeeded else '❌ Failed'
    print(f"  - {processed_path}: {verdict}")
```

## Demo 8: Performance Analysis

```python
import os
import time

# Measure extraction time. perf_counter() is a monotonic, high-resolution clock,
# so the interval cannot go negative or collapse to zero because of wall-clock
# adjustments or a coarse system timer.
start_time = time.perf_counter()
success = main_ai_extraction('demo_portfolio.txt', OPENAI_API_KEY)
end_time = time.perf_counter()

processing_time = end_time - start_time
print(f"\n⏱️  Processing time: {processing_time:.2f} seconds")
if not success:
    # A failed run may have stopped early, so its timing is not representative
    print("⚠️  Extraction failed; the timing above reflects a failed run")

# Compare with file size
file_size = os.path.getsize('demo_portfolio.txt')
print(f"📄 File size: {file_size} bytes")
if processing_time > 0:
    print(f"🚀 Processing speed: {file_size/processing_time:.2f} bytes/second")
else:
    # Avoid ZeroDivisionError when the elapsed time is too small to measure
    print("🚀 Processing speed: n/a (elapsed time too small to measure)")
```

## Demo 9: API Usage and Costs

```python
# Estimate API costs (approximate)
def estimate_costs(text_length: int, model: str = "gpt-4o"):
    """Return ``(estimated_tokens, estimated_cost)`` for a text of the given length.

    Tokens are approximated as one per four characters. The cost applies the
    per-1K-token price of ``model``; a model that is missing from the price
    table is charged at the ``gpt-4o`` rate.
    """
    # Pricing (as of 2024), in dollars per 1K tokens
    price_per_1k_tokens = {
        "gpt-4o": 0.005,
        "gpt-3.5-turbo": 0.0015,
    }

    if model in price_per_1k_tokens:
        rate = price_per_1k_tokens[model]
    else:
        rate = price_per_1k_tokens["gpt-4o"]

    # Rough token estimation (1 token ≈ 4 characters)
    token_estimate = text_length / 4

    return token_estimate, (token_estimate / 1000) * rate

# Sum the estimated cost of every demo input that exists on disk
total_cost = 0
candidate_files = ['demo_portfolio.txt', 'demo_portfolio.csv', 'demo_portfolio.json']

for file_path in candidate_files:
    # Skip inputs that are not present
    if not os.path.exists(file_path):
        continue

    with open(file_path, 'r') as f:
        content = f.read()

    # The estimate is based on character count, using the default model
    tokens, cost = estimate_costs(len(content))
    total_cost += cost

    print(f"\n💰 {file_path}:")
    print(f"  - Estimated tokens: {tokens:.0f}")
    print(f"  - Estimated cost: ${cost:.4f}")

print(f"\n💳 Total estimated cost: ${total_cost:.4f}")
```

## Demo 10: Validation and Accuracy

```python
import math

# Create a document with known expected results
validation_text = """
Validation Test Portfolio
Date: 2024-03-31

Test Asset 1:
- Security: Validation Stock A
- Original Investment: $10,000.00
- Current Value: $12,000.00
- Prior Value: $11,000.00
- Currency: USD
- Sector: Technology
- Risk: Low

Test Asset 2:
- Security: Validation Bond B
- Original Investment: $50,000.00
- Current Value: $48,500.00
- Prior Value: $49,000.00
- Currency: USD
- Sector: Government
- Risk: Very Low
- Maturity: 2030-12-31
- Yield: 3.5%
"""

with open('validation_test.txt', 'w') as f:
    f.write(validation_text)

# Define expected results
expected_results = [
    {
        'original_security_name': 'Validation Stock A',
        'investment_in_original': 10000.00,
        'investment_in': 12000.00,
        'investment_in_prior': 11000.00,
        'currency': 'USD'
    },
    {
        'original_security_name': 'Validation Bond B',
        'investment_in_original': 50000.00,
        'investment_in': 48500.00,
        'investment_in_prior': 49000.00,
        'currency': 'USD'
    }
]


def _values_match(expected_value, actual_value):
    """Return True when an extracted value agrees with the expected one.

    Numeric expectations are compared with a tolerance after converting the
    actual value to ``float``, so an equivalent number that arrives as a
    ``Decimal``, a NumPy scalar or a numeric string still counts as correct.
    All other expectations are compared as whitespace-trimmed text. A missing
    (``None``/NaN/unconvertible) actual value never matches.
    """
    if actual_value is None:
        return False

    is_numeric = isinstance(expected_value, (int, float)) and not isinstance(expected_value, bool)
    if is_numeric:
        try:
            # abs_tol of half a cent absorbs rounding in two-decimal amounts
            return math.isclose(float(actual_value), expected_value, abs_tol=0.005)
        except (TypeError, ValueError):
            return False

    return str(actual_value).strip() == str(expected_value).strip()


# Extract and compare
success = main_ai_extraction('validation_test.txt', OPENAI_API_KEY)

if success:
    # Get actual results (this query has no filter: it returns the whole table)
    actual_df = pd.read_sql("SELECT * FROM ai_financial_data", engine)

    print("\n✅ Validation Results:")
    print(f"Expected {len(expected_results)} records, got {len(actual_df)}")

    # Compare key fields.
    # NOTE(review): rows are compared by position, which presumably relies on the
    # table holding only this run's rows in document order — confirm against the
    # extractor. 'investment_in_prior' is present in expected_results but is not
    # scored here — confirm whether that is intended.
    accuracy_metrics = {}
    key_fields = ['original_security_name', 'investment_in_original', 'investment_in', 'currency']

    for field in key_fields:
        correct_count = 0
        if field in actual_df.columns:
            # zip() stops at the shorter side, so missing rows simply score as wrong
            actual_values = actual_df[field].tolist()
            for expected, actual_value in zip(expected_results, actual_values):
                if _values_match(expected.get(field), actual_value):
                    correct_count += 1

        accuracy = (correct_count / len(expected_results)) * 100
        accuracy_metrics[field] = accuracy
        print(f"  - {field}: {accuracy:.1f}% accurate")

    overall_accuracy = sum(accuracy_metrics.values()) / len(accuracy_metrics)
    print(f"\n🎯 Overall accuracy: {overall_accuracy:.1f}%")
```

## Conclusion

This demonstration shows how the AI-powered solution:

1. ✅ Handles multiple document formats seamlessly
2. ✅ Recognizes field variations automatically
3. ✅ Extracts data with high accuracy
4. ✅ Handles incomplete data gracefully
5. ✅ Provides detailed statistics and validation
6. ✅ Offers cost-effective processing

The AI approach significantly reduces the maintenance overhead compared to regex-based solutions while providing better accuracy and flexibility for extracting financial data from diverse document formats.