In [1]:
import re
import os
import pandas as pd
import numpy as np
from datetime import datetime
import psycopg2  # For PostgreSQL
import pymysql   # For MySQL
import sqlalchemy
from sqlalchemy import create_engine
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment
import docx2txt  # For .docx files
import PyPDF2    # For PDF files
import csv

In [3]:
# Configuration
CONFIG = {
    "database": {
        "type": "postgresql",  # Change to "mysql" if using MySQL
        "host": "localhost",
        "port": 5432,  # Change to 3306 for MySQL
        "database": "financial_data",
        "user": "postgres",
        "password": "SenkoSQL"
    },
    "extraction": {
        "mandatory_fields": [
            "as_of_date",
            "original_security_name",
            "investment_in_original",
            "investment_in",
            "investment_in_prior",
            "currency"
        ],
        "additional_fields": [
            "sector",
            "risk_rating",
            "maturity_date",
            "yield_percentage"
        ]
    },
    "output": {
        "excel_file": "extracted_financial_data.xlsx"
    }
}


class DocumentExtractor:
    """Handle extraction of data from various document types"""
    
    def __init__(self, file_path):
        self.file_path = file_path
        self.file_extension = os.path.splitext(file_path)[1].lower()
        self.raw_text = ""
        self.extracted_data = []
        
    def extract_text(self):
        """Extract raw text from document based on file extension"""
        if self.file_extension == ".docx":
            self.raw_text = docx2txt.process(self.file_path)
        elif self.file_extension == ".pdf":
            with open(self.file_path, "rb") as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num in range(len(pdf_reader.pages)):
                    page = pdf_reader.pages[page_num]
                    self.raw_text += page.extract_text()
        elif self.file_extension == ".txt":
            with open(self.file_path, "r", encoding="utf-8") as file:
                self.raw_text = file.read()
        elif self.file_extension == ".csv":
            with open(self.file_path, 'r', encoding='utf-8') as file:
                reader = csv.reader(file)
                for row in reader:
                    self.raw_text += ' '.join(row) + '\n'
        else:
            raise ValueError(f"Unsupported file format: {self.file_extension}")
            
        return self.raw_text
    
    def extract_data(self):
        """Extract structured data from the text"""
        # Make sure we have text to process
        if not self.raw_text:
            self.extract_text()
            
        # Initialize data dictionaries list
        data_dicts = []
        
        # Global date - looks for the as_of_date field in the whole document
        as_of_date = None
        date_match = re.search(r"As of date:\s+([0-9]{1,2}[\/\-\.][0-9]{1,2}[\/\-\.][0-9]{2,4})", self.raw_text)
        if date_match:
            as_of_date = date_match.group(1).strip()
        
        # Patterns for extraction
        patterns = {
            "original_security_name": r"Original security name:\s+([A-Za-z0-9\s\.\,\&\-]+)",
            "investment_in_original": r"Investment in \(original\):\s+([0-9,\.]+)",
            "investment_in": r"Investment in:\s+([0-9,\.]+)",
            "investment_in_prior": r"Investment in \(prior\):\s+([0-9,\.]+)",
            "currency": r"Currency:\s+([A-Z]{3})",
            "sector": r"Sector:\s+([A-Za-z\s\&]+)",
            "risk_rating": r"Risk rating:\s+([A-Za-z0-9\-]+)",
            "maturity_date": r"Maturity date:\s+([0-9]{1,2}[\/\-\.][0-9]{1,2}[\/\-\.][0-9]{2,4}|N\/A)",
            "yield_percentage": r"Yield percentage:\s+([0-9\.]+)[%]?"
        }
        
        # Find investment sections
        investment_sections = re.findall(r"#### INVESTMENT \d+\n(.*?)(?=#### INVESTMENT \d+|\Z)", 
                                       self.raw_text, re.DOTALL)
        
        # Process each investment section
        for section in investment_sections:
            data_dict = {}
            
            # Add the global date to each investment record
            if as_of_date:
                data_dict["as_of_date"] = as_of_date
            
            # Extract each field from the section
            for field, pattern in patterns.items():
                match = re.search(pattern, section)
                if match:
                    data_dict[field] = match.group(1).strip()
                    
            # Only add if we found at least one field
            if data_dict:
                data_dicts.append(data_dict)
        
        # If no investment sections were found, try a more general approach
        if not data_dicts:
            # Create a record for the as_of_date if found
            if as_of_date:
                data_dicts.append({"as_of_date": as_of_date})
            
            # Try to find individual fields across the whole document
            for field, pattern in patterns.items():
                matches = re.findall(pattern, self.raw_text)
                for i, match in enumerate(matches):
                    # Create new dictionaries as needed
                    while i >= len(data_dicts):
                        data_dicts.append({})
                    data_dicts[i][field] = match.strip()
        
        self.extracted_data = data_dicts
        return data_dicts


class DataProcessor:
    """Process and format the extracted data"""
    
    def __init__(self, data_list):
        self.raw_data = data_list
        self.processed_data = []
        self.extraction_stats = {}
        
    def format_data(self):
        """Format all data according to US standards"""
        for item in self.raw_data:
            processed_item = {}
            
            # Process date fields (MM/DD/YYYY)
            for field in ['as_of_date', 'maturity_date']:
                if field in item:
                    try:
                        # Try different date formats
                        for fmt in ['%d/%m/%Y', '%d-%m-%Y', '%d.%m.%Y', '%m/%d/%Y', '%Y-%m-%d']:
                            try:
                                date_obj = datetime.strptime(item[field], fmt)
                                processed_item[field] = date_obj.strftime('%m/%d/%Y')
                                break
                            except ValueError:
                                continue
                    except:
                        # If we can't parse it, keep the original
                        processed_item[field] = item[field]
            
            # Process currency fields (USD format with 2 decimal places)
            for field in ['investment_in_original', 'investment_in', 'investment_in_prior']:
                if field in item:
                    try:
                        # Remove any non-numeric characters except decimal point
                        value = re.sub(r'[^\d.]', '', item[field])
                        # Format as currency with 2 decimal places
                        processed_item[field] = "{:.2f}".format(float(value))
                    except:
                        processed_item[field] = item[field]
            
            # Process yield percentage
            if 'yield_percentage' in item:
                try:
                    value = re.sub(r'[^\d.]', '', item['yield_percentage'])
                    processed_item['yield_percentage'] = "{:.2f}".format(float(value))
                except:
                    processed_item['yield_percentage'] = item['yield_percentage']
            
            # Copy other fields as is
            for field in ['original_security_name', 'currency', 'sector', 'risk_rating']:
                if field in item:
                    processed_item[field] = item[field]
            
            self.processed_data.append(processed_item)
        
        return self.processed_data
    
    def calculate_statistics(self):
        """Calculate extraction statistics"""
        total_records = len(self.processed_data)
        if total_records == 0:
            self.extraction_stats = {
                "total_records": 0,
                "mandatory_fields_extracted": 0,
                "mandatory_fields_percentage": 0,
                "extraction_accuracy": 0,
                "missing_fields": CONFIG["extraction"]["mandatory_fields"],
                "inconsistent_data": []
            }
            return self.extraction_stats
        
        # Count mandatory fields
        mandatory_fields = CONFIG["extraction"]["mandatory_fields"]
        mandatory_field_counts = {field: 0 for field in mandatory_fields}
        
        for record in self.processed_data:
            for field in mandatory_fields:
                if field in record and record[field]:
                    mandatory_field_counts[field] += 1
        
        # Calculate percentages
        total_mandatory_fields = len(mandatory_fields) * total_records
        extracted_mandatory_fields = sum(mandatory_field_counts.values())
        
        mandatory_fields_percentage = (extracted_mandatory_fields / total_mandatory_fields) * 100 if total_mandatory_fields > 0 else 0
        
        # Identify missing and inconsistent data
        missing_fields = []
        for field, count in mandatory_field_counts.items():
            if count < total_records:
                missing_fields.append(f"{field} ({total_records - count} missing)")
        
        # Check for inconsistent data
        inconsistent_data = []
        
        # Example check: Verify if currencies are consistent
        currencies = set()
        for record in self.processed_data:
            if 'currency' in record and record['currency']:
                currencies.add(record['currency'])
        
        if len(currencies) > 1:
            inconsistent_data.append(f"Multiple currencies detected: {', '.join(currencies)}")
        
        # Calculate overall extraction accuracy (simplified)
        extraction_accuracy = mandatory_fields_percentage  # This can be refined with additional logic
        
        self.extraction_stats = {
            "total_records": total_records,
            "mandatory_fields_extracted": extracted_mandatory_fields,
            "mandatory_fields_percentage": mandatory_fields_percentage,
            "extraction_accuracy": extraction_accuracy,
            "missing_fields": missing_fields,
            "inconsistent_data": inconsistent_data
        }
        
        return self.extraction_stats


class DataStorage:
    """Store processed data in database and Excel file"""
    
    def __init__(self, processed_data, stats):
        self.data = processed_data
        self.stats = stats
        self.db_config = CONFIG["database"]
        self.excel_file = CONFIG["output"]["excel_file"]
        
    def create_dataframe(self):
        """Convert processed data to pandas DataFrame"""
        return pd.DataFrame(self.data)
    
    def store_in_database(self):
        """Store data in SQL database"""
        try:
            # Create database connection
            if self.db_config["type"] == "postgresql":
                connection_string = f"postgresql://{self.db_config['user']}:{self.db_config['password']}@{self.db_config['host']}:{self.db_config['port']}/{self.db_config['database']}"
            else:  # MySQL
                connection_string = f"mysql+pymysql://{self.db_config['user']}:{self.db_config['password']}@{self.db_config['host']}:{self.db_config['port']}/{self.db_config['database']}"
            
            engine = create_engine(connection_string)
            
            # Convert data to DataFrame
            df = self.create_dataframe()
            
            # Store in database
            df.to_sql('financial_data', engine, if_exists='replace', index=False)
            
            return True
        except Exception as e:
            print(f"Database storage error: {e}")
            return False
    
    def store_in_excel(self):
        """Store data in Excel file with two sheets"""
        try:
            # Create a Pandas Excel writer
            writer = pd.ExcelWriter(self.excel_file, engine='openpyxl')
            
            # Convert data to DataFrame
            df = self.create_dataframe()
            
            # Write data to "Extracted Data" sheet
            df.to_excel(writer, sheet_name='Extracted Data', index=False)
            
            # Create statistics DataFrame
            stats_data = {
                "Metric": [
                    "Total Records Processed",
                    "Mandatory Fields Extracted",
                    "Mandatory Fields Percentage",
                    "Extraction Accuracy",
                    "Missing Fields",
                    "Inconsistent Data"
                ],
                "Value": [
                    self.stats["total_records"],
                    self.stats["mandatory_fields_extracted"],
                    f"{self.stats['mandatory_fields_percentage']:.2f}%",
                    f"{self.stats['extraction_accuracy']:.2f}%",
                    ", ".join(self.stats["missing_fields"]) if self.stats["missing_fields"] else "None",
                    ", ".join(self.stats["inconsistent_data"]) if self.stats["inconsistent_data"] else "None"
                ]
            }
            
            stats_df = pd.DataFrame(stats_data)
            
            # Write statistics to "Statistics" sheet
            stats_df.to_excel(writer, sheet_name='Statistics', index=False)
            
            # Save the Excel file
            writer.close()
            
            return True
        except Exception as e:
            print(f"Excel storage error: {e}")
            return False


def main(file_path):
    """Main function to orchestrate the extraction, processing and storage"""
    # Step 1: Extract data from document
    print(f"Extracting data from {file_path}...")
    extractor = DocumentExtractor(file_path)
    raw_data = extractor.extract_data()
    
    if not raw_data:
        print("No data was extracted from the document.")
        return False
    
    print(f"Extracted {len(raw_data)} records.")
    
    # Step 2: Process and format data
    print("Processing and formatting data...")
    processor = DataProcessor(raw_data)
    processed_data = processor.format_data()
    stats = processor.calculate_statistics()
    
    # Step 3: Store data
    print("Storing data...")
    storage = DataStorage(processed_data, stats)
    
    # Attempt to store in database
    db_result = storage.store_in_database()
    if db_result:
        print(f"Data successfully stored in {CONFIG['database']['type']} database.")
    else:
        print(f"Failed to store data in database. Check your database connection settings.")
    
    # Store in Excel
    excel_result = storage.store_in_excel()
    if excel_result:
        print(f"Data successfully stored in Excel file: {CONFIG['output']['excel_file']}")
    else:
        print("Failed to store data in Excel file.")
    
    # Step 4: Print statistics
    print("\nExtraction Statistics:")
    print(f"Total Records: {stats['total_records']}")
    print(f"Mandatory Fields Extracted: {stats['mandatory_fields_extracted']}")
    print(f"Mandatory Fields Percentage: {stats['mandatory_fields_percentage']:.2f}%")
    print(f"Extraction Accuracy: {stats['extraction_accuracy']:.2f}%")
    
    if stats['missing_fields']:
        print(f"Missing Fields: {', '.join(stats['missing_fields'])}")
    else:
        print("Missing Fields: None")
    
    if stats['inconsistent_data']:
        print(f"Inconsistent Data: {', '.join(stats['inconsistent_data'])}")
    else:
        print("Inconsistent Data: None")
    
    return True

# IMPORTANT: Don't run this part in the cell
# if __name__ == "__main__":
#     file_path = "example_financial_document.docx"
#     main(file_path)

In [5]:
def create_sample_document():
    """
    Create a sample financial document for testing.
    Returns the path to the created document.
    """
    sample_text = """# QUARTERLY INVESTMENT REPORT
## Confidential Financial Document

As of date: 03/31/2024

### DETAILED INVESTMENTS

#### INVESTMENT 1
Original security name: Global Technology Fund Class A
Investment in (original): 400000.00
Investment in: 475250.00
Investment in (prior): 425800.00
Currency: USD
Sector: Technology
Risk rating: Moderate
Maturity date: N/A
Yield percentage: 2.45%

#### INVESTMENT 2
Original security name: Emerging Markets ETF
Investment in (original): 200000.00
Investment in: 180500.75
Investment in (prior): 194325.00
Currency: USD
Sector: International
Risk rating: High
Maturity date: N/A
Yield percentage: 3.85%

#### INVESTMENT 3
Original security name: US Treasury Bond 2026
Investment in (original): 250000.00
Investment in: 250000.00
Investment in (prior): 250000.00
Currency: USD
Sector: Government
Risk rating: Low
Maturity date: 06/15/2026
Yield percentage: 4.25%
"""
    
    # Create a sample text file
    sample_file_path = "sample_financial_document.txt"
    with open(sample_file_path, 'w') as f:
        f.write(sample_text)
    
    return sample_file_path

# Create the sample document
sample_path = create_sample_document()
print(f"Created sample document: {sample_path}")

Created sample document: sample_financial_document.txt


In [7]:
# Test extraction with the sample document
extractor = DocumentExtractor(sample_path)
extracted_data = extractor.extract_data()

print("\n--- Extracted Raw Data ---")
for i, item in enumerate(extracted_data):
    print(f"\nRecord {i+1}:")
    for key, value in item.items():
        print(f"  {key}: {value}")


--- Extracted Raw Data ---

Record 1:
  as_of_date: 03/31/2024
  original_security_name: Global Technology Fund Class A
Investment in
  investment_in_original: 400000.00
  investment_in: 475250.00
  investment_in_prior: 425800.00
  currency: USD
  sector: Technology
Risk rating
  risk_rating: Moderate
  maturity_date: N/A
  yield_percentage: 2.45

Record 2:
  as_of_date: 03/31/2024
  original_security_name: Emerging Markets ETF
Investment in
  investment_in_original: 200000.00
  investment_in: 180500.75
  investment_in_prior: 194325.00
  currency: USD
  sector: International
Risk rating
  risk_rating: High
  maturity_date: N/A
  yield_percentage: 3.85

Record 3:
  as_of_date: 03/31/2024
  original_security_name: US Treasury Bond 2026
Investment in
  investment_in_original: 250000.00
  investment_in: 250000.00
  investment_in_prior: 250000.00
  currency: USD
  sector: Government
Risk rating
  risk_rating: Low
  maturity_date: 06/15/2026
  yield_percentage: 4.25


In [9]:
# Process the extracted data
processor = DataProcessor(extracted_data)
processed_data = processor.format_data()
stats = processor.calculate_statistics()

print("\n--- Processed Data ---")
for i, item in enumerate(processed_data):
    print(f"\nRecord {i+1}:")
    for key, value in item.items():
        print(f"  {key}: {value}")

print("\n--- Extraction Statistics ---")
print(f"Total Records: {stats['total_records']}")
print(f"Mandatory Fields Extracted: {stats['mandatory_fields_extracted']}")
print(f"Mandatory Fields Percentage: {stats['mandatory_fields_percentage']:.2f}%")
print(f"Extraction Accuracy: {stats['extraction_accuracy']:.2f}%")

if stats['missing_fields']:
    print(f"Missing Fields: {', '.join(stats['missing_fields'])}")
else:
    print("Missing Fields: None")

if stats['inconsistent_data']:
    print(f"Inconsistent Data: {', '.join(stats['inconsistent_data'])}")
else:
    print("Inconsistent Data: None")


--- Processed Data ---

Record 1:
  as_of_date: 03/31/2024
  investment_in_original: 400000.00
  investment_in: 475250.00
  investment_in_prior: 425800.00
  yield_percentage: 2.45
  original_security_name: Global Technology Fund Class A
Investment in
  currency: USD
  sector: Technology
Risk rating
  risk_rating: Moderate

Record 2:
  as_of_date: 03/31/2024
  investment_in_original: 200000.00
  investment_in: 180500.75
  investment_in_prior: 194325.00
  yield_percentage: 3.85
  original_security_name: Emerging Markets ETF
Investment in
  currency: USD
  sector: International
Risk rating
  risk_rating: High

Record 3:
  as_of_date: 03/31/2024
  maturity_date: 06/15/2026
  investment_in_original: 250000.00
  investment_in: 250000.00
  investment_in_prior: 250000.00
  yield_percentage: 4.25
  original_security_name: US Treasury Bond 2026
Investment in
  currency: USD
  sector: Government
Risk rating
  risk_rating: Low

--- Extraction Statistics ---
Total Records: 3
Mandatory Fields Extra

In [11]:
# Store data in Excel
storage = DataStorage(processed_data, stats)
excel_result = storage.store_in_excel()

if excel_result:
    print(f"\nData successfully stored in Excel file: {CONFIG['output']['excel_file']}")
    # Read and display Excel file contents to verify
    try:
        df = pd.read_excel(CONFIG['output']['excel_file'], sheet_name='Extracted Data')
        print("\n--- Excel Data Preview ---")
        display(df)  # Jupyter will nicely format this
        
        stats_df = pd.read_excel(CONFIG['output']['excel_file'], sheet_name='Statistics')
        print("\n--- Excel Statistics Preview ---")
        display(stats_df)  # Jupyter will nicely format this
    except Exception as e:
        print(f"Error reading Excel file: {e}")
else:
    print("Failed to store data in Excel file.")


Data successfully stored in Excel file: extracted_financial_data.xlsx

--- Excel Data Preview ---


Unnamed: 0,as_of_date,investment_in_original,investment_in,investment_in_prior,yield_percentage,original_security_name,currency,sector,risk_rating,maturity_date
0,03/31/2024,400000.0,475250.0,425800.0,2.45,Global Technology Fund Class A\nInvestment in,USD,Technology\nRisk rating,Moderate,
1,03/31/2024,200000.0,180500.75,194325.0,3.85,Emerging Markets ETF\nInvestment in,USD,International\nRisk rating,High,
2,03/31/2024,250000.0,250000.0,250000.0,4.25,US Treasury Bond 2026\nInvestment in,USD,Government\nRisk rating,Low,06/15/2026



--- Excel Statistics Preview ---


Unnamed: 0,Metric,Value
0,Total Records Processed,3
1,Mandatory Fields Extracted,18
2,Mandatory Fields Percentage,100.00%
3,Extraction Accuracy,100.00%
4,Missing Fields,
5,Inconsistent Data,


In [13]:
# Fix for dropping the view
try:
    connection_string = f"postgresql://{CONFIG['database']['user']}:{CONFIG['database']['password']}@{CONFIG['database']['host']}:{CONFIG['database']['port']}/{CONFIG['database']['database']}"
    engine = create_engine(connection_string)
    
    # Use text() to make the SQL executable
    from sqlalchemy import text
    with engine.connect() as connection:
        connection.execute(text("DROP VIEW IF EXISTS financial_data_stats CASCADE;"))
        connection.commit()
    
    print("Successfully dropped the view (if it existed).")
except Exception as e:
    print(f"Error dropping view: {e}")

Successfully dropped the view (if it existed).


In [15]:
# Store data in database and verify
try:
    # Step 1: Store the data
    db_result = storage.store_in_database()
    if db_result:
        print(f"Data successfully stored in {CONFIG['database']['type']} database.")
        
        connection_string = f"postgresql://{CONFIG['database']['user']}:{CONFIG['database']['password']}@{CONFIG['database']['host']}:{CONFIG['database']['port']}/{CONFIG['database']['database']}"
        engine = create_engine(connection_string)
        
        # Step 2: Verify the data
        query = "SELECT * FROM financial_data"
        db_data = pd.read_sql(query, engine)
        
        print("\n--- Database Data Preview ---")
        display(db_data)
        
        # Step 3: Recreate the view
        view_sql = """
        CREATE OR REPLACE VIEW financial_data_stats AS
        SELECT
            COUNT(*) AS total_records,
            SUM(CASE WHEN as_of_date IS NOT NULL THEN 1 ELSE 0 END) AS as_of_date_count,
            SUM(CASE WHEN original_security_name IS NOT NULL THEN 1 ELSE 0 END) AS security_name_count,
            SUM(CASE WHEN investment_in_original IS NOT NULL THEN 1 ELSE 0 END) AS investment_original_count,
            SUM(CASE WHEN investment_in IS NOT NULL THEN 1 ELSE 0 END) AS investment_current_count,
            SUM(CASE WHEN investment_in_prior IS NOT NULL THEN 1 ELSE 0 END) AS investment_prior_count,
            SUM(CASE WHEN currency IS NOT NULL THEN 1 ELSE 0 END) AS currency_count,
            COUNT(DISTINCT currency) AS currency_count_distinct
        FROM financial_data;
        """
        
        from sqlalchemy import text
        with engine.connect() as connection:
            connection.execute(text(view_sql))
            connection.commit()
        
        print("\nSuccessfully created financial_data_stats view.")
        
        # Query the view to verify
        query = "SELECT * FROM financial_data_stats"
        stats_data = pd.read_sql(query, engine)
        
        print("\n--- Database Stats View Preview ---")
        display(stats_data)
    else:
        print(f"Failed to store data in database.")
except Exception as e:
    print(f"Database operation error: {e}")

Data successfully stored in postgresql database.

--- Database Data Preview ---


Unnamed: 0,as_of_date,investment_in_original,investment_in,investment_in_prior,yield_percentage,original_security_name,currency,sector,risk_rating,maturity_date
0,03/31/2024,400000.0,475250.0,425800.0,2.45,Global Technology Fund Class A\nInvestment in,USD,Technology\nRisk rating,Moderate,
1,03/31/2024,200000.0,180500.75,194325.0,3.85,Emerging Markets ETF\nInvestment in,USD,International\nRisk rating,High,
2,03/31/2024,250000.0,250000.0,250000.0,4.25,US Treasury Bond 2026\nInvestment in,USD,Government\nRisk rating,Low,06/15/2026



Successfully created financial_data_stats view.

--- Database Stats View Preview ---


Unnamed: 0,total_records,as_of_date_count,security_name_count,investment_original_count,investment_current_count,investment_prior_count,currency_count,currency_count_distinct
0,3,3,3,3,3,3,3,1


In [17]:
def create_standard_document():
    """
    Create a standard financial document that matches a typical format.
    All mandatory fields are clearly presented.
    """
    sample_text = """# INVESTMENT PORTFOLIO REPORT
## FINANCIAL STATEMENT

As of date: 04/15/2025

### HOLDINGS BREAKDOWN

#### INVESTMENT 1
Original security name: Global Fixed Income Fund
Investment in (original): 500000.00
Investment in: 522500.00
Investment in (prior): 512300.00
Currency: USD
Sector: Fixed Income
Risk rating: Low
Maturity date: N/A
Yield percentage: 3.75%

#### INVESTMENT 2
Original security name: Tech Sector ETF
Investment in (original): 350000.00
Investment in: 418500.00
Investment in (prior): 386500.00
Currency: USD
Sector: Technology
Risk rating: High
Maturity date: N/A
Yield percentage: 1.25%

#### INVESTMENT 3
Original security name: European Government Bond
Investment in (original): 275000.00
Investment in: 279000.00
Investment in (prior): 276250.00
Currency: EUR
Sector: Government
Risk rating: Low
Maturity date: 03/15/2027
Yield percentage: 2.85%

#### INVESTMENT 4
Original security name: Healthcare Innovation Fund
Investment in (original): 225000.00
Investment in: 243000.00
Investment in (prior): 234000.00
Currency: USD
Sector: Healthcare
Risk rating: Medium
Maturity date: N/A
Yield percentage: 1.65%
"""
    
    file_path = "standard_financial_document.txt"
    with open(file_path, 'w') as f:
        f.write(sample_text)
    
    return file_path

In [19]:
def create_complex_document():
    """
    Create a more complex financial document with different formatting
    and slight variations in field names to test the robustness of extraction.
    """
    sample_text = """QUARTERLY FINANCIAL HOLDINGS REPORT
CONFIDENTIAL - FOR AUTHORIZED PERSONNEL ONLY

Date of Valuation: 04/15/2025

INVESTMENT DETAILS:

Investment ID: INV-001
Security: Global Fixed Income Fund
Original Investment Amount: 500,000.00
Current Value: 522,500.00
Previous Quarter Value: 512,300.00
Currency Code: USD
Investment Category: Fixed Income
Risk Level: Low
Term: Open-ended
Annual Yield: 3.75%

----------------------------

Investment ID: INV-002
Security: Tech Sector ETF
Original Investment Amount: 350,000.00
Current Value: 418,500.00
Previous Quarter Value: 386,500.00
Currency Code: USD
Investment Category: Technology
Risk Level: High
Term: Open-ended
Annual Yield: 1.25%

----------------------------

Investment ID: INV-003
Security: European Government Bond
Original Investment Amount: 275,000.00
Current Value: 279,000.00
Previous Quarter Value: 276,250.00
Currency Code: EUR
Investment Category: Government
Risk Level: Low
Term: Matures 03/15/2027
Annual Yield: 2.85%

----------------------------

Investment ID: INV-004
Security: Healthcare Innovation Fund
Original Investment Amount: 225,000.00
Current Value: 243,000.00
Previous Quarter Value: 234,000.00
Currency Code: USD
Investment Category: Healthcare
Risk Level: Medium
Term: Open-ended
Annual Yield: 1.65%

Report generated on 04/15/2025
"""
    
    file_path = "complex_financial_document.txt"
    with open(file_path, 'w') as f:
        f.write(sample_text)
    
    return file_path

In [21]:
def create_challenging_document():
    """
    Create a challenging document with missing fields, inconsistent formatting,
    and other issues that might occur in real-world documents.
    """
    sample_text = """PORTFOLIO VALUATION SUMMARY
Report Generation: April 15, 2025

ASSET BREAKDOWN:

BOND: US Treasury 10-Year
Initial purchase: $300,000
Current market value: $310,500
Last quarter value: $305,250
Denominated in: USD
Classification: Government Bond
Risk profile: Very Low
Maturity: 2035-06-15
Coupon rate: 4.25%

EQUITY FUND: Emerging Markets Opportunity
Acquisition cost: 250000.00 USD
Present value: 230000.00 USD
Value as of Q4 2024: 240000.00 USD
Sector: International Equities
Volatility: High
Expected annual return: 8.5%

MIXED ASSET:
Name: Balanced Growth Portfolio
Original cost basis: 425000
Valuation: 450000
Prior valuation: 437500
Currency: USD
Composition: 60% Equity, 40% Fixed Income
Risk: Medium
Dividend yield: 3.2%

ALTERNATIVE:
European Infrastructure Fund
Cost: 175,000 EUR
Market value: 182,000 EUR
Q1 value: 178,500 EUR
Type: Infrastructure
Risk assessment: Medium-Low
Distribution yield: 5.1%

NOTE: All valuations effective as of 04/15/2025 unless otherwise stated.
"""
    
    file_path = "challenging_financial_document.txt"
    with open(file_path, 'w') as f:
        f.write(sample_text)
    
    return file_path

In [23]:
def test_extraction_system():
    """Test the extraction system against different document formats"""
    
    # Create all test documents
    standard_path = create_standard_document()
    complex_path = create_complex_document()
    challenging_path = create_challenging_document()
    
    documents = [
        {"name": "Standard Document", "path": standard_path},
        {"name": "Complex Document", "path": complex_path},
        {"name": "Challenging Document", "path": challenging_path}
    ]
    
    # Test each document
    for doc in documents:
        print(f"\n\n{'='*50}")
        print(f"TESTING: {doc['name']}")
        print(f"{'='*50}\n")
        
        # Extract data
        extractor = DocumentExtractor(doc['path'])
        extracted_data = extractor.extract_data()
        
        # Process data
        processor = DataProcessor(extracted_data)
        processed_data = processor.format_data()
        stats = processor.calculate_statistics()
        
        # Display results
        print(f"--- Extraction Results for {doc['name']} ---")
        print(f"Total Records: {stats['total_records']}")
        print(f"Mandatory Fields Extracted: {stats['mandatory_fields_extracted']}")
        print(f"Mandatory Fields Percentage: {stats['mandatory_fields_percentage']:.2f}%")
        print(f"Extraction Accuracy: {stats['extraction_accuracy']:.2f}%")
        
        if stats['missing_fields']:
            print(f"Missing Fields: {', '.join(stats['missing_fields'])}")
        else:
            print("Missing Fields: None")
        
        if stats['inconsistent_data']:
            print(f"Inconsistent Data: {', '.join(stats['inconsistent_data'])}")
        else:
            print("Inconsistent Data: None")
        
        # Store in Excel for this document
        storage = DataStorage(processed_data, stats)
        excel_result = storage.store_in_excel()
        
        if excel_result:
            excel_name = f"{doc['name'].lower().replace(' ', '_')}_results.xlsx"
            # Rename the Excel file to avoid overwriting
            import os
            os.rename(CONFIG['output']['excel_file'], excel_name)
            print(f"Results saved to Excel file: {excel_name}")

# Run the test
test_extraction_system()



TESTING: Standard Document

--- Extraction Results for Standard Document ---
Total Records: 4
Mandatory Fields Extracted: 24
Mandatory Fields Percentage: 100.00%
Extraction Accuracy: 100.00%
Missing Fields: None
Inconsistent Data: Multiple currencies detected: EUR, USD
Results saved to Excel file: standard_document_results.xlsx


TESTING: Complex Document

--- Extraction Results for Complex Document ---
Total Records: 0
Mandatory Fields Extracted: 0
Mandatory Fields Percentage: 0.00%
Extraction Accuracy: 0.00%
Missing Fields: as_of_date, original_security_name, investment_in_original, investment_in, investment_in_prior, currency
Inconsistent Data: None
Results saved to Excel file: complex_document_results.xlsx


TESTING: Challenging Document

--- Extraction Results for Challenging Document ---
Total Records: 1
Mandatory Fields Extracted: 1
Mandatory Fields Percentage: 16.67%
Extraction Accuracy: 16.67%
Missing Fields: as_of_date (1 missing), original_security_name (1 missing), invest