In [6]:
import re
import os
import pandas as pd
import numpy as np
from datetime import datetime
import psycopg2  # For PostgreSQL
import pymysql   # For MySQL
import sqlalchemy
from sqlalchemy import create_engine
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment
import docx2txt  # For .docx files
import PyPDF2    # For PDF files
import csv

In [8]:
# Configuration
CONFIG = {
    "database": {
        "type": "postgresql",  # Change to "mysql" if using MySQL
        "host": "localhost",
        "port": 5432,  # Change to 3306 for MySQL
        "database": "financial_data",
        "user": "postgres",
        "password": "SenkoSQL"
    },
    "extraction": {
        "mandatory_fields": [
            "as_of_date",
            "original_security_name",
            "investment_in_original",
            "investment_in",
            "investment_in_prior",
            "currency"
        ],
        "additional_fields": [
            "sector",
            "risk_rating",
            "maturity_date",
            "yield_percentage"
        ]
    },
    "output": {
        "excel_file": "extracted_financial_data.xlsx"
    }
}


class DocumentExtractor:
    """Handle extraction of data from various document types"""
    
    def __init__(self, file_path):
        self.file_path = file_path
        self.file_extension = os.path.splitext(file_path)[1].lower()
        self.raw_text = ""
        self.extracted_data = []
        
    def extract_text(self):
        """Extract raw text from document based on file extension"""
        if self.file_extension == ".docx":
            self.raw_text = docx2txt.process(self.file_path)
        elif self.file_extension == ".pdf":
            with open(self.file_path, "rb") as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num in range(len(pdf_reader.pages)):
                    page = pdf_reader.pages[page_num]
                    self.raw_text += page.extract_text()
        elif self.file_extension == ".txt":
            with open(self.file_path, "r", encoding="utf-8") as file:
                self.raw_text = file.read()
        elif self.file_extension == ".csv":
            with open(self.file_path, 'r', encoding='utf-8') as file:
                reader = csv.reader(file)
                for row in reader:
                    self.raw_text += ' '.join(row) + '\n'
        else:
            raise ValueError(f"Unsupported file format: {self.file_extension}")
            
        return self.raw_text
    
    def extract_data(self):
        """Extract structured data from the text"""
        # Make sure we have text to process
        if not self.raw_text:
            self.extract_text()
            
        # Initialize data dictionaries list
        data_dicts = []
        
        # Patterns for extraction - these will need to be adjusted based on the actual document format
        patterns = {
            "as_of_date": r"(?:As of date|Date)[:\s]+([0-9]{1,2}[\/\-\.][0-9]{1,2}[\/\-\.][0-9]{2,4})",
            "original_security_name": r"(?:Original security name|Security)[:\s]+([A-Za-z0-9\s\.\,\&\-]+)",
            "investment_in_original": r"(?:Investment in \(original\)|Original investment)[:\s]+([0-9,\.]+)",
            "investment_in": r"(?:Investment in(?!\s+\()|Current investment)[:\s]+([0-9,\.]+)",
            "investment_in_prior": r"(?:Investment in \(prior\)|Prior investment)[:\s]+([0-9,\.]+)",
            "currency": r"(?:Currency)[:\s]+([A-Z]{3})",
            # Additional fields (optional)
            "sector": r"(?:Sector|Industry)[:\s]+([A-Za-z\s\&]+)",
            "risk_rating": r"(?:Risk rating|Risk)[:\s]+([A-Za-z0-9\-]+)",
            "maturity_date": r"(?:Maturity date|Maturity)[:\s]+([0-9]{1,2}[\/\-\.][0-9]{1,2}[\/\-\.][0-9]{2,4})",
            "yield_percentage": r"(?:Yield|Yield %)[:\s]+([0-9\.]+)[%]?"
        }
        
        # This is a simplified approach - for real documents, you might need 
        # more sophisticated parsing based on document structure
        
        # Approach 1: Try to find individual entries (assuming one entry per line or section)
        sections = re.split(r'\n\s*\n', self.raw_text)
        
        for section in sections:
            if len(section.strip()) < 10:  # Skip very short sections
                continue
                
            data_dict = {}
            
            # Extract each field from the section
            for field, pattern in patterns.items():
                match = re.search(pattern, section)
                if match:
                    data_dict[field] = match.group(1).strip()
                    
            # Only add if we found at least one field
            if data_dict:
                # Check if this could be a valid entry
                if any(key in data_dict for key in CONFIG["extraction"]["mandatory_fields"]):
                    data_dicts.append(data_dict)
        
        # If no data found with the section approach, try whole document approach
        if not data_dicts:
            data_dict = {}
            for field, pattern in patterns.items():
                matches = re.findall(pattern, self.raw_text)
                if matches:
                    data_dict[field] = matches[0].strip()
            
            if data_dict:
                data_dicts.append(data_dict)
                
        self.extracted_data = data_dicts
        return data_dicts


class DataProcessor:
    """Process and format the extracted data"""
    
    def __init__(self, data_list):
        self.raw_data = data_list
        self.processed_data = []
        self.extraction_stats = {}
        
    def format_data(self):
        """Format all data according to US standards"""
        for item in self.raw_data:
            processed_item = {}
            
            # Process date fields (MM/DD/YYYY)
            for field in ['as_of_date', 'maturity_date']:
                if field in item:
                    try:
                        # Try different date formats
                        for fmt in ['%d/%m/%Y', '%d-%m-%Y', '%d.%m.%Y', '%m/%d/%Y', '%Y-%m-%d']:
                            try:
                                date_obj = datetime.strptime(item[field], fmt)
                                processed_item[field] = date_obj.strftime('%m/%d/%Y')
                                break
                            except ValueError:
                                continue
                    except:
                        # If we can't parse it, keep the original
                        processed_item[field] = item[field]
            
            # Process currency fields (USD format with 2 decimal places)
            for field in ['investment_in_original', 'investment_in', 'investment_in_prior']:
                if field in item:
                    try:
                        # Remove any non-numeric characters except decimal point
                        value = re.sub(r'[^\d.]', '', item[field])
                        # Format as currency with 2 decimal places
                        processed_item[field] = "{:.2f}".format(float(value))
                    except:
                        processed_item[field] = item[field]
            
            # Process yield percentage
            if 'yield_percentage' in item:
                try:
                    value = re.sub(r'[^\d.]', '', item['yield_percentage'])
                    processed_item['yield_percentage'] = "{:.2f}".format(float(value))
                except:
                    processed_item['yield_percentage'] = item['yield_percentage']
            
            # Copy other fields as is
            for field in ['original_security_name', 'currency', 'sector', 'risk_rating']:
                if field in item:
                    processed_item[field] = item[field]
            
            self.processed_data.append(processed_item)
        
        return self.processed_data
    
    def calculate_statistics(self):
        """Calculate extraction statistics"""
        total_records = len(self.processed_data)
        if total_records == 0:
            self.extraction_stats = {
                "total_records": 0,
                "mandatory_fields_extracted": 0,
                "mandatory_fields_percentage": 0,
                "extraction_accuracy": 0,
                "missing_fields": CONFIG["extraction"]["mandatory_fields"],
                "inconsistent_data": []
            }
            return self.extraction_stats
        
        # Count mandatory fields
        mandatory_fields = CONFIG["extraction"]["mandatory_fields"]
        mandatory_field_counts = {field: 0 for field in mandatory_fields}
        
        for record in self.processed_data:
            for field in mandatory_fields:
                if field in record and record[field]:
                    mandatory_field_counts[field] += 1
        
        # Calculate percentages
        total_mandatory_fields = len(mandatory_fields) * total_records
        extracted_mandatory_fields = sum(mandatory_field_counts.values())
        
        mandatory_fields_percentage = (extracted_mandatory_fields / total_mandatory_fields) * 100 if total_mandatory_fields > 0 else 0
        
        # Identify missing and inconsistent data
        missing_fields = []
        for field, count in mandatory_field_counts.items():
            if count < total_records:
                missing_fields.append(f"{field} ({total_records - count} missing)")
        
        # Check for inconsistent data
        inconsistent_data = []
        
        # Example check: Verify if currencies are consistent
        currencies = set()
        for record in self.processed_data:
            if 'currency' in record and record['currency']:
                currencies.add(record['currency'])
        
        if len(currencies) > 1:
            inconsistent_data.append(f"Multiple currencies detected: {', '.join(currencies)}")
        
        # Calculate overall extraction accuracy (simplified)
        extraction_accuracy = mandatory_fields_percentage  # This can be refined with additional logic
        
        self.extraction_stats = {
            "total_records": total_records,
            "mandatory_fields_extracted": extracted_mandatory_fields,
            "mandatory_fields_percentage": mandatory_fields_percentage,
            "extraction_accuracy": extraction_accuracy,
            "missing_fields": missing_fields,
            "inconsistent_data": inconsistent_data
        }
        
        return self.extraction_stats


class DataStorage:
    """Store processed data in database and Excel file"""
    
    def __init__(self, processed_data, stats):
        self.data = processed_data
        self.stats = stats
        self.db_config = CONFIG["database"]
        self.excel_file = CONFIG["output"]["excel_file"]
        
    def create_dataframe(self):
        """Convert processed data to pandas DataFrame"""
        return pd.DataFrame(self.data)
    
    def store_in_database(self):
        """Store data in SQL database"""
        try:
            # Create database connection
            if self.db_config["type"] == "postgresql":
                connection_string = f"postgresql://{self.db_config['user']}:{self.db_config['password']}@{self.db_config['host']}:{self.db_config['port']}/{self.db_config['database']}"
            else:  # MySQL
                connection_string = f"mysql+pymysql://{self.db_config['user']}:{self.db_config['password']}@{self.db_config['host']}:{self.db_config['port']}/{self.db_config['database']}"
            
            engine = create_engine(connection_string)
            
            # Convert data to DataFrame
            df = self.create_dataframe()
            
            # Store in database
            df.to_sql('financial_data', engine, if_exists='replace', index=False)
            
            return True
        except Exception as e:
            print(f"Database storage error: {e}")
            return False
    
    def store_in_excel(self):
        """Store data in Excel file with two sheets"""
        try:
            # Create a Pandas Excel writer
            writer = pd.ExcelWriter(self.excel_file, engine='openpyxl')
            
            # Convert data to DataFrame
            df = self.create_dataframe()
            
            # Write data to "Extracted Data" sheet
            df.to_excel(writer, sheet_name='Extracted Data', index=False)
            
            # Create statistics DataFrame
            stats_data = {
                "Metric": [
                    "Total Records Processed",
                    "Mandatory Fields Extracted",
                    "Mandatory Fields Percentage",
                    "Extraction Accuracy",
                    "Missing Fields",
                    "Inconsistent Data"
                ],
                "Value": [
                    self.stats["total_records"],
                    self.stats["mandatory_fields_extracted"],
                    f"{self.stats['mandatory_fields_percentage']:.2f}%",
                    f"{self.stats['extraction_accuracy']:.2f}%",
                    ", ".join(self.stats["missing_fields"]) if self.stats["missing_fields"] else "None",
                    ", ".join(self.stats["inconsistent_data"]) if self.stats["inconsistent_data"] else "None"
                ]
            }
            
            stats_df = pd.DataFrame(stats_data)
            
            # Write statistics to "Statistics" sheet
            stats_df.to_excel(writer, sheet_name='Statistics', index=False)
            
            # Save the Excel file
            writer.close()
            
            return True
        except Exception as e:
            print(f"Excel storage error: {e}")
            return False


def main(file_path):
    """Main function to orchestrate the extraction, processing and storage"""
    # Step 1: Extract data from document
    print(f"Extracting data from {file_path}...")
    extractor = DocumentExtractor(file_path)
    raw_data = extractor.extract_data()
    
    if not raw_data:
        print("No data was extracted from the document.")
        return False
    
    print(f"Extracted {len(raw_data)} records.")
    
    # Step 2: Process and format data
    print("Processing and formatting data...")
    processor = DataProcessor(raw_data)
    processed_data = processor.format_data()
    stats = processor.calculate_statistics()
    
    # Step 3: Store data
    print("Storing data...")
    storage = DataStorage(processed_data, stats)
    
    # Attempt to store in database
    db_result = storage.store_in_database()
    if db_result:
        print(f"Data successfully stored in {CONFIG['database']['type']} database.")
    else:
        print(f"Failed to store data in database. Check your database connection settings.")
    
    # Store in Excel
    excel_result = storage.store_in_excel()
    if excel_result:
        print(f"Data successfully stored in Excel file: {CONFIG['output']['excel_file']}")
    else:
        print("Failed to store data in Excel file.")
    
    # Step 4: Print statistics
    print("\nExtraction Statistics:")
    print(f"Total Records: {stats['total_records']}")
    print(f"Mandatory Fields Extracted: {stats['mandatory_fields_extracted']}")
    print(f"Mandatory Fields Percentage: {stats['mandatory_fields_percentage']:.2f}%")
    print(f"Extraction Accuracy: {stats['extraction_accuracy']:.2f}%")
    
    if stats['missing_fields']:
        print(f"Missing Fields: {', '.join(stats['missing_fields'])}")
    else:
        print("Missing Fields: None")
    
    if stats['inconsistent_data']:
        print(f"Inconsistent Data: {', '.join(stats['inconsistent_data'])}")
    else:
        print("Inconsistent Data: None")
    
    return True


if __name__ == "__main__":
    # Example usage
    file_path = "example_financial_document.docx"  # Replace with your document path
    main(file_path)

Extracting data from example_financial_document.docx...


FileNotFoundError: [Errno 2] No such file or directory: 'example_financial_document.docx'

In [None]:
def create_sample_document():
    """
    Create a sample financial document for testing.
    Returns the path to the created document.
    """
    sample_text = """# QUARTERLY INVESTMENT REPORT
## Confidential Financial Document

**As of date:** 03/31/2024

### PORTFOLIO SUMMARY

| Security Name | Investment Amount | Prior Investment | Change (%) |
|---------------|------------------|-----------------|-----------|
| Global Tech Fund | $475,250.00 | $425,800.00 | +11.61% |
| Emerging Markets ETF | $180,500.75 | $194,325.00 | -7.11% |
| US Treasury Bond 2026 | $250,000.00 | $250,000.00 | 0.00% |

### DETAILED INVESTMENTS

#### INVESTMENT 1
**Original security name:** Global Technology Fund Class A  
**Investment in (original):** 400,000.00  
**Investment in:** 475,250.00  
**Investment in (prior):** 425,800.00  
**Currency:** USD  
**Sector:** Technology  
**Risk rating:** Moderate  
**Maturity date:** N/A  
**Yield percentage:** 2.45%  

#### INVESTMENT 2
**Original security name:** Emerging Markets ETF  
**Investment in (original):** 200,000.00  
**Investment in:** 180,500.75  
**Investment in (prior):** 194,325.00  
**Currency:** USD  
**Sector:** International  
**Risk rating:** High  
**Maturity date:** N/A  
**Yield percentage:** 3.85%  

#### INVESTMENT 3
**Original security name:** US Treasury Bond 2026  
**Investment in (original):** 250,000.00  
**Investment in:** 250,000.00  
**Investment in (prior):** 250,000.00  
**Currency:** USD  
**Sector:** Government  
**Risk rating:** Low  
**Maturity date:** 06/15/2026  
**Yield percentage:** 4.25%  
"""
    
    # Create a sample text file
    sample_file_path = "sample_financial_document.txt"
    with open(sample_file_path, 'w') as f:
        f.write(sample_text)
    
    return sample_file_path

# Create the sample document
sample_path = create_sample_document()
print(f"Created sample document: {sample_path}")

In [None]:
# Test extraction with the sample document
extractor = DocumentExtractor(sample_path)
extracted_data = extractor.extract_data()

print("\n--- Extracted Raw Data ---")
for i, item in enumerate(extracted_data):
    print(f"\nRecord {i+1}:")
    for key, value in item.items():
        print(f"  {key}: {value}")

In [None]:
# Process the extracted data
processor = DataProcessor(extracted_data)
processed_data = processor.format_data()
stats = processor.calculate_statistics()

print("\n--- Processed Data ---")
for i, item in enumerate(processed_data):
    print(f"\nRecord {i+1}:")
    for key, value in item.items():
        print(f"  {key}: {value}")

print("\n--- Extraction Statistics ---")
print(f"Total Records: {stats['total_records']}")
print(f"Mandatory Fields Extracted: {stats['mandatory_fields_extracted']}")
print(f"Mandatory Fields Percentage: {stats['mandatory_fields_percentage']:.2f}%")
print(f"Extraction Accuracy: {stats['extraction_accuracy']:.2f}%")

if stats['missing_fields']:
    print(f"Missing Fields: {', '.join(stats['missing_fields'])}")
else:
    print("Missing Fields: None")

if stats['inconsistent_data']:
    print(f"Inconsistent Data: {', '.join(stats['inconsistent_data'])}")
else:
    print("Inconsistent Data: None")

In [None]:
# Store data in Excel
storage = DataStorage(processed_data, stats)
excel_result = storage.store_in_excel()

if excel_result:
    print(f"\nData successfully stored in Excel file: {CONFIG['output']['excel_file']}")
    # Read and display Excel file contents to verify
    try:
        df = pd.read_excel(CONFIG['output']['excel_file'], sheet_name='Extracted Data')
        print("\n--- Excel Data Preview ---")
        display(df)  # Jupyter will nicely format this
        
        stats_df = pd.read_excel(CONFIG['output']['excel_file'], sheet_name='Statistics')
        print("\n--- Excel Statistics Preview ---")
        display(stats_df)  # Jupyter will nicely format this
    except Exception as e:
        print(f"Error reading Excel file: {e}")
else:
    print("Failed to store data in Excel file.")

In [None]:
# Store data in database
try:
    db_result = storage.store_in_database()
    if db_result:
        print(f"Data successfully stored in {CONFIG['database']['type']} database.")
        
        # Verify data in database
        connection_string = f"postgresql://{CONFIG['database']['user']}:{CONFIG['database']['password']}@{CONFIG['database']['host']}:{CONFIG['database']['port']}/{CONFIG['database']['database']}"
        engine = create_engine(connection_string)
        
        # Query the database to verify data was stored
        query = "SELECT * FROM financial_data"
        db_data = pd.read_sql(query, engine)
        
        print("\n--- Database Data Preview ---")
        display(db_data)
    else:
        print(f"Failed to store data in database.")
except Exception as e:
    print(f"Database error: {e}")