# In [1]:
import re
import pandas as pd
from datetime import datetime
from docx import Document
from openpyxl import Workbook
import psycopg2
import pyodbc

# Labels of the fields this script handles. The same list drives three things:
# the labels searched for in the document text, the completeness statistics
# written to the workbook, and the positional order of the values inserted
# into the `investments` table -- so the order here must match the column
# order used by the storage functions.
MANDATORY_FIELDS = [
    'As of date',
    'Original security name',
    'Investment in (original)',
    'Investment in',
    'Investment in (prior)',
    'Currency'
]

def format_date(value):
    """Normalize a date string to ``MM/DD/YYYY``.

    The accepted input layouts are tried in this order: ``YYYY-MM-DD``,
    ``DD/MM/YYYY``, ``MM/DD/YYYY``. Because day-first is tried before
    month-first, an ambiguous value such as ``03/04/2024`` is read as
    3 April 2024 and returned as ``04/03/2024``.

    Args:
        value: The raw date text. Leading and trailing whitespace is ignored.

    Returns:
        The reformatted date string, or ``value`` unchanged when it is not a
        string or matches none of the accepted layouts.
    """
    if not isinstance(value, str):
        # Nothing to parse (e.g. None); hand it back untouched.
        return value

    text = value.strip()
    for fmt in ("%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y"):
        try:
            return datetime.strptime(text, fmt).strftime("%m/%d/%Y")
        except ValueError:
            # Layout did not match; try the next one.
            continue
    return value

def format_currency(value):
    """Format an amount as a dollar string such as ``$1,234.50``.

    For string input, every character other than digits, ``.`` and ``-`` is
    removed before the number is parsed, so currency symbols, codes and
    thousands separators are ignored. Numeric input (``int``/``float``) is
    formatted directly.

    Args:
        value: The raw amount, as text or as a number.

    Returns:
        The formatted string, or ``value`` unchanged when it is of an
        unsupported type or no number can be parsed from it.
    """
    # bool is an int subclass but is never a meaningful amount.
    if isinstance(value, bool) or not isinstance(value, (str, int, float)):
        return value

    try:
        if isinstance(value, str):
            amount = float(re.sub(r'[^\d.-]', '', value))
        else:
            amount = float(value)
        return f"${amount:,.2f}"
    except (ValueError, OverflowError):
        # Unparseable text (e.g. "N/A", "1.2.3") or an int too large for float.
        return value

def _field_pattern(field):
    """Compile the regex that captures the value following the label ``field``."""
    lowered = field.lower()
    # Suffixes of longer labels that start with this one, e.g. "(original)"
    # and "(prior)" for "Investment in". Without a guard the shorter label
    # would match inside the longer one and capture that line's value.
    longer_suffixes = [
        other[len(field):].strip()
        for other in MANDATORY_FIELDS
        if other != field and other.lower().startswith(lowered)
    ]
    alternatives = "|".join(re.escape(s) for s in longer_suffixes if s)
    guard = r"(?!\s*(?:" + alternatives + "))" if alternatives else ""
    return re.compile(
        re.escape(field) + guard + r"[:\s]+(.+?)(?:\n|$)",
        re.IGNORECASE,
    )


def extract_from_docx(file_path):
    """Extract the mandatory fields from a Word document.

    The text of ``doc.paragraphs`` is joined with newlines and, for each
    label in ``MANDATORY_FIELDS``, the first case-insensitive occurrence of
    the label followed by colons and/or whitespace is located; the rest of
    that line is taken as the value. A label is not matched where it is
    merely the prefix of a longer mandatory label.

    Args:
        file_path: Path of the ``.docx`` file to read.

    Returns:
        A dict keyed by every label in ``MANDATORY_FIELDS``. Missing fields
        map to ``None``; 'As of date' is passed through ``format_date`` and
        the three 'Investment in ...' fields through ``format_currency``.
    """
    doc = Document(file_path)
    raw_text = "\n".join([p.text for p in doc.paragraphs])

    extracted = {}
    for field in MANDATORY_FIELDS:
        match = _field_pattern(field).search(raw_text)
        extracted[field] = match.group(1).strip() if match else None

    if extracted['As of date']:
        extracted['As of date'] = format_date(extracted['As of date'])
    for k in ['Investment in (original)', 'Investment in', 'Investment in (prior)']:
        if extracted[k]:
            extracted[k] = format_currency(extracted[k])

    return extracted

def generate_excel(data_dict, output_path='output.xlsx'):
    """Write the extracted fields and a completeness summary to a workbook.

    Two sheets are produced: 'Extracted Data' holds ``data_dict`` as a single
    row, and 'Statistics' holds the number of mandatory fields, how many of
    them have a truthy value, that share as a percentage, and the labels of
    the ones that do not.

    Args:
        data_dict: Mapping of field label to extracted value.
        output_path: Destination ``.xlsx`` path.
    """
    # A field counts as missing when it is absent or its value is falsy.
    absent = [name for name in MANDATORY_FIELDS if not data_dict.get(name)]
    field_count = len(MANDATORY_FIELDS)
    found_count = field_count - len(absent)

    summary_frame = pd.DataFrame([{
        "Total Fields": field_count,
        "Fields Extracted": found_count,
        "Accuracy (%)": round((found_count / field_count) * 100, 2),
        "Missing Fields": ", ".join(absent),
    }])
    data_frame = pd.DataFrame([data_dict])

    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        for frame, sheet in (
            (data_frame, 'Extracted Data'),
            (summary_frame, 'Statistics'),
        ):
            frame.to_excel(writer, index=False, sheet_name=sheet)
    print(f"Excel written to {output_path}")

def store_in_postgresql(data_dict, db_config):
    """Insert one extracted record into the PostgreSQL ``investments`` table.

    The table is created with all-TEXT columns if it does not exist yet.
    Values are taken from ``data_dict`` in ``MANDATORY_FIELDS`` order and
    bound as query parameters; missing fields are stored as NULL.

    Args:
        data_dict: Mapping of field label to extracted value.
        db_config: Keyword arguments forwarded to ``psycopg2.connect``.
    """
    conn = psycopg2.connect(**db_config)
    try:
        cursor = conn.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS investments (
                as_of_date TEXT,
                original_security_name TEXT,
                investment_in_original TEXT,
                investment_in TEXT,
                investment_in_prior TEXT,
                currency TEXT
            );
        """)
        # Name the columns explicitly so the insert does not depend on the
        # physical column order of a pre-existing table.
        cursor.execute("""
            INSERT INTO investments (
                as_of_date,
                original_security_name,
                investment_in_original,
                investment_in,
                investment_in_prior,
                currency
            ) VALUES (%s, %s, %s, %s, %s, %s)
        """, tuple(data_dict.get(k) for k in MANDATORY_FIELDS))
        conn.commit()
    finally:
        # Always release the connection, even when a statement fails.
        conn.close()

def store_in_sqlserver(data_dict, conn_str):
    """Insert one extracted record into the SQL Server ``investments`` table.

    The table is created with NVARCHAR(255) columns if no user table of that
    name exists. Values are taken from ``data_dict`` in ``MANDATORY_FIELDS``
    order and bound as query parameters; missing fields are stored as NULL.

    Args:
        data_dict: Mapping of field label to extracted value.
        conn_str: ODBC connection string passed to ``pyodbc.connect``.
    """
    conn = pyodbc.connect(conn_str)
    try:
        cursor = conn.cursor()
        cursor.execute("""
            IF NOT EXISTS (SELECT * FROM sysobjects WHERE name='investments' and xtype='U')
            CREATE TABLE investments (
                as_of_date NVARCHAR(255),
                original_security_name NVARCHAR(255),
                investment_in_original NVARCHAR(255),
                investment_in NVARCHAR(255),
                investment_in_prior NVARCHAR(255),
                currency NVARCHAR(255)
            );
        """)
        # Name the columns explicitly so the insert does not depend on the
        # physical column order of a pre-existing table.
        cursor.execute("""
            INSERT INTO investments (
                as_of_date,
                original_security_name,
                investment_in_original,
                investment_in,
                investment_in_prior,
                currency
            ) VALUES (?, ?, ?, ?, ?, ?)
        """, tuple(data_dict.get(k) for k in MANDATORY_FIELDS))
        conn.commit()
    finally:
        # Always release the connection, even when a statement fails.
        conn.close()

# Main execution: read the assignment document, extract the mandatory fields,
# and write them plus the completeness statistics to the default workbook
# (output.xlsx in the current working directory).
if __name__ == '__main__':
    file_path = "AI&Automation Dev - Tech Assignment.docx"
    extracted_data = extract_from_docx(file_path)
    generate_excel(extracted_data)

    # Optional persistence: uncomment ONE of the calls below and replace the
    # placeholder host/database/credentials with real values.
    # store_in_postgresql(extracted_data, db_config={
    #     'host': 'localhost', 'dbname': 'your_db', 'user': 'your_user', 'password': 'your_pass'
    # })

    # store_in_sqlserver(extracted_data, conn_str='DRIVER={SQL Server};SERVER=localhost;DATABASE=your_db;UID=your_user;PWD=your_pass')


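# Notebook output when this cell was run:
#   ModuleNotFoundError: No module named 'docx'
# The `docx` module is provided by the `python-docx` package
# (install with `pip install python-docx`).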