# File Parsing Module (Ground Truth)

This notebook handles:
- CSV parsing
- Excel parsing
- Text-based PDF parsing

Rules:
- NO AI here
- NO guessing
- NO financial logic
- Only convert files → structured tables

In [16]:
#generating sample csv to work with
import pandas as pd
import os
from fpdf import FPDF # You may need to pip install fpdf

# 1. Create directory
os.makedirs('data', exist_ok=True)

# 2. Define Sample Financial Data
data = [
    ["date", "category", "description", "amount", "type"],
    ["2024-01-01", "Revenue", "Product Sales", "5000.00", "credit"],
    ["2024-01-02", "Expenses", "Office Rent", "1200.00", "debit"],
    ["2024-01-05", "Revenue", "Consulting", "2500.00", "credit"],
    ["2024-01-10", "Expenses", "AWS Cloud", "300.00", "debit"],
    ["2024-01-15", "Receivables", "Invoice #104", "1500.00", "credit"]
]

# 3. Generate CSV
csv_path = r'E:\financial-health-ai\notebook\sample_finance.csv'
df = pd.DataFrame(data[1:], columns=data[0])
df.to_csv(csv_path, index=False)
print(f"✅ CSV Created: {csv_path}")

# 4. Generate PDF (formatted to match your parse_pdf logic)
# Your parser uses: columns = [c.strip() for c in line.split(",")]
pdf_path = r'E:\financial-health-ai\notebook\sample_finance.csv.pdf'
pdf = FPDF()
pdf.add_page()
pdf.set_font("Arial", size=10)

for row in data:
    # Joining with commas so your line.split(",") works
    line_text = ",".join(row)
    pdf.cell(200, 10, txt=line_text, ln=True)

pdf.output(pdf_path)
print(f"✅ PDF Created: {pdf_path}")

✅ CSV Created: E:\financial-health-ai\notebook\sample_finance.csv
✅ PDF Created: E:\financial-health-ai\notebook\sample_finance.csv.pdf


In [7]:
import pandas as pd
import numpy as np
from typing import Dict, Any


In [8]:
def parse_csv(file_path: str) -> pd.DataFrame:
    """
    Parse CSV financial data into a DataFrame
    """
    try:
        df = pd.read_csv(file_path)
        return df
    except Exception as e:
        raise ValueError(f"CSV parsing failed: {e}")


In [9]:
def parse_excel(file_path: str) -> pd.DataFrame:
    """
    Parse Excel financial data into a DataFrame
    """
    try:
        df = pd.read_excel(file_path)
        return df
    except Exception as e:
        raise ValueError(f"Excel parsing failed: {e}")


In [10]:
import pdfplumber


In [11]:
def parse_pdf(file_path: str) -> pd.DataFrame:
    """
    Parse text-based PDF into a DataFrame
    Assumes table-like structure
    """
    all_rows = []

    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if not text:
                    continue

                lines = text.split("\n")
                for line in lines:
                    # naive split (refined later)
                    columns = [c.strip() for c in line.split(",")]
                    all_rows.append(columns)

        df = pd.DataFrame(all_rows)
        return df

    except Exception as e:
        raise ValueError(f"PDF parsing failed: {e}")


In [12]:
def parse_file(file_path: str) -> pd.DataFrame:
    """
    Detect file type and parse accordingly
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError("File does not exist")

    extension = file_path.split(".")[-1].lower()

    if extension == "csv":
        return parse_csv(file_path)

    elif extension in ["xls", "xlsx"]:
        return parse_excel(file_path)

    elif extension == "pdf":
        return parse_pdf(file_path)

    else:
        raise ValueError("Unsupported file format")


In [17]:
def preview_data(df: pd.DataFrame, rows: int = 5):
    """
    Quick preview helper
    """
    return df.head(rows)


In [None]:
# Test CSV and PDF parsing
print("Testing CSV Parser...")
csv_df = parse_file(r"E:\financial-health-ai\notebook\sample_finance.csv")
print(csv_df.head())

# Test PDF
print("\nTesting PDF Parser...")
pdf_df = parse_file(r"E:\financial-health-ai\notebook\sample_finance.csv")

# Since PDF parsing results in a DataFrame with generic column names:
# Let's promote the first row to headers to match the CSV output
pdf_df.columns = pdf_df.iloc[0]
pdf_df = pdf_df[1:].reset_index(drop=True)

print(pdf_df.head())

Testing CSV Parser...
         date     category    description  amount    type
0  2024-01-01      Revenue  Product Sales  5000.0  credit
1  2024-01-02     Expenses    Office Rent  1200.0   debit
2  2024-01-05      Revenue     Consulting  2500.0  credit
3  2024-01-10     Expenses      AWS Cloud   300.0   debit
4  2024-01-15  Receivables   Invoice #104  1500.0  credit

Testing PDF Parser...
0  2024-01-01      Revenue Product Sales  5000.0  credit
0  2024-01-02     Expenses   Office Rent  1200.0   debit
1  2024-01-05      Revenue    Consulting  2500.0  credit
2  2024-01-10     Expenses     AWS Cloud   300.0   debit
3  2024-01-15  Receivables  Invoice #104  1500.0  credit
