# Financial Metrics Calculation (Ground Truth)

Input:
- Clean pandas DataFrame from file_parser

Output:
- Dictionary of financial metrics

Rules:
- Deterministic math only
- No AI
- No risk judgement yet



In [1]:
import pandas as pd
from typing import Dict

In [2]:
REQUIRED_COLUMNS = {
    "date",
    "category",
    "description",
    "amount",
    "type"   # credit / debit
}


In [3]:
def validate_dataframe(df: pd.DataFrame):
    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")


In [4]:
REQUIRED_COLUMNS = {"date", "category", "amount", "type"}

In [5]:
def validate_dataframe(df: pd.DataFrame):
    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

In [6]:
def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    df["amount"] = pd.to_numeric(df["amount"], errors="coerce").fillna(0)
    df["type"] = df["type"].str.lower().str.strip()
    df["category"] = df["category"].str.strip()

    return df


In [7]:
def calculate_total_revenue(df: pd.DataFrame) -> float:
    revenue_df = df[df["type"] == "credit"]
    return revenue_df["amount"].sum()


In [8]:
def calculate_total_expenses(df: pd.DataFrame) -> float:
    expense_df = df[df["type"] == "debit"]
    return expense_df["amount"].sum()


In [9]:
def calculate_net_cashflow(df: pd.DataFrame) -> float:
    revenue = calculate_total_revenue(df)
    expenses = calculate_total_expenses(df)
    return revenue - expenses


In [10]:
def category_breakdown(df: pd.DataFrame) -> Dict[str, float]:
    grouped = df.groupby("category")["amount"].sum()
    return grouped.to_dict()


In [11]:
def monthly_cashflow(df: pd.DataFrame) -> Dict[str, float]:
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["month"] = df["date"].dt.to_period("M").astype(str)

    monthly = df.groupby("month").apply(
        lambda x: x[x["type"] == "credit"]["amount"].sum()
                - x[x["type"] == "debit"]["amount"].sum()
    )

    return monthly.to_dict()


In [12]:
def compute_financial_metrics(df: pd.DataFrame) -> Dict:
    validate_dataframe(df)
    df = normalize_dataframe(df)

    metrics = {
        "total_revenue": calculate_total_revenue(df),
        "total_expenses": calculate_total_expenses(df),
        "net_cashflow": calculate_net_cashflow(df),
        "category_breakdown": category_breakdown(df),
        "monthly_cashflow": monthly_cashflow(df)
    }

    return metrics


In [13]:
#test code 
import os
def parse_csv(file_path: str) -> pd.DataFrame:
    """
    Parse CSV financial data into a DataFrame
    """
    try:
        df = pd.read_csv(file_path)
        return df
    except Exception as e:
        raise ValueError(f"CSV parsing failed: {e}")
def parse_file(file_path: str) -> pd.DataFrame:
    """
    Detect file type and parse accordingly
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError("File does not exist")

    extension = file_path.split(".")[-1].lower()

    if extension == "csv":
        return parse_csv(file_path)


    else:
        raise ValueError("Unsupported file format")

# Using your already tested CSV
df = parse_file(r"E:\financial-health-ai\notebook\sample_finance.csv")

metrics = compute_financial_metrics(df)
metrics


{'total_revenue': np.float64(9000.0),
 'total_expenses': np.float64(1500.0),
 'net_cashflow': np.float64(7500.0),
 'category_breakdown': {'Expenses': 1500.0,
  'Receivables': 1500.0,
  'Revenue': 7500.0},
 'monthly_cashflow': {'2024-01': 7500.0}}

# STEP 3: Financial Risk Engine (Rule-Based)

Purpose:
- Convert calculated financial metrics into understandable risk signals
- Evaluate credit risk, cash flow risk, and financial stability

Key Principles:
- 100% deterministic logic
- No AI, no guessing, no hallucination
- Transparent rules (judge-friendly)
- Same logic works across industries

Input:
- Output dictionary from compute_financial_metrics()

Output:
- Risk levels + explanations (machine-readable)


In [14]:
RISK_THRESHOLDS = {
    "profit_margin": {
        "high": 0.20,
        "medium": 0.10
    },
    "cashflow": {
        "negative": 0
    },
    "expense_ratio": {
        "high": 0.80,
        "medium": 0.60
    }
}


In [15]:
#Profitability Risk Assessment
def assess_profitability_risk(total_revenue: float, total_expenses: float):
    if total_revenue == 0:
        return {
            "level": "High",
            "reason": "No revenue recorded"
        }

    profit_margin = (total_revenue - total_expenses) / total_revenue

    if profit_margin < RISK_THRESHOLDS["profit_margin"]["medium"]:
        level = "High"
    elif profit_margin < RISK_THRESHOLDS["profit_margin"]["high"]:
        level = "Medium"
    else:
        level = "Low"

    return {
        "level": level,
        "profit_margin": round(profit_margin, 2),
        "reason": f"Profit margin at {round(profit_margin * 100, 1)}%"
    }


In [16]:
#Cash Flow Risk Assessment
def assess_cashflow_risk(net_cashflow: float):
    if net_cashflow < RISK_THRESHOLDS["cashflow"]["negative"]:
        return {
            "level": "High",
            "reason": "Negative cash flow"
        }
    elif net_cashflow == 0:
        return {
            "level": "Medium",
            "reason": "Break-even cash flow"
        }
    else:
        return {
            "level": "Low",
            "reason": "Positive cash flow"
        }


In [17]:
#Expense Load Risk
def assess_expense_risk(total_revenue: float, total_expenses: float):
    if total_revenue == 0:
        return {
            "level": "High",
            "reason": "Expenses without revenue"
        }

    expense_ratio = total_expenses / total_revenue

    if expense_ratio > RISK_THRESHOLDS["expense_ratio"]["high"]:
        level = "High"
    elif expense_ratio > RISK_THRESHOLDS["expense_ratio"]["medium"]:
        level = "Medium"
    else:
        level = "Low"

    return {
        "level": level,
        "expense_ratio": round(expense_ratio, 2),
        "reason": f"Expenses are {round(expense_ratio * 100, 1)}% of revenue"
    }


In [18]:
def aggregate_risk_levels(risks: dict):
    levels = [r["level"] for r in risks.values()]

    if "High" in levels:
        return "High"
    elif "Medium" in levels:
        return "Medium"
    else:
        return "Low"


In [19]:
def evaluate_financial_risk(metrics: dict) -> dict:
    profitability_risk = assess_profitability_risk(
        metrics["total_revenue"],
        metrics["total_expenses"]
    )

    cashflow_risk = assess_cashflow_risk(
        metrics["net_cashflow"]
    )

    expense_risk = assess_expense_risk(
        metrics["total_revenue"],
        metrics["total_expenses"]
    )

    risks = {
        "profitability": profitability_risk,
        "cashflow": cashflow_risk,
        "expense_load": expense_risk
    }

    overall_risk = aggregate_risk_levels(risks)

    return {
        "overall_risk": overall_risk,
        "risk_breakdown": risks
    }


In [20]:
#test code 
# Step 1: Parse file
df = parse_file(r"E:\financial-health-ai\notebook\sample_finance.csv")

# Step 2: Compute metrics
metrics = compute_financial_metrics(df)

# Step 3: Evaluate risk
risk_report = evaluate_financial_risk(metrics)

risk_report


{'overall_risk': 'Low',
 'risk_breakdown': {'profitability': {'level': 'Low',
   'profit_margin': np.float64(0.83),
   'reason': 'Profit margin at 83.3%'},
  'cashflow': {'level': 'Low', 'reason': 'Positive cash flow'},
  'expense_load': {'level': 'Low',
   'expense_ratio': np.float64(0.17),
   'reason': 'Expenses are 16.7% of revenue'}}}

## STEP 4: AI Financial Analyst (Narrative & Recommendations)

In this step, we introduce AI in a **controlled and safe way**.

Key principles:
- AI does NOT compute numbers
- AI does NOT change data
- AI only explains, summarizes, and recommends
- Deterministic logic (metrics + risk engine) remains the source of truth

Input to AI:
- Computed financial metrics
- Detected risk flags
- High-level business context

Output from AI:
- Plain-English financial summary
- Risk explanations
- Actionable recommendations for SME owners


In [21]:
import os
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain.messages import SystemMessage, HumanMessage



In [37]:
print(f"DEBUG: Metrics sent: {metrics}")
print(f"DEBUG: Risks sent: {risk_report}")

DEBUG: Metrics sent: {'total_revenue': np.float64(9000.0), 'total_expenses': np.float64(1500.0), 'net_cashflow': np.float64(7500.0), 'category_breakdown': {'Expenses': 1500.0, 'Receivables': 1500.0, 'Revenue': 7500.0}, 'monthly_cashflow': {'2024-01': 7500.0}}
DEBUG: Risks sent: {'overall_risk': 'Low', 'risk_breakdown': {'profitability': {'level': 'Low', 'profit_margin': np.float64(0.83), 'reason': 'Profit margin at 83.3%'}, 'cashflow': {'level': 'Low', 'reason': 'Positive cash flow'}, 'expense_load': {'level': 'Low', 'expense_ratio': np.float64(0.17), 'reason': 'Expenses are 16.7% of revenue'}}}


In [None]:
# Load environment variables
load_dotenv()

# Initialize chat model (choose ONE provider you actually use)
model = init_chat_model(
    "google_genai:gemini-flash-latest",
    api_key=os.getenv("GOOGLE_API_KEY"),
    temperature=0.3,
    max_tokens=800
)
#response=model.invoke ("capital of jharkhand")
#print (response.content)

The capital of Jharkhand is **Ranchi**.


In [38]:
def extract_risk_flags(risk_report: dict) -> list[str]:
    flags = []
    for area, details in risk_report["risk_breakdown"].items():
        flags.append(f"{area.capitalize()} risk: {details['level']} ({details['reason']})")
    return flags


In [39]:
def normalize_metrics(metrics: dict) -> dict:
    clean = {}
    for k, v in metrics.items():
        if hasattr(v, "item"):
            clean[k] = float(v)
        else:
            clean[k] = v
    return clean


In [30]:
def build_ai_context(metrics: dict, risks: list[str]) -> str:
    """
    Convert deterministic outputs into a clean AI-readable context.
    """
    lines = ["FINANCIAL METRICS:"]
    for k, v in metrics.items():
        lines.append(f"- {k}: {v}")

    lines.append("\nDETECTED RISK FLAGS:")
    if risks:
        for r in risks:
            lines.append(f"- {r}")
    else:
        lines.append("- No major risks detected")

    return "\n".join(lines)


In [41]:
def generate_ai_financial_summary(metrics: dict, risks: list[str]) -> str:
    context = build_ai_context(metrics, risks)

    messages = [
        SystemMessage(
            content=(
                "You are a senior financial analyst for small and medium enterprises in India. "
                "Explain financial health in simple language for non-finance business owners. "
                "Do not invent numbers. Base your response ONLY on provided data."
            )
        ),
        HumanMessage(
            content=f"""
Here is the financial snapshot of a business:

{context}

Please provide:
1. Overall financial health summary
2. Explanation of risks (if any)
3. Practical improvement suggestions
"""
        )
    ]

    response = model.invoke(messages)
        # ✅ SAFE extraction (this is the key fix)
    if hasattr(response, "content") and response.content:
        return str(response.content)

    return "⚠️ AI returned an empty response."


In [42]:
clean_metrics = normalize_metrics(metrics)
risk_flags = extract_risk_flags(risk_report)

ai_summary = generate_ai_financial_summary(clean_metrics, risk_flags)
print(ai_summary)


⚠️ AI returned an empty response.


In [49]:
import os
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain.messages import SystemMessage, HumanMessage

# -----------------------------
# Load environment variables
# -----------------------------
load_dotenv()

# -----------------------------
# Initialize Gemini model (REFINED)
# -----------------------------
# Using model_kwargs ensures 'thinking' and 'mime_type' are passed
# directly to the Google Generative AI SDK without being filtered by LangChain.
model = init_chat_model(
    "google_genai:gemini-2.5-flash",
    api_key=os.getenv("GOOGLE_API_KEY"),
     
    max_output_tokens=1000,

)

# -----------------------------
# Helpers
# -----------------------------
def extract_risk_flags(risk_report: dict) -> list[str]:
    return [
        f"{area.capitalize()} risk: {details['level']} ({details['reason']})"
        for area, details in risk_report["risk_breakdown"].items()
    ]


def normalize_metrics(metrics: dict) -> dict:
    return {
        k: float(v) if hasattr(v, "item") else v
        for k, v in metrics.items()
    }


def build_ai_context(metrics: dict, risks: list[str]) -> str:
    lines = ["FINANCIAL METRICS:"]
    for k, v in metrics.items():
        lines.append(f"- {k}: {v}")

    lines.append("\nDETECTED RISK FLAGS:")
    if risks:
        for r in risks:
            lines.append(f"- {r}")
    else:
        lines.append("- No major risks detected")

    return "\n".join(lines)

# -----------------------------
# Robust Response Extraction
# -----------------------------
def extract_text_from_response(response) -> str:
    """
    Handles standard strings, list of dicts, and
    the new 'thought' blocks in Gemini 2.5/3.
    """
    if not response or not hasattr(response, "content"):
        return ""

    # Case 1: Standard string
    if isinstance(response.content, str):
        return response.content.strip()

    # Case 2: List of content parts (Gemini standard)
    if isinstance(response.content, list):
        text_parts = []
        for part in response.content:
            if isinstance(part, dict):
                # Ignore hidden reasoning / thought blocks
                if part.get("type") == "text":
                    text_parts.append(part.get("text", ""))
            elif isinstance(part, str):
                text_parts.append(part)
        return "\n".join(text_parts).strip()

    return str(response.content).strip()

# -----------------------------
# AI Summary Generator
# -----------------------------
def generate_ai_financial_summary(metrics: dict, risks: list[str]) -> str:
    context = build_ai_context(metrics, risks)

    messages = [
        SystemMessage(
            content=(
                "You are a senior financial analyst for small and medium enterprises in India. "
                "Explain financial health clearly and simply for non-finance business owners. "
                "Base your response ONLY on the provided data. "
                "Do not mention internal reasoning. "
                "Write complete, well-structured paragraphs."
            )
        ),
        HumanMessage(
            content=(
                "Here is the financial snapshot of a business:\n\n"
                f"{context}\n\n"
                "Please provide:\n"
                "1. Overall financial health summary\n"
                "2. Explanation of risks (if any)\n"
                "3. Practical improvement suggestions"
            )
        ),
    ]

    response = model.invoke(messages)
    text = extract_text_from_response(response)

    if not text:
        return "❌ No readable text returned by Gemini."

    return text

# -----------------------------
# Usage
# -----------------------------
clean_metrics = normalize_metrics(metrics)
risk_flags = extract_risk_flags(risk_report)

ai_summary = generate_ai_financial_summary(clean_metrics, risk_flags)

print("\n===== AI FINANCIAL SUMMARY =====\n")
print(ai_summary)


ChatGoogleGenerativeAIError: Error calling model 'gemini-2.5-flash' (RESOURCE_EXHAUSTED): 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 20, model: gemini-2.5-flash\nPlease retry in 8.619250491s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-flash'}, 'quotaValue': '20'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '8s'}]}}