## Isolation tests for the business-data → metric-pipeline extraction.

--- 

1. A direct Polars query over N companies and ALL metrics that simply prints the ground-truth business values as a DataFrame, with no pipeline logic.
2. A simple query call that runs the metric pipeline for a hypothetical BIG question about those 7-8 companies and ALL metrics, and generates the snapshot text.

In [1]:
from pathlib import Path
import sys
import logging

# Only WARNING and above from the root logger in this notebook.
logging.getLogger().setLevel(logging.WARNING)

# Search the working directory and each of its ancestors, nearest first,
# for a directory named "ModelPipeline".
current = Path.cwd()
root_matches = [folder for folder in (current, *current.parents) if folder.name == "ModelPipeline"]
if not root_matches:
    raise RuntimeError("Cannot find 'ModelPipeline' root in path tree")
model_root = root_matches[0]

# Prepend the root to sys.path unless an identical entry is already there.
root_entry = str(model_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

print(f"✓ Model root on sys.path: {model_root}")


✓ Model root on sys.path: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline


In [2]:

## =================================================================================================
##  SUPPLY LINE 1: ENTITY-RESULT CHAINING. DEMO. Query → Extract Entities → Validate/Embed → Get KPI Data → Format → Display
## =================================================================================================

from pathlib import Path
import sys
import polars as pl

# 1. Put ModelPipeline on sys.path
# Search the working directory and each of its ancestors, nearest first,
# for a directory named "ModelPipeline".
current = Path.cwd()
root_matches = [folder for folder in (current, *current.parents) if folder.name == "ModelPipeline"]
if not root_matches:
    raise RuntimeError("Cannot find 'ModelPipeline' root in path tree")
model_root = root_matches[0]

# Prepend the root to sys.path unless an identical entry is already there.
root_entry = str(model_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)
print(f"✓ Model root on sys.path: {model_root}")


# Polars display settings for the tables printed in this notebook.
# Maximum rows to display. NOTE(review): per the polars Config docs a negative
# value lifts the row cap, while None restores the default cap - confirm
# against the installed polars version.
pl.Config.set_tbl_rows(-1)  # -1 = show all rows (None is NOT "unlimited")

# Maximum columns to display (negative = show all columns, same convention)
pl.Config.set_tbl_cols(-1)

# Maximum width, in characters, of a string cell before it is truncated
pl.Config.set_fmt_str_lengths(1000)  # or higher like 5000

# Total table width in characters
pl.Config.set_tbl_width_chars(2000)

# Optional: number of decimal places shown for float columns
pl.Config.set_float_precision(3)


## SUPPLY LINE 1: ENTITY-RESULT CHAINING. DEMO. Query → Extract Entities → Validate/Embed → Get KPI Data → Format → Display
from finrag_ml_tg1.rag_modules_src.utilities.supply_line_formatters import format_analytical_compact
from finrag_ml_tg1.rag_modules_src.metric_pipeline.src.pipeline import MetricPipeline
from finrag_ml_tg1.rag_modules_src.entity_adapter.entity_adapter import EntityAdapter

# Input artefacts, all resolved relative to the ModelPipeline root.
METRIC_DATA_JSON = model_root / "finrag_ml_tg1/rag_modules_src/metric_pipeline/data/downloaded_data.json"
DIM_COMPANIES = model_root / "finrag_ml_tg1/data_cache/dimensions/finrag_dim_companies_21.parquet"
DIM_SECTIONS = model_root / "finrag_ml_tg1/data_cache/dimensions/finrag_dim_sec_sections.parquet"
METRIC_DATA_FACT = model_root / "finrag_ml_tg1/rag_modules_src/metric_pipeline/data/KPI_FACT_DATA_EDGAR.parquet"

# Read the KPI fact table a single time; the schema is taken from the loaded
# frame so the parquet file is not read again just to inspect column types.
metric_fact_data = pl.read_parquet(METRIC_DATA_FACT)
metric_fact_data_schema = metric_fact_data.schema


# Distinct tickers present in the fact table, sorted alphabetically.
dist_tickers = metric_fact_data.select(pl.col("ticker").unique()).sort("ticker")
# dist_companies = metric_fact_data.select(pl.col("company_name").unique()).sort("company_name")
# dist_companies = metric_fact_data.select(pl.col("company_name").unique()).sort("company_name")



# Tickers under inspection: MSFT, NVDA, GOOGL, COST, AMZN
target_companies = ["MSFT", "NVDA", "GOOGL", "COST", "AMZN"]


def _abbreviate(divisor, suffix):
    """Return "value" / divisor, rounded to 2 decimals, as text ending in suffix."""
    return (pl.col("value") / divisor).round(2).cast(pl.Utf8) + suffix


# The absolute magnitude of "value" selects the unit suffix (B, M, K or none).
_magnitude = pl.col("value").abs()
_value_readable = (
    pl.when(_magnitude >= 1e9)
    .then(_abbreviate(1e9, "B"))
    .when(_magnitude >= 1e6)
    .then(_abbreviate(1e6, "M"))
    .when(_magnitude >= 1e3)
    .then(_abbreviate(1e3, "K"))
    .otherwise(pl.col("value").round(2).cast(pl.Utf8))
    .alias("value_readable")
)

# Row scope: the target tickers, fiscal years 2018 through 2022.
_in_scope = pl.col("ticker").is_in(target_companies) & pl.col("year").is_between(2018, 2022)

# "metric_code" and "metric_gaap" are intentionally left out of the view.
metrics_view_readable = (
    metric_fact_data
    .filter(_in_scope)
    .select(["ticker", "metric_key", "metric_label", "year", "value"])
    .with_columns([_value_readable])
)

# metrics_view_readable

# metrics_view_readable


✓ Model root on sys.path: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline


In [3]:
# Distinct non-null values of the four metric identifier columns.
#   metric_columns_dict  -> column name -> sorted values joined with ", "
#   metric_value_counts  -> column name -> number of distinct non-null values
metric_columns_dict = {}
metric_value_counts = {}

for col in ['metric_key', 'metric_code', 'metric_gaap', 'metric_label']:
    # Unique values with nulls removed, sorted for stable display
    values = sorted(
        metric_fact_data
        .select(col)
        .unique()
        .filter(pl.col(col).is_not_null())  # Remove nulls
        .to_series()
        .to_list()
    )
    metric_columns_dict[col] = ', '.join(values)
    # Count the list itself: re-splitting the joined string on ', ' would
    # over-count any value that contains a comma followed by a space.
    metric_value_counts[col] = len(values)

# Print each column's values as a comma-separated string with its count
for col_name, values_str in metric_columns_dict.items():
    print(f"\n{'='*80}")
    print(f"{col_name.upper()}:")
    print(f"{'='*80}")
    print(values_str)
    print(f"\nCount: {metric_value_counts[col_name]} unique values")


METRIC_KEY:
amortization_year5, business_combination_liabilities, capex_intangibles, capex_net_productive, capex_productive, cfo, cfo_cont_ops, change_in_other_assets, debt_to_assets, debt_to_equity, deferred_taxes_other_current, derivative_liabilities_current, disposal_group_assets_current, disposal_group_gross_profit, disposal_group_liabilities_current, disposal_group_other_assets, disposal_group_other_liabilities, dta_capital_loss_cf, dta_deferred_income, dta_derivatives, dta_net, dta_nol, dta_nol_domestic, dta_nol_foreign, dta_nol_state_local, dta_oci_loss, dta_other, dta_ppe, dta_rnd, dta_unrealized_losses_afs, dtl_derivatives, dtl_goodwill_intangibles, dtl_investment_affiliates, dtl_ppe, dtl_undistributed_foreign, dtl_unrealized_gains_trading, employee_liabilities_current, employee_liabilities_total, equity, equity_incl_nci, equity_other, equity_other_shares, equity_stock_split, financial_asset_transfer, gross_profit, impairment_intangibles, impairment_long_lived_assets, intangi

In [5]:



# # MSFT NVDA GOOGLE COST AMZN 
# target_companies = ["MSFT", "NVDA", "GOOGL", "COST", "AMZN"]

# metrics_view_readable = (
#     metric_fact_data
#     .filter(
#         pl.col("ticker").is_in(target_companies) &
#         pl.col("year").is_between(2018, 2022)
#     )
#     .select(["ticker", "metric_key", "metric_label", "year", "value"])
#     .with_columns([
#         # Add human-readable formatted column
#         pl.when(pl.col("value").abs() >= 1e9)
#           .then((pl.col("value") / 1e9).round(2).cast(pl.Utf8) + "B")
#         .when(pl.col("value").abs() >= 1e6)
#           .then((pl.col("value") / 1e6).round(2).cast(pl.Utf8) + "M")
#         .when(pl.col("value").abs() >= 1e3)
#           .then((pl.col("value") / 1e3).round(2).cast(pl.Utf8) + "K")
#         .otherwise(pl.col("value").round(2).cast(pl.Utf8))
#         .alias("value_readable")
#     ])
# )

# # metrics_view_readable

# metric_fact_data

# dist_metrickey = metric_fact_data.select(pl.col("metric_key").unique()).sort("metric_key")
# dist_metrickey




In [5]:
"""

"total sales" → revenue
"bottom line profits" → net_income
"operating cash flows" → cfo (Operating Cash Flow)
"gross margins" → gross_profit_margin (ratio metric)
"total debt levels" → debt_to_assets or total liabilities
"shareholder equity" → equity
"cost of goods sold" → cogs (Cost of Revenue)
"tax expenses" → tax (Provision for Income Tax)
"return on assets" → roa
"earnings per share" → eps (needs mapping if available)

"""

'\n\n"total sales" → revenue\n"bottom line profits" → net_income\n"operating cash flows" → cfo (Operating Cash Flow)\n"gross margins" → gross_profit_margin (ratio metric)\n"total debt levels" → debt_to_assets or total liabilities\n"shareholder equity" → equity\n"cost of goods sold" → cogs (Cost of Revenue)\n"tax expenses" → tax (Provision for Income Tax)\n"return on assets" → roa\n"earnings per share" → eps (needs mapping if available)\n\n'

In [3]:
def _join_or_none(values):
    """Render values as a comma-separated string, or "(none)" when empty or None."""
    # str() each item so ints (years, numeric ids) never break str.join.
    return ", ".join(str(v) for v in (values or ())) or "(none)"


def debug_print_entities_and_metrics(query, entities, metric_result):
    """Print the query, the extracted entities and the metric-pipeline filters.

    Args:
        query: The query text; printed verbatim.
        entities: Object exposing ``companies.tickers``, ``years.years`` and
            ``sections``. An optional ``metrics.metrics`` iterable supplies
            metric candidates; each is shown by its ``metric_id``, else its
            ``metric_name``, else ``str(candidate)``.
        metric_result: Mapping describing the metric-pipeline outcome. When
            ``"success"`` is truthy, ``"filters"`` (tickers / years / metrics)
            and ``"count"`` are printed; otherwise ``"reason"`` and, when the
            key is present, ``"extracted_filters"``.

    Returns:
        None. Everything is written to stdout.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("QUERY")
    print(rule)
    print(query)

    # ---------- EntityAdapter result ----------
    print("\n[ENTITY EXTRACTION]")

    # Metric candidates are optional on the entities object.
    metric_ids = []
    metrics_holder = getattr(entities, "metrics", None)
    if metrics_holder and getattr(metrics_holder, "metrics", None):
        metric_ids = [
            getattr(m, "metric_id", None) or getattr(m, "metric_name", None) or str(m)
            for m in metrics_holder.metrics
        ]

    print(f"  Companies: {_join_or_none(entities.companies.tickers)}")
    print(f"  Years:     {_join_or_none(entities.years.years)}")
    print(f"  Sections:  {_join_or_none(entities.sections)}")
    print(f"  Metrics:   {_join_or_none(metric_ids)}")

    # ---------- MetricPipeline result ----------
    print("\n[METRIC PIPELINE FILTERS]")
    if metric_result.get("success"):
        # "or {}" also covers a "filters" key that is present but None.
        applied = metric_result.get("filters") or {}
        print(f"  Tickers: {_join_or_none(applied.get('tickers'))}")
        print(f"  Years:   {_join_or_none(applied.get('years'))}")
        print(f"  Metrics: {_join_or_none(applied.get('metrics'))}")
        print(f"  Records returned: {metric_result.get('count', 0)}")
    else:
        reason = metric_result.get("reason", "metric layer not activated")
        print(f"  Status: {reason}")
        if "extracted_filters" in metric_result:
            extracted = metric_result["extracted_filters"] or {}
            ef_tickers = _join_or_none(extracted.get("tickers"))
            ef_years = _join_or_none(extracted.get("years"))
            ef_metrics = _join_or_none(extracted.get("metrics"))
            print(f"  Extracted (pre-check): tickers={ef_tickers}, years={ef_years}, metrics={ef_metrics}")



## =================================================================================================
##  SUPPLY LINE 1: ENTITY-RESULT CHAINING. DEMO. Query → Extract Entities → Validate/Embed → Get KPI Data → Format → Display
""" query = "In the MD&A and Risk Factors sections, how did NVIDIA and Microsoft discuss their AI strategy, competitive positioning, and supply chain risks between 2020 and 2023?"
"""
## =================================================================================================

from pathlib import Path
import sys

# 1. Put ModelPipeline on sys.path
# Search the working directory and each of its ancestors, nearest first,
# for a directory named "ModelPipeline".
current = Path.cwd()
root_matches = [folder for folder in (current, *current.parents) if folder.name == "ModelPipeline"]
if not root_matches:
    raise RuntimeError("Cannot find 'ModelPipeline' root in path tree")
model_root = root_matches[0]

# Prepend the root to sys.path unless an identical entry is already there.
root_entry = str(model_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)
print(f"✓ Model root on sys.path: {model_root}")


## SUPPLY LINE 1: ENTITY-RESULT CHAINING. DEMO. Query → Extract Entities → Validate/Embed → Get KPI Data → Format → Display
from finrag_ml_tg1.rag_modules_src.utilities.supply_line_formatters import format_analytical_compact
from finrag_ml_tg1.rag_modules_src.metric_pipeline.src.pipeline import MetricPipeline
from finrag_ml_tg1.rag_modules_src.entity_adapter.entity_adapter import EntityAdapter

# Input artefacts, all resolved relative to the ModelPipeline root.
METRIC_DATA_JSON = model_root / "finrag_ml_tg1/rag_modules_src/metric_pipeline/data/downloaded_data.json"
DIM_COMPANIES = model_root / "finrag_ml_tg1/data_cache/dimensions/finrag_dim_companies_21.parquet"
DIM_SECTIONS = model_root / "finrag_ml_tg1/data_cache/dimensions/finrag_dim_sec_sections.parquet"
METRIC_DATA_FACT = model_root / "finrag_ml_tg1/rag_modules_src/metric_pipeline/data/KPI_FACT_DATA_EDGAR.parquet"

# Entity extraction uses the company and section dimension tables.
adapter = EntityAdapter(
    company_dim_path=DIM_COMPANIES,
    section_dim_path=DIM_SECTIONS,
)

# The metric pipeline takes its paths as plain strings.
metric_pipeline = MetricPipeline(
    data_path=str(METRIC_DATA_FACT),
    company_dim_path=str(DIM_COMPANIES),
)

# The extracted years / sections / companies and the metric result are dumped
# by debug_print_entities_and_metrics(query, entities, result) once both exist.

# One broad question: several companies, a five-year range, many metrics, two sections.
query = (
    "Show me Apple, Microsoft, Amazon, Alphabet, Google, and Tesla's financial performance "
    "from 2018 to 2022. I need their total sales, bottom line profits, "
    "operating cash flows, gross margins, total debt levels, shareholder equity, "
    "cost of goods sold, tax expenses, return on assets, and earnings per share. "
    "How did these companies explain their revenue growth and profitability trends "
    "in their MD&A sections, and what supply chain or competitive risks did they "
    "highlight in their Risk Factors?"
)

# Entity extraction and the metric pipeline each consume the raw query.
entities = adapter.extract(query)
result = metric_pipeline.process(query)

# Entity-side scope handed to the formatter alongside the metric result.
entity_meta = dict(
    companies=list(entities.companies.tickers),
    years=list(entities.years.years),
    sections=list(entities.sections),
)
compact = format_analytical_compact(result, entity_meta=entity_meta)

divider = "=" * 80

# Formatted KPI snapshot, or a placeholder when the formatter returned nothing.
print("\n" + divider)
print("SUPPLY LINE 1 – KPI SNAPSHOT")
print(divider)
if compact:
    print(compact)
else:
    print("(no KPI data returned)")


# Raw view of the pipeline result mapping.
print("\n" + divider)
print("DEBUG: RESULT OBJECT")
print(divider)
print(f"Success: {result.get('success')}")
print(f"Count: {result.get('count')}")
has_data = 'data' in result
print(f"Has 'data' key: {has_data}")
if has_data:
    records = result['data']
    print(f"Data length: {len(records)}")
    first_record = records[0] if records else 'EMPTY'
    print(f"First record: {first_record}")
print(divider)


# Entities next to the filters the metric pipeline actually applied.
debug_print_entities_and_metrics(query, entities, result)

✓ Model root on sys.path: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline
✓ FilterExtractor initialized with 21 companies
  Using: finrag_dim_companies_21.parquet
✓ FilterExtractor initialized with 21 companies
  Using: finrag_dim_companies_21.parquet
✓ Loaded 3,714 records from KPI_FACT_DATA_EDGAR.parquet (20 tickers, 2016-2025)

SUPPLY LINE 1 – KPI SNAPSHOT
══════════════════════════════════════════════════════════════════════
KPI SNAPSHOT - METRIC PIPELINE OUTPUT
══════════════════════════════════════════════════════════════════════

Scope:
  Companies (entities): AAPL, AMZN, GOOGL, MSFT, TSLA
  Years (entities):     2018, 2019, 2020, 2021, 2022
  Sections (entities):  ITEM_1A, ITEM_7
  Companies (metrics):  AAPL, AMZN, GOOGL, MSFT, TSLA
  Years (metrics):      2018, 2019, 2020, 2021, 2022
  Metrics:              Operating Cash Flow, Stockholders' Equity, Gross Profit, Debt-to-Assets, ROA % (Avg Assets), Revenue, Net Income
  Coverage:             1

In [6]:

## =================================================================================================
##  BUSINESS QUERY VALIDATION WITH SRC TABLE.
## =================================================================================================

from pathlib import Path
import sys
import polars as pl

# 1. Put ModelPipeline on sys.path
# Search the working directory and each of its ancestors, nearest first,
# for a directory named "ModelPipeline".
current = Path.cwd()
root_matches = [folder for folder in (current, *current.parents) if folder.name == "ModelPipeline"]
if not root_matches:
    raise RuntimeError("Cannot find 'ModelPipeline' root in path tree")
model_root = root_matches[0]

# Prepend the root to sys.path unless an identical entry is already there.
root_entry = str(model_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)
print(f"✓ Model root on sys.path: {model_root}")


# Polars display settings for the tables printed in this notebook.
# Maximum rows to display. NOTE(review): per the polars Config docs a negative
# value lifts the row cap, while None restores the default cap - confirm
# against the installed polars version.
pl.Config.set_tbl_rows(-1)  # -1 = show all rows (None is NOT "unlimited")

# Maximum columns to display (negative = show all columns, same convention)
pl.Config.set_tbl_cols(-1)

# Maximum width, in characters, of a string cell before it is truncated
pl.Config.set_fmt_str_lengths(1000)  # or higher like 5000

# Total table width in characters
pl.Config.set_tbl_width_chars(2000)

# Optional: number of decimal places shown for float columns
pl.Config.set_float_precision(3)


## SUPPLY LINE 1: ENTITY-RESULT CHAINING. DEMO. Query → Extract Entities → Validate/Embed → Get KPI Data → Format → Display
from finrag_ml_tg1.rag_modules_src.utilities.supply_line_formatters import format_analytical_compact
from finrag_ml_tg1.rag_modules_src.metric_pipeline.src.pipeline import MetricPipeline
from finrag_ml_tg1.rag_modules_src.entity_adapter.entity_adapter import EntityAdapter

# Input artefacts, all resolved relative to the ModelPipeline root.
METRIC_DATA_JSON = model_root / "finrag_ml_tg1/rag_modules_src/metric_pipeline/data/downloaded_data.json"
DIM_COMPANIES = model_root / "finrag_ml_tg1/data_cache/dimensions/finrag_dim_companies_21.parquet"
DIM_SECTIONS = model_root / "finrag_ml_tg1/data_cache/dimensions/finrag_dim_sec_sections.parquet"
METRIC_DATA_FACT = model_root / "finrag_ml_tg1/rag_modules_src/metric_pipeline/data/KPI_FACT_DATA_EDGAR.parquet"

# Read the KPI fact table a single time; the schema is taken from the loaded
# frame so the parquet file is not read again just to inspect column types.
metric_fact_data = pl.read_parquet(METRIC_DATA_FACT)
metric_fact_data_schema = metric_fact_data.schema


# Distinct tickers present in the fact table, sorted alphabetically.
dist_tickers = metric_fact_data.select(pl.col("ticker").unique()).sort("ticker")
# dist_companies = metric_fact_data.select(pl.col("company_name").unique()).sort("company_name")



# Tickers under inspection: MSFT, NVDA, GOOGL, TSLA, AAPL, AMZN
target_companies = ["MSFT", "NVDA", "GOOGL", "TSLA", "AAPL", "AMZN"]


def _abbreviate(divisor, suffix):
    """Return "value" / divisor, rounded to 2 decimals, as text ending in suffix."""
    return (pl.col("value") / divisor).round(2).cast(pl.Utf8) + suffix


# The absolute magnitude of "value" selects the unit suffix (B, M, K or none).
_magnitude = pl.col("value").abs()
_value_readable = (
    pl.when(_magnitude >= 1e9)
    .then(_abbreviate(1e9, "B"))
    .when(_magnitude >= 1e6)
    .then(_abbreviate(1e6, "M"))
    .when(_magnitude >= 1e3)
    .then(_abbreviate(1e3, "K"))
    .otherwise(pl.col("value").round(2).cast(pl.Utf8))
    .alias("value_readable")
)

# Row scope: the target tickers, fiscal years 2018 through 2022.
_in_scope = pl.col("ticker").is_in(target_companies) & pl.col("year").is_between(2018, 2022)

# Metric labels kept in the final view.
_labels_of_interest = [
    "Operating Cash Flow",
    "Gross Profit",
    "Debt-to-Assets"
]

# "metric_code" and "metric_gaap" are intentionally left out of the view.
metrics_view_readable = (
    metric_fact_data
    .filter(_in_scope)
    .select(["ticker", "metric_key", "metric_label", "year", "value"])
    .with_columns([_value_readable])
    .filter(pl.col("metric_label").is_in(_labels_of_interest))
)


"""

AAPL:
  2018: Operating Cash Flow=$77.4B, Gross Profit=$24.1B, Debt-to-Assets=$1, ROA % (Avg Assets)=$17, Revenue=$62.9B, Net Income=$14.1B
  2019: Operating Cash Flow=$69.4B, Gross Profit=$24.3B, Debt-to-Assets=$1, ROA % (Avg Assets)=$17, Net Income=$13.7B
  2020: Operating Cash Flow=$80.7B, Gross Profit=$24.7B, Debt-to-Assets=$1, ROA % (Avg Assets)=$17, Net Income=$12.7B
  2021: Operating Cash Flow=$104.0B, Gross Profit=$152.8B, Debt-to-Assets=$1, ROA % (Avg Assets)=$27, Net Income=$94.7B
  2022: Operating Cash Flow=$122.2B, Gross Profit=$170.8B, Debt-to-Assets=$1, ROA % (Avg Assets)=$28, Net Income=$99.8B

AMZN:
  2018: Operating Cash Flow=$30.7B, ROA % (Avg Assets)=$5, Net Income=$3.0B
  2019: Operating Cash Flow=$38.5B, ROA % (Avg Assets)=$4, Net Income=$3.3B
  2020: Operating Cash Flow=$66.1B, ROA % (Avg Assets)=$6, Net Income=$7.2B
  2021: Operating Cash Flow=$46.3B, ROA % (Avg Assets)=$8, Net Income=$33.4B
  2022: Operating Cash Flow=$46.8B, ROA % (Avg Assets)=$-1, Net Income=$-2.7B

GOOGL:
  2018: Operating Cash Flow=$48.0B, Debt-to-Assets=$0, ROA % (Avg Assets)=$12, Net Income=$30.7B
  2019: Operating Cash Flow=$54.5B, Debt-to-Assets=$0, ROA % (Avg Assets)=$12, Net Income=$34.3B
  2020: Operating Cash Flow=$65.1B, Debt-to-Assets=$0, ROA % (Avg Assets)=$12, Revenue=$182.5B, Net Income=$40.3B
  2021: Operating Cash Flow=$91.7B, Debt-to-Assets=$0, ROA % (Avg Assets)=$21, Revenue=$257.6B, Net Income=$76.0B
  2022: Operating Cash Flow=$91.5B, Debt-to-Assets=$0, ROA % (Avg Assets)=$16, Net Income=$60.0B

MSFT:
  2018: Operating Cash Flow=$43.9B, Gross Profit=$20.3B, Debt-to-Assets=$1, ROA % (Avg Assets)=$6, Net Income=$8.9B
  2019: Operating Cash Flow=$52.2B, Gross Profit=$23.3B, Debt-to-Assets=$1, ROA % (Avg Assets)=$13, Net Income=$13.2B
  2020: Operating Cash Flow=$60.7B, Gross Profit=$25.7B, Debt-to-Assets=$1, ROA % (Avg Assets)=$14, Net Income=$11.2B
  2021: Operating Cash Flow=$76.7B, Gross Profit=$115.9B, Debt-to-Assets=$1, ROA % (Avg Assets)=$18, Net Income=$61.3B
  2022: Operating Cash Flow=$89.0B, Gross Profit=$135.6B, Debt-to-Assets=$1, ROA % (Avg Assets)=$19, Net Income=$72.7B

TSLA:
  2018: Operating Cash Flow=$2.1B, Gross Profit=$1.4B, Revenue=$7.2B
  2019: Operating Cash Flow=$2.4B, Gross Profit=$1.4B, Revenue=$7.4B
  2020: Operating Cash Flow=$5.9B, Gross Profit=$2.1B, Debt-to-Assets=$1, ROA % (Avg Assets)=$1, Revenue=$10.7B
  2021: Operating Cash Flow=$11.5B, Gross Profit=$13.6B, Debt-to-Assets=$0, ROA % (Avg Assets)=$8, Revenue=$53.8B
  2022: Operating Cash Flow=$14.7B, Gross Profit=$20.9B, Debt-to-Assets=$0, ROA % (Avg Assets)=$13, Revenue=$81.5B
"""

metrics_view_readable



✓ Model root on sys.path: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline


ticker,metric_key,metric_label,year,value,value_readable
str,str,str,f64,f64,str
"""TSLA""","""cfo""","""Operating Cash Flow""",2018.0,2097802000.0,"""2.1B"""
"""TSLA""","""cfo""","""Operating Cash Flow""",2019.0,2405000000.0,"""2.41B"""
"""TSLA""","""cfo""","""Operating Cash Flow""",2020.0,5943000000.0,"""5.94B"""
"""TSLA""","""cfo""","""Operating Cash Flow""",2021.0,11497000000.0,"""11.5B"""
"""TSLA""","""cfo""","""Operating Cash Flow""",2022.0,14724000000.0,"""14.72B"""
"""TSLA""","""gross_profit""","""Gross Profit""",2018.0,1442900000.0,"""1.44B"""
"""TSLA""","""gross_profit""","""Gross Profit""",2019.0,1391000000.0,"""1.39B"""
"""TSLA""","""gross_profit""","""Gross Profit""",2020.0,2066000000.0,"""2.07B"""
"""TSLA""","""gross_profit""","""Gross Profit""",2021.0,13606000000.0,"""13.61B"""
"""TSLA""","""gross_profit""","""Gross Profit""",2022.0,20853000000.0,"""20.85B"""


## Full diagnostic: Quick

In [4]:
# Full diagnostic: compare the extractor's filters with what the fact table holds.
section_rule = "="*80
print(section_rule)
print("DIAGNOSTIC CHECK")
print(section_rule)


def _matching_rows(ticker, year, metric):
    """Count fact rows for one (ticker, year, metric_label) combination."""
    # The year column is f64 in the fact table, so compare against a float.
    return (
        metric_fact_data
        .filter(
            (pl.col("ticker") == ticker)
            & (pl.col("year") == float(year))
            & (pl.col("metric_label") == metric)
        )
        .height
    )


# 1. What did the extractor produce for this query?
filters = metric_pipeline.extractor.extract(query)
print("\n1. Extracted Filters:")
print(f"   Tickers: {filters['tickers']}")
print(f"   Years: {filters['years']}")
print(f"   Metrics: {filters['metrics']}")

# 2. Does each extracted (ticker, year, metric) combination exist in the parquet data?
print("\n2. Data Availability:")
for ticker in filters['tickers']:
    for year in filters['years']:
        for metric in filters['metrics']:
            count = _matching_rows(ticker, year, metric)
            if count > 0:
                print(f"   ✓ {ticker} | {year} | {metric}: {count} records")
            else:
                print(f"   ✗ {ticker} | {year} | {metric}: NOT FOUND")

# 3. Up to 20 distinct (ticker, year, metric_label) rows that ARE available
print("\n3. Sample Available Data:")
year_values = [float(y) for y in filters['years']]
sample = (
    metric_fact_data
    .filter(pl.col("ticker").is_in(filters['tickers']))
    .filter(pl.col("year").is_in(year_values))
    .select(["ticker", "year", "metric_label"])
    .unique()
    .head(20)
)
print(sample)

DIAGNOSTIC CHECK

1. Extracted Filters:
   Tickers: ['AAPL', 'AMZN', 'GOOGL', 'MSFT', 'TSLA']
   Years: [2018, 2019, 2020, 2021, 2022]
   Metrics: ["Stockholders' Equity", 'ROA % (Avg Assets)', 'Revenue', 'Net Income']

2. Data Availability:
   ✗ AAPL | 2018 | Stockholders' Equity: NOT FOUND
   ✓ AAPL | 2018 | ROA % (Avg Assets): 1 records
   ✓ AAPL | 2018 | Revenue: 1 records
   ✓ AAPL | 2018 | Net Income: 1 records
   ✗ AAPL | 2019 | Stockholders' Equity: NOT FOUND
   ✓ AAPL | 2019 | ROA % (Avg Assets): 1 records
   ✗ AAPL | 2019 | Revenue: NOT FOUND
   ✓ AAPL | 2019 | Net Income: 1 records
   ✗ AAPL | 2020 | Stockholders' Equity: NOT FOUND
   ✓ AAPL | 2020 | ROA % (Avg Assets): 1 records
   ✗ AAPL | 2020 | Revenue: NOT FOUND
   ✓ AAPL | 2020 | Net Income: 1 records
   ✗ AAPL | 2021 | Stockholders' Equity: NOT FOUND
   ✓ AAPL | 2021 | ROA % (Avg Assets): 1 records
   ✗ AAPL | 2021 | Revenue: NOT FOUND
   ✓ AAPL | 2021 | Net Income: 1 records
   ✗ AAPL | 2022 | Stockholders' Equity: N