In [1]:
import sys
from pathlib import Path

# Directory added to sys.path so that later cells can import the project's
# `src` package (presumably located directly under this directory — confirm).
# NOTE(review): this is a machine-specific absolute path; adjust it before
# running the notebook on another machine.
project_root = Path("/Users/priyamvadadaga/Agentics-EarningsCall/Agentics/applications/earnings_call_interpreter")

# Register the directory only once: re-running this cell must not keep
# appending duplicate entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [None]:
import pandas as pd
from src.utils.quarters import normalize_quarter

# Input tables, given relative to the notebook's working directory.
UNIVERSE_PATH = "../../data/universe/mapped_universe.parquet"
STATEMENTS_PATH = "../../data/processed/statements.parquet"

# Load the company universe and keep only the rows that carry a company id.
universe_df = (
    pd.read_parquet(UNIVERSE_PATH)
    .dropna(subset=["companyid"])
    .copy()
)

# Fail early with a readable message if the columns used below are missing.
if "ticker" not in universe_df.columns:
    raise ValueError("Expected 'ticker' column in mapped_universe.parquet")
if "name" not in universe_df.columns:
    raise ValueError("Expected 'name' column in mapped_universe.parquet")

# Upper-case the tickers and build a "<TICKER> – <name>" display label.
universe_df["ticker"] = universe_df["ticker"].astype(str).str.upper()
universe_df["label"] = universe_df["ticker"] + " – " + universe_df["name"]

# Lookup table: company id (as float) -> upper-cased ticker.
# If a company id appears more than once, the last row wins.
companyid_to_ticker = dict(
    zip(
        (float(cid) for cid in universe_df["companyid"]),
        (str(tkr).upper() for tkr in universe_df["ticker"]),
    )
)


def _to_quarter_str(call_period):
    """Convert one `call_period` value via normalize_quarter(None, None, call_period)."""
    return normalize_quarter(None, None, call_period)


# Load the statements and attach a ticker and a quarter string to each row.
statements_df = pd.read_parquet(STATEMENTS_PATH).copy()
statements_df["ticker"] = statements_df["company_id"].map(companyid_to_ticker)
statements_df["quarter_str"] = statements_df["call_period"].apply(_to_quarter_str)

# Rows with no ticker match or no quarter string cannot be indexed; drop them.
statements_df = statements_df.dropna(subset=["ticker", "quarter_str"])

# ticker -> distinct quarter strings, sorted in descending order.
AVAILABLE_QUARTERS = {
    ticker: sorted(set(quarters), reverse=True)
    for ticker, quarters in statements_df.groupby("ticker")["quarter_str"]
}

In [7]:
# Display the ticker -> quarter-strings mapping (each list is sorted in descending order).
AVAILABLE_QUARTERS

{'AAPL': ['2025Q4', '2025Q3', '2025Q2', '2025Q1'],
 'ACN': ['2025Q3', '2025Q2', '2025Q1'],
 'ADBE': ['2025Q3', '2025Q2', '2025Q1'],
 'AMAT': ['2025Q3', '2025Q2', '2025Q1'],
 'AMD': ['2025Q4', '2025Q3', '2025Q2', '2025Q1'],
 'ASML': ['2025Q4', '2025Q3', '2025Q2', '2025Q1'],
 'AVGO': ['2025Q3', '2025Q2', '2025Q1'],
 'CRM': ['2025Q3', '2025Q2', '2025Q1'],
 'CSCO': ['2025Q3', '2025Q2', '2025Q1'],
 'IBM': ['2025Q4', '2025Q3', '2025Q2', '2025Q1'],
 'INTU': ['2025Q3', '2025Q2', '2025Q1'],
 'MSFT': ['2025Q4', '2025Q3', '2025Q2', '2025Q1'],
 'NOW': ['2025Q4', '2025Q3', '2025Q2', '2025Q1'],
 'NVDA': ['2025Q3', '2025Q2', '2025Q1'],
 'ORCL': ['2025Q3', '2025Q2', '2025Q1'],
 'PLTR': ['2025Q4', '2025Q3', '2025Q2', '2025Q1'],
 'QCOM': ['2025Q4', '2025Q3', '2025Q2', '2025Q1'],
 'SAP': ['2025Q4', '2025Q3', '2025Q2', '2025Q1'],
 'TSM': ['2025Q4', '2025Q3', '2025Q2', '2025Q1'],
 'TXN': ['2025Q4', '2025Q3', '2025Q2', '2025Q1']}

In [None]:
from typing import List, Dict, Any
import pandas as pd

from src.rag.llm_as_a_judge import judge_answer_for_question

# Evaluation cases passed to run_eval. Each case provides "id", "question",
# "company", "filing_type" and "quarter"; run_eval also reads the optional
# "reference_answer" and "key_facts" keys when present.
# NOTE(review): the id says "q1" while the case targets 2025Q4 — confirm whether
# "q1" refers to the next-quarter guidance or is a leftover label.
EVAL_CASES: List[Dict[str, Any]] = [
    dict(
        id="aapl_q1_tone_guidance",
        question="In Q4 of 2025, did Apple sound more optimistic or cautious about the next quarter? Summarize their tone on guidance in 3–4 sentences.",
        company="AAPL",
        filing_type="Earnings Call",
        quarter="2025Q4",
    ),
]

def run_eval(cases: List[Dict[str, Any]]) -> pd.DataFrame:
    """Judge every evaluation case and collect the scores into one table.

    Args:
        cases: Case dicts. Each must provide "id", "question", "company",
            "filing_type" and "quarter"; "reference_answer" and "key_facts"
            are optional and are passed to the judge as None when absent.

    Returns:
        A DataFrame with one row per case and the columns id, company,
        quarter, overall, relevance, factual_correctness, groundedness,
        completeness, clarity and verdict (no columns when `cases` is empty).

    Raises:
        KeyError: If a case lacks a required key, or the judge result lacks
            "judgment", one of the score names, or "verdict".
    """
    case_fields = ("id", "company", "quarter")
    score_fields = (
        "overall",
        "relevance",
        "factual_correctness",
        "groundedness",
        "completeness",
        "clarity",
    )

    records: List[Dict[str, Any]] = []
    for spec in cases:
        outcome = judge_answer_for_question(
            question=spec["question"],
            company=spec["company"],
            filing_type=spec["filing_type"],
            quarter=spec["quarter"],
            reference_answer=spec.get("reference_answer"),
            key_facts=spec.get("key_facts"),
        )
        judgment = outcome["judgment"]
        score_map = judgment["scores"]

        # Insertion order fixes the column order of the resulting frame.
        record: Dict[str, Any] = {field: spec[field] for field in case_fields}
        for metric in score_fields:
            record[metric] = score_map[metric]
        record["verdict"] = judgment["verdict"]
        records.append(record)

    return pd.DataFrame(records)

In [13]:
# Run the judge over every case in EVAL_CASES and keep the resulting score table.
# NOTE(review): the name `eval` shadows Python's built-in eval(); consider renaming
# it (together with the later cells that read it), e.g. to `eval_results`.
eval = run_eval(EVAL_CASES)

In [14]:
# Display the per-case score table returned by run_eval.
eval

Unnamed: 0,id,company,quarter,overall,relevance,factual_correctness,groundedness,completeness,clarity,verdict
0,aapl_q1_tone_guidance,AAPL,2025Q4,0.85,1.0,1.0,1.0,1.0,0.7,"The model answer is highly relevant, directly ..."


In [15]:
# Show the untruncated verdict text of the row labelled 0 (the table view shortens it).
eval["verdict"][0]

"The model answer is highly relevant, directly addressing Apple's tone and guidance for the upcoming quarter. It provides a balanced view, detailing both optimistic projections and cautious notes with specific examples. However, it significantly exceeds the requested 3-4 sentence summary length, providing 7 sentences, which impacts its conciseness and thus its clarity score."