In [34]:
import os
import json
import random
from pathlib import Path
import pandas as pd
import google.generativeai as genai
import pdfplumber

# 🔑 Set up Gemini
GEMINI_KEY = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=GEMINI_KEY)
model = genai.GenerativeModel("gemini-1.5-pro")

In [35]:
# 📁 Paths
pdf_path = Path("../data/example_1.pdf")
topics_path = Path("../data/topics.json")

# 📄 Load topics
with open(topics_path, "r", encoding="utf-8") as f:
    topics = json.load(f)

# 📑 Load PDF pages (with layout for better line fidelity)
with pdfplumber.open(pdf_path) as pdf:
    pages = [page.extract_text(layout=True) for page in pdf.pages]


In [36]:
# 🎯 Sample 10 topics
sampled = random.sample(topics, 10)

# 📜 Gemini prompt template
VALIDATION_PROMPT = """
You are a legal analyst verifying whether the following topic begins at the given page and line in a deposition transcript.

Return only one of the following:
✅ MATCH
❌ NO MATCH

Topic to verify: {topic_name}
Claimed page number: {page_num}
Claimed line number: {line_num}

Transcript (Page {page_num}):
{page_text}
"""

In [37]:
# ✅ Validate topics
results = []
for topic in sampled:
    topic_name = topic.get("topic")
    page_num = topic.get("page_start")
    line_num = topic.get("line_start")

    # Skip incomplete entries
    if not topic_name or not page_num or not line_num:
        continue

    try:
        page_text = pages[page_num - 1]  # pages are 0-indexed
    except IndexError:
        continue

    prompt = VALIDATION_PROMPT.format(
        topic_name=topic_name,
        page_num=page_num,
        line_num=line_num,
        page_text=page_text
    )

    try:
        response = model.generate_content(prompt)
        result = response.text.strip().split("\n")[0]  # first line only
    except Exception as e:
        result = f"❌ Gemini Error: {str(e)}"

    results.append({
        "Topic": topic_name,
        "Page": page_num,
        "Line": line_num,
        "Gemini Review": result
    })

In [39]:
# 📊 Save and show results
df_results = pd.DataFrame(results)
os.makedirs("data", exist_ok=True)
df_results.to_csv("data/validation_results.csv", index=False)

# 🎯 Calculate accuracy
match_count = df_results["Gemini Review"].str.strip().eq("✅ MATCH").sum()
total = len(df_results)
accuracy = round((match_count / total) * 100, 2) if total else 0.0

print(f"\n✅ {match_count} / {total} topics matched")
print(f"🎯 Validation Accuracy: {accuracy}%")

df_results


✅ 0 / 10 topics matched
🎯 Validation Accuracy: 0.0%


Unnamed: 0,Topic,Page,Line,Gemini Review
0,Article Date,150,5,❌ Gemini Error: 429 You exceeded your current ...
1,Difficulty Securing Credit and Loan Renewals,29,20,❌ Gemini Error: 429 You exceeded your current ...
2,Bipartisan Farm Bill,7,57,❌ Gemini Error: 429 You exceeded your current ...
3,Adverse Effect Wage Rate (AEWR) for H-2A Program,126,67,❌ Gemini Error: 429 You exceeded your current ...
4,Rising Input Costs,28,19,❌ Gemini Error: 429 You exceeded your current ...
5,Commodity Prices and Export Markets,119,62,❌ Gemini Error: 429 You exceeded your current ...
6,Australian Trade Deals and Nut Exports,126,33,❌ Gemini Error: 429 You exceeded your current ...
7,Use of Data in Policy Decisions,91,83,❌ Gemini Error: 429 You exceeded your current ...
8,State of the Beef Cattle Industry,18,47,❌ Gemini Error: 429 You exceeded your current ...
9,Decline in Net Cash Farm Income for Crop Farmers,11,50,❌ Gemini Error: 429 You exceeded your current ...
