In [3]:
import json
import random
from pathlib import Path

# Paths
topics_path = Path("../data/topics.json")
pdf_path = Path("../data/example_1.pdf")

# Load topics
with open(topics_path, "r", encoding="utf-8") as f:
    topics = json.load(f)

# Sample 10 topics
sampled_topics = random.sample(topics, 10)
sampled_topics[:2]  # Preview 2 rows


[{'topic': 'State of the Dairy Industry', 'page_start': 18, 'line_start': 1},
 {'topic': 'Impact of Specialty Crops on National Security',
  'page_start': 31,
  'line_start': 47}]

In [4]:
import pdfplumber

def get_line_window_from_pdf(pdf_path, page_num, line_num, window=5):
    with pdfplumber.open(pdf_path) as pdf:
        if page_num - 1 >= len(pdf.pages):
            return f"⚠️ Page {page_num} not found in PDF."
        page = pdf.pages[page_num - 1]
        text = page.extract_text()
        if not text:
            return f"⚠️ No text found on page {page_num}."
        lines = text.splitlines()
        start = max(0, line_num - window - 1)
        end = min(len(lines), line_num + window)
        excerpt = "\n".join(lines[start:end])
        return excerpt


In [5]:
validation_data = []

for entry in sampled_topics:
    topic = entry["topic"]
    page = entry["page_start"]
    line = entry["line_start"]
    excerpt = get_line_window_from_pdf(pdf_path, page, line)
    
    validation_data.append({
        "topic": topic,
        "page": page,
        "line": line,
        "excerpt": excerpt
    })

import pandas as pd
pd.DataFrame(validation_data)


Unnamed: 0,topic,page,line,excerpt
0,State of the Dairy Industry,18,1,14\nFMD—providing increased opportunities for ...
1,Impact of Specialty Crops on National Security,31,47,functional safety net for the very first time....
2,Competitiveness and Sustainability Principle,33,41,ance that would provide the majority of our gr...
3,Competition from Imports,30,34,The economic and disaster assistance from the ...
4,Agriculture Risk Coverage (ARC) and Price Loss...,17,55,
5,Trump Administration's COVID-19 Assistance Ini...,32,5,28\nIntroduction\nAlthough not directly under ...
6,Crop Protection Tools Access,30,32,have worked for. This includes the ability to ...
7,Diversification of Farm Income,29,24,"volatile markets. Our food system, rural commu..."
8,Crop Insurance Coverage vs. Input Costs,17,44,"13United States Trade Representative, ‘‘United..."
9,SCRI Funding Recommendations,46,13,ing or Farm Labor Housing projects on behalf o...


In [7]:
import google.generativeai as genai
import os
# Set up Gemini (make sure your API key is in environment)
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
model = genai.GenerativeModel("gemini-1.5-flash")


In [8]:
def validate_topic_with_gemini(topic, excerpt):
    prompt = f"""
You are an AI legal reviewer.

Here is a snippet from a deposition transcript. Based on this, decide whether the topic "{topic}" is clearly introduced or discussed in the text.

Text:
\"\"\"
{excerpt}
\"\"\"

Instructions:
- Think step by step.
- Justify your answer in plain language.
- Then return one of the following verdicts at the end:
    ✅ YES - Topic is valid
    ❌ NO - Topic is not mentioned clearly

Respond in this format:
Reasoning: ...
Verdict: ...
"""
    try:
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        return f"❌ Error: {e}"


In [10]:
results = []
for item in validation_data:
    result = validate_topic_with_gemini(item["topic"], item["excerpt"])
    results.append({
        "Topic": item["topic"],
        "Page": item["page"],
        "Line": item["line"],
        "Excerpt": item["excerpt"],
        "Gemini Review": result
    })

df_results = pd.DataFrame(results)
df_results.to_csv("../data/validation_results.csv", index=False)
df_results


Unnamed: 0,Topic,Page,Line,Excerpt,Gemini Review
0,State of the Dairy Industry,18,1,14\nFMD—providing increased opportunities for ...,Reasoning: The provided text focuses on agricu...
1,Impact of Specialty Crops on National Security,31,47,functional safety net for the very first time....,Reasoning: The provided text is an introductio...
2,Competitiveness and Sustainability Principle,33,41,ance that would provide the majority of our gr...,Reasoning: The provided text focuses on propos...
3,Competition from Imports,30,34,The economic and disaster assistance from the ...,Reasoning: The provided text focuses on the ne...
4,Agriculture Risk Coverage (ARC) and Price Loss...,17,55,,Reasoning: The provided text is empty. Theref...
5,Trump Administration's COVID-19 Assistance Ini...,32,5,28\nIntroduction\nAlthough not directly under ...,Reasoning: The provided text excerpt is an int...
6,Crop Protection Tools Access,30,32,have worked for. This includes the ability to ...,Reasoning: The provided deposition transcript ...
7,Diversification of Farm Income,29,24,"volatile markets. Our food system, rural commu...",Reasoning: The provided text is an introductio...
8,Crop Insurance Coverage vs. Input Costs,17,44,"13United States Trade Representative, ‘‘United...",Reasoning: The provided text is a list of URLs...
9,SCRI Funding Recommendations,46,13,ing or Farm Labor Housing projects on behalf o...,Reasoning: The provided text discusses funding...


In [11]:
# Count accurate entries based on Gemini verdict
valid_count = sum("✅" in r["Gemini Review"] for r in results)
total = len(results)
accuracy = (valid_count / total) * 100

print(f"✅ {valid_count} / {total} topics matched")
print(f"🎯 Validation Accuracy: {accuracy:.2f}%")


✅ 0 / 10 topics matched
🎯 Validation Accuracy: 0.00%
