Check the CFB dataset for validity by randomly sampling a subset of games and verifying each one with the Gemini LLM

In [52]:
import csv
import os
import random
import time
from datetime import datetime

import pandas as pd
from google import genai
from google.genai import types

# Gemini model used for the validity checks.
MODEL_NAME = "gemini-2.5-flash"
# One game is sampled for every SAMPLE_RATE rows of the dataset.
SAMPLE_RATE = 1000

# Input box scores, and the timestamped CSV that receives this run's results.
CFB_DATA_PATH = "../../college_football_data/cfb_box-scores_2002-2024_20251006_193644.csv"
_run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
VALIDATION_RESULTS_CSV = (
    "../../intermediate_files/validation_results/"
    f"cfb_validation_results_{_run_stamp}.csv"
)


Configure LLM Calls

In [None]:
# Read the Gemini API key from the environment so the secret is never stored in
# the notebook; falls back to "" (the previous placeholder) when it is unset.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
# MODEL_NAME and SAMPLE_RATE are already defined in the first configuration
# cell with these same values, so they are not repeated here.
DELAY = 5  # seconds; intended pause between consecutive LLM requests

In [54]:
# Instruction text sent ahead of each sampled game's data. It is a plain string
# (not an f-string) because it contains no placeholders to interpolate.
prompt = """Determine if this is a valid college football game that occurred.
        Return your answer as one of:
        - "VALID" - This game data (date, teams, score) is valid
        - "INVALID" - This data does not look like a valid college football game
        - "UNCERTAIN" - The data is inconclusive or has some irregularities but could be valid

        Provide a brief explanation (1-2 sentences) for your decision, noting if you used search to verify information.

        Format your response as:
        DECISION: [VALID/INVALID/UNCERTAIN]
        EXPLANATION: [Your brief explanation]
        """

In [55]:
# Load every box score, then pick the row positions that will be spot-checked.
cfb_df = pd.read_csv(CFB_DATA_PATH)
total_games = cfb_df.shape[0]

# One sampled row per SAMPLE_RATE games (integer division), drawn without replacement.
n_samples = total_games // SAMPLE_RATE
sample_indices = random.sample(range(total_games), k=n_samples)
print(f"sampling at {sample_indices}")

sampling at [435, 4681, 11979, 6874, 10185, 10772, 11393, 1822, 3150, 3966, 9701, 9751, 17496, 559, 16107, 7283, 3264, 10900]


Iterate over the sampled rows and send each game to the LLM for verification

In [56]:
# Labels are checked in this order because "VALID" is a substring of "INVALID";
# testing "VALID" first would record every INVALID verdict as VALID.
_DECISION_LABELS = ("INVALID", "UNCERTAIN", "VALID")


def _parse_validation_response(api_result):
    """Pull the decision label and explanation out of the model's reply.

    Args:
        api_result: Response text, expected to contain a ``DECISION:`` line
            and an ``EXPLANATION:`` line.

    Returns:
        A ``(decision, explanation)`` tuple. ``decision`` is "VALID",
        "INVALID" or "UNCERTAIN"; it stays "UNCERTAIN", and ``explanation``
        stays "Could not parse response", when the matching line is missing.
    """
    decision = "UNCERTAIN"
    explanation = "Could not parse response"

    for line in api_result.split('\n'):
        line = line.strip()
        upper_line = line.upper()
        if upper_line.startswith('DECISION:'):
            decision_part = upper_line.split(':', 1)[1].strip()
            for label in _DECISION_LABELS:
                if label in decision_part:
                    decision = label
                    break
        elif upper_line.startswith('EXPLANATION:'):
            explanation = line.split(':', 1)[1].strip()

    # Replaces the literal two-character sequence backslash-n; real newlines
    # cannot occur here because the text was split on them above.
    return decision, explanation.replace(r'\n', ' ')


# The client, search tool and request config do not depend on the row, so they
# are built once. A misconfigured client now fails here, before any requests
# are made, instead of producing an API_ERROR record for every sampled row.
client = genai.Client(api_key=API_KEY)
grounding_tool = types.Tool(google_search=types.GoogleSearch())
config = types.GenerateContentConfig(tools=[grounding_tool])

results = []
for idx in sample_indices:
    game_row = cfb_df.iloc[idx]

    # Reset per-row state so a failed call never reuses the previous row's
    # values (or hits an unbound name on the first iteration).
    call_failed = False
    used_search = False

    try:
        prompt_and_data = prompt + "\n Data to validate: " + str(game_row)
        resp = client.models.generate_content(
            model=MODEL_NAME,
            contents=prompt_and_data,
            config=config,
        )

        if resp.text is None:
            raise ValueError("response contained no text")
        api_result = resp.text.strip()

        # Any grounding metadata on the first candidate is treated as
        # evidence that the search tool was used.
        if resp.candidates:
            used_search = bool(resp.candidates[0].grounding_metadata)

    except Exception as e:
        # Best effort: record the failure for this row and keep going.
        print("error+ " + str(e))
        api_result = "API_ERROR"
        used_search = False
        call_failed = True

    if call_failed:
        decision = "API_ERROR"
        explanation = "API call failed"
    else:
        decision, explanation = _parse_validation_response(api_result)

    result_record = {
        'date': game_row['date'],
        'away': game_row['away'],
        'home': game_row['home'],
        'validation_result': decision,
        'explanation': explanation,
        'raw_response': api_result,
        'processed_date': datetime.now().isoformat(),
        'used_search': used_search
    }
    results.append(result_record)

    print(result_record['date'])
    print(f"{result_record['away']} @ {result_record['home']}")
    print(result_record['validation_result'])
    print()

    # Pause between requests using the configured delay.
    time.sleep(DELAY)


2002-10-19
Troy @ Marshall
VALID

2008-09-06
Texas State @ SMU
VALID

2016-11-25
Cincinnati @ Tulsa
VALID

2010-11-26
UCLA @ Arizona State
VALID

2014-11-22
Rutgers @ Michigan State
VALID

2015-10-17
Idaho @ Troy
VALID

2016-09-17
UNLV @ Central Michigan
VALID

2004-10-02
Penn State @ Minnesota
VALID

2006-09-16
Furman @ North Carolina
VALID

2007-09-22
Ball State @ Nebraska
VALID

2014-09-20
Hawaii @ Colorado
VALID

2014-09-27
Iowa @ Purdue
VALID

2023-10-07
Colorado @ Arizona State
VALID

2002-11-09
Notre Dame @ Navy
VALID

2021-11-27
Hawaii @ Wyoming
VALID

2011-10-01
Ball State @ Oklahoma
VALID

2006-09-30
Northern Iowa @ Iowa State
VALID

2015-10-31
Michigan @ Minnesota
VALID



Save results

In [57]:
# Write every validation record to this run's timestamped CSV file.
fieldnames = ['date', 'away', 'home', 'validation_result', 'explanation', 'raw_response', 'processed_date', 'used_search']

# Create the output directory if needed so the results gathered by the slow
# API loop are not lost to a missing folder.
os.makedirs(os.path.dirname(VALIDATION_RESULTS_CSV), exist_ok=True)

# The context manager closes the file even if a row fails to serialize.
with open(VALIDATION_RESULTS_CSV, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results)