Check the CFB (college football) dataset for validity by randomly sampling a subset of games and verifying each one with the Gemini LLM.

In [None]:
import csv
import os
import random
import sys
import time
from datetime import datetime

import pandas as pd

sys.path.append('..')
from utils.llm_utils import call_gemini_api

# Input: college football box-score CSV (path is relative to this notebook).
CFB_DATA_PATH = "../../college_football_data/cfb_box-scores_2002-2024_20251006_193644.csv"

# Output: one results file per run; the name carries the time this cell was executed.
_run_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
VALIDATION_RESULTS_CSV = f"../../intermediate_files/validation_results/cfb_validation_results_{_run_timestamp}.csv"


Configure sampling and LLM calls

In [3]:
# Sample one game out of every SAMPLE_RATE rows (sample size = total rows // SAMPLE_RATE).
SAMPLE_RATE = 1_000
# Pause between consecutive LLM calls, in seconds.
DELAY = 5

In [4]:
# Instruction text sent to the LLM ahead of each game's data. It is a plain
# string (no interpolation); the DECISION/EXPLANATION layout it requests is
# what the response parser in the validation loop looks for.
prompt = """Determine if this is a valid college football game that occurred.
        Return your answer as one of:
        - "VALID" - This game data (date, teams, score) is valid
        - "INVALID" - This data does not look like a valid college football game
        - "UNCERTAIN" - The data is inconclusive or has some irregularities but could be valid

        Provide a brief explanation (1-2 sentences) for your decision, noting if you used search to verify information.

        Format your response as:
        DECISION: [VALID/INVALID/UNCERTAIN]
        EXPLANATION: [Your brief explanation]
        """

In [5]:
# Load the full box-score table, then draw distinct row positions to spot-check.
cfb_df = pd.read_csv(CFB_DATA_PATH)
total_games = len(cfb_df)

# One sampled game per SAMPLE_RATE rows (integer division, so small tables may yield zero).
sample_size = total_games // SAMPLE_RATE
sample_indices = random.sample(range(total_games), sample_size)
print(f"sampling at {sample_indices}")

sampling at [14767, 3652, 6487, 12948, 5977, 14756, 18603, 18456, 10793, 10865, 4531, 6115, 18648, 9692, 12364, 17642, 11008, 9215]


Iterate over the sampled rows and send each game to the LLM for verification

In [6]:
def parse_validation_response(api_result):
    """Extract the decision and explanation from the LLM's raw text reply.

    The reply is scanned line by line for a ``DECISION:`` line and an
    ``EXPLANATION:`` line (prefix match is case-insensitive).

    Args:
        api_result: Raw response text returned by the LLM call.

    Returns:
        A ``(decision, explanation)`` tuple. ``decision`` is one of
        ``'VALID'``, ``'INVALID'`` or ``'UNCERTAIN'``; when no recognizable
        decision is found it stays ``'UNCERTAIN'``. ``explanation`` defaults
        to ``'Could not parse response'`` when no explanation line is found.
    """
    decision = "UNCERTAIN"
    explanation = "Could not parse response"

    for line in api_result.split('\n'):
        line = line.strip()
        upper_line = line.upper()
        if upper_line.startswith('DECISION:'):
            decision_part = upper_line.split(':', 1)[1].strip()
            # 'INVALID' contains the substring 'VALID', so it must be tested
            # first; otherwise every INVALID verdict is recorded as VALID.
            for label in ('INVALID', 'UNCERTAIN', 'VALID'):
                if label in decision_part:
                    decision = label
                    break
        elif upper_line.startswith('EXPLANATION:'):
            explanation = line.split(':', 1)[1].strip()

    return decision, explanation


# Validate each sampled game with the LLM and collect one record per game.
results = []
for idx in sample_indices:
    game_row = cfb_df.iloc[idx]

    # Send the instructions followed by the string rendering of the sampled row.
    prompt_and_data = prompt + "\n Data to validate: " + str(game_row)
    api_result, used_search = call_gemini_api(prompt_and_data)

    # The shared helper signals failure with the sentinel string "API_ERROR".
    if api_result == "API_ERROR":
        decision = "API_ERROR"
        explanation = "API call failed"
    else:
        decision, explanation = parse_validation_response(api_result)

    # Strip literal backslash-n sequences (raw string: not real newlines).
    explanation = explanation.replace(r'\n', ' ')
    result_record = {
        'date': game_row['date'],
        'away': game_row['away'],
        'home': game_row['home'],
        'validation_result': decision,
        'explanation': explanation,
        'raw_response': api_result,
        'processed_date': datetime.now().isoformat(),
        'used_search': used_search
    }
    # Store the record before pausing so an interrupt during the delay cannot lose it.
    results.append(result_record)

    print(result_record['date'])
    print(f"{result_record['away']} @ {result_record['home']}")
    print(result_record['validation_result'])
    print()

    # Throttle requests using the configured delay rather than a hard-coded value.
    time.sleep(DELAY)


2020-09-19
UCF @ Georgia Tech
VALID

2006-11-18
Virginia Tech @ Wake Forest
VALID

2010-10-09
Western Michigan @ Ball State
VALID

2017-12-30
Washington @ Penn State
VALID

2009-11-14
UTEP @ SMU
VALID

2020-09-12
UTEP @ Texas
VALID

2024-11-02
Wyoming @ New Mexico
VALID

2024-10-17
Georgia State @ Marshall
VALID

2015-10-17
UCF @ Temple
VALID

2015-10-30
Louisiana Tech @ Rice
VALID

2007-12-29
UCF @ Mississippi State
VALID

2009-12-05
Houston @ East Carolina
VALID

2024-11-09
South Carolina @ Vanderbilt
VALID

2014-09-20
Troy @ Georgia
VALID

2017-09-23
Washington @ Colorado
VALID

2023-10-28
Mississippi State @ Auburn
VALID

2015-11-14
Michigan @ Indiana
VALID

2013-11-09
Auburn @ Tennessee
VALID



Save results

In [7]:
# Persist the validation records to the timestamped results CSV.
fieldnames = ['date', 'away', 'home', 'validation_result', 'explanation', 'raw_response', 'processed_date', 'used_search']

# Create the output directory if needed so a missing folder cannot discard
# results that were expensive to collect.
os.makedirs(os.path.dirname(VALIDATION_RESULTS_CSV), exist_ok=True)

# The context manager closes the file even if a row fails to serialize.
with open(VALIDATION_RESULTS_CSV, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results)