In [None]:
import pandas as pd
import json 

In [None]:
# Load the games table (path is relative to the notebook's working directory).
data = pd.read_csv("../data/nfl_mahomes_era_games.csv")

# Read the team-name lookup inside a context manager so the file handle is
# closed as soon as parsing finishes instead of lingering until garbage collection.
with open("../data/names.json", "r") as names_file:
    team_names = json.load(names_file)

In [None]:
# Preview the first rows of the loaded frame before its team columns are remapped.
data.head()

In [None]:
# Translate the values in both team columns through the team_names lookup.
# Plain indexing is used on purpose: a value missing from team_names raises
# KeyError rather than silently becoming NaN.
for side in ('home_team', 'away_team'):
    data[side] = data[side].map(team_names.__getitem__)

In [None]:
# Distinct team names in first-seen order. If names.json maps more than one key
# to the same name, the raw values() list would repeat that name, and the
# pairing step below would then produce self-pairs and both orientations of the
# same matchup. dict.fromkeys drops repeats while keeping order; it is a no-op
# when every name is already unique.
teams = list(dict.fromkeys(team_names.values()))

In [None]:
import itertools

# Every unordered two-team matchup, in the order itertools.combinations yields them.
pairs = [matchup for matchup in itertools.combinations(teams, 2)]

In [None]:
# Head-to-head summary for every pairing: number of games, a per-game outcome
# list of (season, description), and win/draw tallies.
teamwise_data = {}
for teamA, teamB in pairs:
    # Games between the two teams, whichever of them was at home.
    a_at_home = (data['home_team'] == teamA) & (data['away_team'] == teamB)
    b_at_home = (data['home_team'] == teamB) & (data['away_team'] == teamA)
    matches = data[a_at_home | b_at_home]

    counts = {teamA: 0, teamB: 0, "drawn": 0}
    outcomes = []
    for game in matches[['season', 'home_team', 'away_team', 'game_outcome']].itertuples():
        result = game.game_outcome
        # 1.0 credits the home side and 0.0 the away side; any other value
        # (including NaN) falls through to the "drawn" bucket.
        if result == 1.0 or result == 0.0:
            winner = game.home_team if result == 1.0 else game.away_team
            counts[winner] += 1
            outcomes.append((game.season, f"{winner} won"))
        else:
            counts["drawn"] += 1
            outcomes.append((game.season, "Draw"))

    teamwise_data[(teamA, teamB)] = {
        "matches_played": len(matches),
        "outcome": outcomes,
        "counts": counts,
    }

In [None]:
# Rank the pairings by games played, most-played first.
# This cell rebinds teamwise_data from a dict to a list of (pair, stats) tuples,
# so a second run used to fail with AttributeError on list.items(). dict(...)
# accepts both shapes, which makes the cell safe to re-run; the sort is stable,
# so re-sorting an already sorted list leaves the order unchanged.
teamwise_data = sorted(
    dict(teamwise_data).items(),
    key=lambda entry: entry[1]['matches_played'],
    reverse=True,
)

In [None]:
# Show the first entry, i.e. the pairing with the most games after the descending sort.
teamwise_data[0]

In [None]:
import os 
from dotenv import load_dotenv
from google import genai
from google.genai import types 
from tqdm import tqdm 


In [None]:
# System instruction that prompt() passes to the model: it asks for the probability
# that TeamA beats TeamB and requires the reply to be a bare JSON object.
system_inst = '''You are a sports analyst. Your task is to estimate the probability (between 0 and 1) that TeamA will win against TeamB based on their previous match outcomes.
    
Output only a valid JSON object in the following format:
    { "probability": <float between 0 and 1, rounded to at most 3 decimal places> }
        
Do not include any explanation, reasoning, or additional text.
'''
def prompt(formatted_preference_data):
    """Send one user prompt to Gemini and return the decoded JSON reply.

    Args:
        formatted_preference_data: Text for the user turn (for example the
            string built by create_usr_prompt).

    Returns:
        The value produced by json.loads on the model's reply. Given
        system_inst this is expected to be a dict such as
        {"probability": 0.5}; callers receive the parsed object, not a string.

    Raises:
        ValueError: If GOOGLE_API_KEY is not set, or if the reply is not
            valid JSON (an empty reply included).
    """
    load_dotenv()
    google_api_key = os.getenv("GOOGLE_API_KEY")
    if google_api_key is None:
        raise ValueError("GOOGLE_API_KEY not found in .env file")

    client = genai.Client(api_key=google_api_key)
    model = "gemini-2.0-flash"

    contents = [
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=formatted_preference_data)],
        )
    ]

    config = types.GenerateContentConfig(
        # temperature 0 keeps repeated calls for the same matchup as stable as possible
        temperature=0.0,
        response_mime_type="application/json",
        system_instruction=[types.Part.from_text(text=system_inst)],
    )

    pieces = []
    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=config,
    ):
        # A streamed chunk's .text can be None when it carries no text part;
        # concatenating it directly would raise TypeError, so skip such chunks.
        if chunk.text:
            pieces.append(chunk.text)
    response_text = "".join(pieces)

    try:
        return json.loads(response_text)
    except json.JSONDecodeError as e:
        # Chain the original error so the offending position stays in the traceback.
        raise ValueError(f"Failed to parse JSON response from model: {e}") from e

In [None]:
def create_usr_prompt(data, reverse=False):
    """Build the user-turn text describing one pairing and its match history.

    Args:
        data: A two-item sequence: data[0] is the (teamA, teamB) pair and
            data[1] is a mapping whose 'outcome' entry is an iterable of
            (year, outcome) tuples.
        reverse: When truthy, the two teams swap the TeamA/TeamB labels; the
            history lines are unchanged.

    Returns:
        A newline-terminated string with the two team labels, a
        "Match History:" header, and one tab-indented bullet per game.
    """
    teamA, teamB = data[0]
    history = data[1]['outcome']

    first, second = (teamB, teamA) if reverse else (teamA, teamB)
    header = f"TeamA: {first}\nTeamB: {second}\nMatch History:\n"
    bullets = "".join(f"\t- {year}: {outcome}.\n" for year, outcome in history)
    return header + bullets

In [None]:
# import time 
# from tqdm import tqdm
# results = []
# for td in tqdm(teamwise_data):
#     teamA, teamB = td[0]
#     if td[1]['matches_played'] >= 5:
#         probA = prompt(create_usr_prompt(td))
#         probB = prompt(create_usr_prompt(td, reverse=True))
#         time.sleep(30)
        
#         estimate_probA = td[1]['counts'][teamA] / td[1]['matches_played']
#         estimate_probB = td[1]['counts'][teamB] / td[1]['matches_played']
        
#         results.append((teamA, teamB, probA, probB, estimate_probA, estimate_probB))
        

In [None]:
import time
import csv
from pathlib import Path
from tqdm import tqdm

results_file = "results.csv"
results_header = ["TeamA", "TeamB", "LLM_ProbA", "LLM_ProbB", "Est_ProbA", "Est_ProbB"]
MIN_MATCHES = 5          # pairings with fewer games are skipped
SECONDS_BETWEEN_PAIRS = 30  # pause after each pairing's two model calls

# The file is opened in append mode, so rows from earlier runs are kept. Only
# write the header when the file is new or empty; otherwise every re-run would
# insert another header row in the middle of the data.
results_path = Path(results_file)
write_header = not results_path.exists() or results_path.stat().st_size == 0

with open(results_file, mode='a', newline='') as f:
    writer = csv.writer(f)
    if write_header:
        writer.writerow(results_header)

    for td in tqdm(teamwise_data):
        teamA, teamB = td[0]
        if td[1]['matches_played'] < MIN_MATCHES:
            continue
        try:
            # prompt() already returns the decoded JSON object. Passing that
            # dict to json.loads again raised TypeError for every pairing, which
            # the handler below swallowed, so no row was ever written.
            probA = prompt(create_usr_prompt(td))['probability']
            probB = prompt(create_usr_prompt(td, reverse=True))['probability']

            # Empirical win rates over all games played (draws count in the denominator).
            estimate_probA = td[1]['counts'][teamA] / td[1]['matches_played']
            estimate_probB = td[1]['counts'][teamB] / td[1]['matches_played']

            # Persist the row before pausing so an interrupt during the wait loses nothing.
            writer.writerow([teamA, teamB, probA, probB, estimate_probA, estimate_probB])
            f.flush()
            time.sleep(SECONDS_BETWEEN_PAIRS)

        except Exception as e:
            # Best-effort run: report the failing pairing and move on to the next one.
            print(f"Error for {teamA} vs {teamB}: {e}")