In [39]:
import statsapi
# from google.cloud import bigquery
from datetime import date, timedelta, datetime
import requests
import time
import sys
import os
import csv
import glob
from pprint import pprint
import sqlite3
import hashlib
import json

In [2]:
# File paths
PLAYER_STATS_FOLDER = 'stats/player/'
TEAM_STATS_FOLDER = 'stats/team/'

In [3]:


#Get today/yesterdays date 
today = date.today()
yesterday = today - timedelta(days = 1)
sunday = today - timedelta(days=today.weekday())
currentSeason = today.year
start_date = yesterday
end_date = yesterday

In [4]:
def player_stat_data(player_id, group=None, season=None, stats='season', gamePk=None):
    """
    Fetch player stats from the MLB Stats API.

    Args:
        player_id (int): The player ID.
        group (str): Stat group, e.g., 'hitting', 'pitching', or 'fielding'.
        season (int): The season year (required for 'season' stats).
        stats (str): Stat type. Either 'season' or 'gameLog'.
        gamePk (int): Optional. If provided with 'gameLog', filters to specific game splits.

    Returns:
        dict: A single stat split (for game or season), or None if not found.
    """
    url = f"https://statsapi.mlb.com/api/v1/people/{player_id}/stats"
    
    params = {
        "stats": stats
    }
    
    if season:
        params["season"] = season
    if group:
        params["group"] = group
    if stats=="gameLog" and gamePk:
        params["gamePk"] = gamePk
        
    response = requests.get(url, params=params)
    response.raise_for_status()
    data = response.json()
    
    if not data["stats"] or not data["stats"][0]["splits"]:
        return None

    splits = data["stats"][0]["splits"]

    if stats == "gameLog" and gamePk is not None:
        for split in splits:
            if split["game"]["gamePk"] == gamePk:
                return split
        return None  # GamePk not found

    # For 'season' or all game logs
    return splits[0] if splits else None


In [5]:
#Normalize name for pulling special fields from boxscore
def normalize_name(name_raw):
    if ',' in name_raw:
        last, first = [part.strip() for part in name_raw.split(',', 1)]
        return f"{first} {last}".strip()
    return name_raw

In [6]:
# Get schedule for start/end date (regular season)

games = []

schedule = statsapi.schedule(
    str(start_date),
    str(end_date),
    sportId=1
)
games = []
weather = []
print(len(schedule))

#Pull available game data from schedule
for game in schedule:
    gamePk = game['game_id']
    
    
    #Get weather
    boxscore = statsapi.get("game_boxscore", {'gamePk': gamePk})
    for item in boxscore['info']:
        if item['label'] == "Weather":
            weather = item['value']
        elif item['label'] == "Att":
            attendance = item['value']
        
    games.append({
        "gamePk": game['game_id'],
        "gametime": game['game_datetime'],
        "season": int(game['game_date'][:4]),
        "awayId": game['away_id'],
        "awayName": game['away_name'],
        "homeId": game['home_id'],
        "homeName": game['home_name'],
        "weather": weather,
        "attendance": attendance or None,
        "venue": game['venue_name']
    })   
    

15


In [7]:
teams = statsapi.get("teams", params = {"sportId":1})
raw_teams = teams['teams']
teams = []
for t in raw_teams:
    teams.append({
        'teamId': t['id'],
        'name': t['clubName'],
        'league': t['league']['name'],
        'venue': t['venue']['name'],
        'division': t['division']['name']
    })        
    

In [8]:
#Create hash PK for GAME_STATS table
def hash_game_stat(entity, stat, gamePk=None):
    key = f"{entity}:{stat}:{gamePk}" if gamePk is not None else f"{entity}:{stat}"
    return hashlib.sha256(key.encode('utf-8')).hexdigest()

game_stats = []

seen_players = set()
#Get team and player stats for games in games through boxscore endpoint
for g in games:
    gamePk = g['gamePk']
    gameDate = g['gametime'][:10]
    boxscore = statsapi.get("game_boxscore", {'gamePk': gamePk})
    for side in ["home", "away"]:
        team_stats = boxscore["teams"][side]["teamStats"]
        team_id = boxscore["teams"][side]['team']['id']
        # sys.exit()
        #Add team stats for games in schedule from boxscore
        for group in ['batting','pitching','fielding']:
            group_stats = team_stats.get(group, {})
            for stat, val in group_stats.items():
                game_stats.append({
                    "statsKey": hash_game_stat(team_id,stat,gamePk), 
                    "entity": team_id,
                    "scope": 'team',
                    "group": group,
                    "stat": stat, 
                    "val": val, 
                    "gamePk": gamePk,
                    "positionsPlayed": None
                })
            #print(f"Loaded {side} team data from game: {boxscore['teams']['away']['team']['abbreviation']} vs. {boxscore['teams']['home']['team']['abbreviation']}")
            
            #Player stats from boxscore 
            
            for player_key, player_data in boxscore["teams"][side]["players"].items():
                player_id = player_data["person"]["id"]  # ✅ This is the integer ID
                player_name = player_data["person"]["fullName"] #Thanks chat 
                position = player_data["position"]["abbreviation"]
                
                for group in ["batting", "fielding", "pitching"]:
                    group_stats = player_data["stats"].get(group, {})
                    if len(group_stats) == 0: #If player did not hit or pitch or field, no stats
                        continue
                    for stat, val in group_stats.items():
                        game_stats.append({
                            "statsKey": hash_game_stat(player_id,stat,gamePk),
                            "entity":player_id,
                            "scope": 'player',
                            "group":group,
                            "stat":stat, 
                            "val":val, 
                            "gamePk":gamePk,
                            "positionsPlayed": position
                        })
                #print(f"Loaded player data for {player_name} {player_id} in game: {boxscore['teams']['away']['team']['abbreviation']} vs. {boxscore['teams']['home']['team']['abbreviation']}")
                seen_players.add(player_id)
len_players = len(seen_players)

In [9]:
#Create hash PK for season_stats table
def hash_szn_stat(entity, stat, group, season=None):
    key = f"{entity}:{stat}:{gamePk}:{group}" if gamePk is not None else f"{entity}:{stat}:{group}"
    
    return hashlib.sha256(key.encode('utf-8')).hexdigest()


In [10]:


#Get player season stats
season_stats = []

len_players = len(seen_players)
seen_players = set()
for stats in game_stats:
    if stats['scope'] != 'player' or stats['entity'] in seen_players:
        continue
    else:
        playerId = stats['entity']
    i_players = len(seen_players)

    
    person = statsapi.get("person", params={"personId": playerId})
    
    for group in ["hitting", "pitching", "fielding"]:
        data = player_stat_data(playerId, currentSeason)
        
        for stat, val in data['stat'].items():
            stat_key = hash_szn_stat(playerId, stat, group)
            season_stats.append({
                "statsKey": stat_key,
                "entity": playerId,
                "scope": "player",
                "group": group,
                "stat":stat, 
                "val":val,  
                "season":currentSeason,
                "updateDate":today,
            })
    
    seen_players.add(playerId)
    # print(f"Got {currentSeason} season stats for {playerId} ({i_players} out of {len_players})")
            

    
    

In [11]:

#Team season stats


def team_stat_data(team_id, group, season):
    url = f"https://statsapi.mlb.com/api/v1/teams/{team_id}/stats"
    params = {
        "season": season,
        "group": group,
        "stats": "season",
        "updateDate": today
    }
    r = requests.get(url, params=params)
    r.raise_for_status()
    return r.json()["stats"][0]["splits"][0]


hash_szn_stats = []
for team in teams:
    teamId = team['teamId']
    teamName = team['name']
    for group in ["hitting","pitching","fielding"]:
    
        team_season_stats = team_stat_data(teamId, group, currentSeason)
        if 'stat' in team_stat_data(teamId, group, currentSeason):
            for stat, val in team_season_stats['stat'].items():         
                stat_key = hash_szn_stat(teamId,stat,currentSeason)
                season_stats.append({
                    "statsKey": stat_key,
                    "entity": teamId,
                    "scope": "team",
                    "group": group,
                    "stat": stat,
                    "val": val,
                    "season":currentSeason,
                    "updateDate": today
                })
            print(f"Stored team {group} season stats for {teamName}")
                
            
            


Stored team hitting season stats for Athletics
Stored team pitching season stats for Athletics
Stored team fielding season stats for Athletics
Stored team hitting season stats for Pirates
Stored team pitching season stats for Pirates
Stored team fielding season stats for Pirates
Stored team hitting season stats for Padres
Stored team pitching season stats for Padres
Stored team fielding season stats for Padres
Stored team hitting season stats for Mariners
Stored team pitching season stats for Mariners
Stored team fielding season stats for Mariners
Stored team hitting season stats for Giants
Stored team pitching season stats for Giants
Stored team fielding season stats for Giants
Stored team hitting season stats for Cardinals
Stored team pitching season stats for Cardinals
Stored team fielding season stats for Cardinals
Stored team hitting season stats for Rays
Stored team pitching season stats for Rays
Stored team fielding season stats for Rays
Stored team hitting season stats for Rang

In [40]:
def is_valid_pitch(event):
    event_type = event.get("type")
    is_pitch_type = event_type == 'pitch'
    return is_pitch_type

def pbp_hash_key(gamePk, index, atBatIndex=None):
    key = f"{gamePk}:{index}:{atBatIndex}" if atBatIndex is not None else f"{gamePk}:{index}"
    return hashlib.sha256(key.encode('utf-8')).hexdigest()


at_bats = []
all_sequences = []
#Games (list of str)
for game in games[0]:
    pbp = statsapi.get("game_playByPlay", params={"gamePk": gamePk})
    all_plays = pbp['allPlays']
    game_sequence = []
    for atbat in all_plays:
        sequence = []
        index = atbat['atBatIndex']
        batter=atbat["matchup"]["batter"]["id"]
        pitcher= atbat["matchup"]["pitcher"]["id"]
        atbat_key = pbp_hash_key(gamePk, index)
        at_bats.append({
            "atbatKey": atbat_key,
            "gamePk": gamePk,
            "gameIndex": index,
            'batterId': batter,
            'pitcherId': pitcher,
            'splits': json.dumps(atbat['matchup']['splits']),
            'inning': atbat['about']['inning'],
            'result': atbat['result']['eventType'],
            "batterSide": atbat['matchup']['batSide']['code'],
            "pitcherHand": atbat['matchup']['pitchHand']['code']

            })
        # season_stats = player_stat_data(pitcher, group = "pitching", season = 2025)
        
        for event in atbat.get("playEvents", []):
            
            if not is_valid_pitch(event):
                continue
            else:
                # Extract relevant pitch info
                pitch_number = event.get("pitchNumber")
                count = event.get("count", {})
                pitch_key = pbp_hash_key(gamePk,atbat_key,pitch_number)
                # Append to context
                sequence.append({
                    "pitchKey": pitch_key,
                    "atbatKey": atbat_key,
                    "pitchNumber": pitch_number,
                    "batterId": batter,
                    "pitcherId": pitcher,
                    "balls": count.get("balls", 0),
                    "strikes": count.get("strikes", 0),
                    "outs": count.get("outs", 0),
                    "pitchResult": event.get("details", {}).get("description", {}),
                    "pitchType": event.get("details", {}).get("type", {}).get("description", ""),
                    "speed": event.get("pitchData", {}).get("startSpeed", None),
                    "typeConfidence":event.get("pitchData",{}).get('typeConfidence',{}),
                    "spinRate": event.get("pitchData",{}).get('breaks',{}).get('spinRate',{}),
                    "spinDirection": event.get("pitchData",{}).get('breaks',{}).get('spinDirection',{})
                })
        all_sequences.append(sequence)


In [44]:
with sqlite3.connect('database/mlb_db.db', timeout = 10) as conn:
    cursor = conn.cursor()

    cursor.execute("DROP TABLE IF EXISTS GAMES;")
    
    cursor.execute("""
    CREATE TABLE IF NOT EXISTS GAMES (
    gamePk INTEGER PRIMARY KEY,
    gametime TEXT,
    season INTEGER, 
    awayId INTEGER,
    homeId INTEGER,
    weather TEXT,
    attendance TEXT,
    venue TEXT
    );
    """)
    
    cursor.executemany("""
    INSERT OR IGNORE INTO GAMES (gamePk, gametime, season, awayId, homeId, weather, attendance, venue)
        VALUES (:gamePk, :gametime, :season, :awayId, :homeId, :weather, :attendance, :venue)
        ;
    """, games)
    cursor.execute("DROP TABLE IF EXISTS TEAMS;")
    cursor.execute("""            
    CREATE TABLE IF NOT EXISTS TEAMS (
    teamId INTEGER PRIMARY KEY,
    teamName TEXT,
    league TEXT,
    venue TEXT,
    division TEXT
    );
    """)
    
    conn.commit()
    cursor.executemany("""
    INSERT OR IGNORE INTO TEAMS (teamId, teamName, league, venue, division) 
    VALUES (:teamId, :name, :league, :venue, :division)
    ;
    """, teams)
    
    conn.commit() 
    cursor.execute("DROP TABLE IF EXISTS GAME_STATS;")
    cursor.execute("""
    CREATE TABLE IF NOT EXISTS GAME_STATS (
    statsKey TEXT PRIMARY KEY,
    entity TEXT,
    scope BINARY, 
    "group" TEXT, --GROUP is SQL keyword
    stat TEXT,
    val TEXT,
    gamePk INTEGER, 
    positionsPlayed TEXT
    );
    """)
    
    cursor.executemany("""
    INSERT OR IGNORE INTO GAME_STATS (statsKey, entity, scope, "group", stat, val, gamePk, positionsPlayed)
    VALUES(:statsKey, :entity, :scope, :group, :stat, :val, :gamePk, :positionsPlayed)
    ;
    """, game_stats)
    conn.commit()
    cursor.execute("DROP TABLE IF EXISTS SEASON_STATS;")
    cursor.execute("""
    CREATE TABLE IF NOT EXISTS SEASON_STATS (
    statsKey TEXT PRIMARY KEY,
    entity TEXT,
    scope TEXT,
    "group" TEXT,
    stat TEXT,
    val TEXT,
    season INTEGER,
    updateDate DATETIME
    );
    """)
    conn.commit()

    cursor.executemany("""
    INSERT OR IGNORE INTO SEASON_STATS (statsKey, entity, scope, "group", stat, val, season, updateDate)
    VALUES(:statsKey, :entity, :scope, :group, :stat, :val, :season, :updateDate)
    ;
    """, season_stats)
    conn.commit()
    
    cursor.execute("DROP TABLE IF EXISTS AT_BATS;")
    cursor.execute("""
    CREATE TABLE IF NOT EXISTS AT_BATS (
    atbatKey TEXT PRIMARY KEY,
    gameIndex INTEGER,
    batterId INTEGER,
    pitcherId INTEGER, 
    splits TEXT,
    inning INTEGER,
    result TEXT,
    batterSide TEXT,
    pitcherHand TEXT
    );
    """)
    conn.commit()

    cursor.executemany("""
    INSERT OR IGNORE INTO AT_BATS (atbatKey, gameIndex, batterId, pitcherId, splits, inning, result, batterSide, pitcherHand)
    VALUES(:atbatKey, :gameIndex, :batterId, :pitcherId, :splits, :inning, :result, :batterSide, :pitcherHand)
    ;
    """, at_bats)

        
    cursor.execute("DROP TABLE IF EXISTS AT_BAT_SEQUENCE;")
    cursor.execute("""
    CREATE TABLE IF NOT EXISTS AT_BAT_SEQUENCE (
    pitchKey TEXT PRIMARY KEY,
    atbatKey TEXT,
    pitchNumber INTEGER,
    batterId TEXT,
    pitcherId TEXT,
    balls INTEGER,
    strikes INTEGER,
    outs INTEGER,
    pitchResult TEXT,
    pitchType TEXT,
    speed REAL,
    typeConfidence REAL,
    spinRate INTEGER,
    spinDirection INTEGER
    );
    """)
    for sequence in all_sequences:
        cursor.executemany("""
        INSERT OR IGNORE INTO AT_BAT_SEQUENCE (pitchKey, atbatKey, pitchNumber, batterId, pitcherId, balls, strikes, outs, pitchResult, pitchType, speed, typeConfidence, spinRate, spinDirection)
        VALUES (:pitchKey, :atbatKey, :pitchNumber, :batterId, :pitcherId, :balls, :strikes, :outs, :pitchResult, :pitchType, :speed, :typeConfidence, :spinRate, :spinDirection)
        """, sequence)
    
    conn.commit()

In [46]:
# Test queries 
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())


# cursor.execute("SELECT ab.inning, balls, strikes FROM AT_BAT_SEQUENCE ABQ JOIN AT_BATS AB ON AB.atbatKey = ABQ.atbatKey LIMIT 5;")
cursor.execute("SELECT * FROM AT_BAT_SEQUENCE LIMIT 5;")
pprint(cursor.fetchall())

conn.close()

[('GAMES',), ('TEAMS',), ('GAME_STATS',), ('SEASON_STATS',), ('AT_BATS',), ('AT_BAT_SEQUENCE',)]
[('f3fadb8b5f88f45093193c08485ec53c97f886cd869cb0f9518d4150b130ba41',
  '32b3f2af29adb293dec4eab05a38b0fe956fa5a54959e1c0032dce372aed7513',
  1,
  '670623',
  '688138',
  0,
  1,
  0,
  'Swinging Strike',
  'Sinker',
  93.7,
  0.93,
  1947,
  242),
 ('70023283b06e86ddde9e573e1624903abb8e82ffe08d4d11e456272053a682a4',
  '32b3f2af29adb293dec4eab05a38b0fe956fa5a54959e1c0032dce372aed7513',
  2,
  '670623',
  '688138',
  0,
  2,
  0,
  'Called Strike',
  'Cutter',
  89.1,
  0.89,
  2416,
  185),
 ('f02342e6221763146fa7bd2b68307a482fccb63cfea7da6268f01640803531e7',
  '32b3f2af29adb293dec4eab05a38b0fe956fa5a54959e1c0032dce372aed7513',
  3,
  '670623',
  '688138',
  0,
  2,
  0,
  'In play, out(s)',
  'Sweeper',
  86.2,
  0.88,
  2876,
  51),
 ('8d7db4e1111bb77d11b4f8321bcfbdd247b5dd98b89fe558f06bd89ac3e0cdf8',
  '62cba0d658ba0b62c1098dfc25470f3e8b1a73bcf20f9efbf82989ecd716a70a',
  1,
  '514888',
 

In [None]:

examples = []
sequence = []
def build_pitch_context(atbat_data,season_stats):
    for pitch in atbat_data:
        current_pitch = pitch.get("pitchType")
    
        # Only create example if there's at least one prior pitch in the game
        if sequence:
            examples.append({
                "X": sequence.copy(),  # Copy to prevent mutation
                "y": current_pitch,
                "context": {
                    "balls": pitch["balls"],
                    "strikes": pitch["strikes"],
                    "outs": pitch["outs"],
                    "batter_side": pitch["batter_side"],
                    "pitcher_hand": pitch["pitcher_hand"],
                    "pitches_per_inning": pitch["pitches_per_inning"].c
                }
            })
    
        sequence.append(current_pitch) #First pitch will have no context

