In [12]:
import requests
from datetime import date, timedelta, datetime
import time
import sys
import statsapi
from pprint import pprint
import json
import os
import csv
# from google.cloud import bigquery



In [2]:
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.expanduser("~/Documents/jupyter/MLB/credentials/mlb-analysis-463501-869e729270f3.json")

# client = bigquery.Client()
# print("Project:", client.project)

Project: mlb-analysis-463501


In [14]:
games = []
seen_games = set()

In [133]:
#Helper function for team stats by season
def team_stat_data(team_id, group, season, stats='season'):
    url = f"https://statsapi.mlb.com/api/v1/teams/{team_id}/stats"
    params = {
        "season": season,
        "group": group,
        "stats": stats
    }
    r = requests.get(url, params=params)
    r.raise_for_status()
    return r.json()["stats"][0]["splits"][0]

In [150]:
teams = statsapi.get("teams",{"sportId": 1})['teams']
currentSeason = 2025
team_season_stats = []
for team in teams:
    teamId = team['id']
    teamName = team['name']
    for group in ["hitting","pitching","fielding"]:
        season_stats = team_stat_data(teamId, group, currentSeason, stats='season')
        # Check if the stat key exists and is a dictionary
        if 'stat' in season_stats and isinstance(season_stats['stat'], dict):
            for key, val in season_stats['stat'].items():            
                team_season_stats.append({
                    "statType": f"team_season_{group}",
                    "stat": key,
                    "val": val,
                    "season": currentSeason,
                    "teamId": teamId
                })
            print(f"Stored team {group} stats for {teamName}")
                
            
            
with open(f"team_stats/{currentSeason}_season.csv", mode="w", newline="", encoding="utf-8") as file:
    # Step 2: Create writer with header
    writer = csv.DictWriter(file, fieldnames=team_season_stats[0].keys())

    writer.writeheader()  # Step 3: Write header
    writer.writerows(team_season_stats)  # Step 4: Write rows            
        

Stored team hitting stats for Athletics
Stored team pitching stats for Athletics
Stored team fielding stats for Athletics
Stored team hitting stats for Pittsburgh Pirates
Stored team pitching stats for Pittsburgh Pirates
Stored team fielding stats for Pittsburgh Pirates
Stored team hitting stats for San Diego Padres
Stored team pitching stats for San Diego Padres
Stored team fielding stats for San Diego Padres
Stored team hitting stats for Seattle Mariners
Stored team pitching stats for Seattle Mariners
Stored team fielding stats for Seattle Mariners
Stored team hitting stats for San Francisco Giants
Stored team pitching stats for San Francisco Giants
Stored team fielding stats for San Francisco Giants
Stored team hitting stats for St. Louis Cardinals
Stored team pitching stats for St. Louis Cardinals
Stored team fielding stats for St. Louis Cardinals
Stored team hitting stats for Tampa Bay Rays
Stored team pitching stats for Tampa Bay Rays
Stored team fielding stats for Tampa Bay Rays

In [41]:
#Get today/yesterdays date 
today = date.today()
yesterday = today - timedelta(days = 1)
yesterday = yesterday.isoformat()
all_teams = '2025-07-03'

2025-07-03


In [37]:
#PARAMETERS: start/end date
start_date = yesterday
end_date = yesterday
print(start_date)
print(end_date)

2025-07-03
2025-07-03


In [47]:
# # Get schedule for start/end date
schedule = statsapi.schedule(start_date, end_date, sportId=1, leagueId=104)
games = []
weather = []
seen_games = set()

#Pull only attribute in list into new array and add relevant fields
#gamePks = [game['game_id'] for game in schedule]
pitching_stats = []
#Pull available game data from schedule
#Each new dictionary counts as one row 
for game in schedule:
    gamePk = game['game_id']
    if gamePk in seen_games:
        continue
    #Get weather
    boxscore = statsapi.get("game_boxscore", {'gamePk': gamePk})
    # print(boxscore.keys())
    for item in boxscore['info']:
        if item['label'] == "Weather":
            weather = item['value']
        elif item['label'] == "Att":
            attendance = item['value']
    # sys.exit()
    games.append({
        "gamePk": game['game_id'],
        "gametime": game['game_datetime'],
        "awayId": game['away_id'],
        "homeId": game['home_id'],
        "weather": weather,
        "attendance": attendance
    })
    seen_games.add(game['game_id'])
    
    
#Remove duplicates
# games_dict = {g['gamePk']: g for g in games}
# games = list(games_dict.values())

# pprint(games)
with open(f"games/{games[0]['gametime'][:10]}_games.csv", mode="w", newline="", encoding="utf-8") as file:
    # Step 2: Create writer with header
    writer = csv.DictWriter(file, fieldnames=games[0].keys())

    writer.writeheader()  # Step 3: Write header
    writer.writerows(games)  # Step 4: Write rows     

In [174]:
teams = {}
players = {}

In [98]:
def normalize_name(name_raw):
    if ',' in name_raw:
        last, first = [part.strip() for part in name_raw.split(',', 1)]
        return f"{first} {last}".strip()
    return name_raw

In [108]:
#Get special stats for pitchers from schedule
for game in schedule:
    gamePk = game['game_id']
    boxscore = statsapi.get("game_boxscore", {'gamePk': gamePk})
    #pprint(boxscore['info'])
    for item in boxscore['info']:
        player_stats_temp = []
        if item['label'] == "Pitches-Strikes":
            statType = 'pitching_player_game'
            raw = item['value']
            entries = [entry.strip() for entry in raw.strip('.').split(';')]
            for entry in entries:
                parts = entry.rsplit(" ", 1)
                if len(parts) == 2:
                    name_part, stat_part = parts
                    player_stats_temp.append((name_part, stat_part))
            for nameraw, record in player_stats_temp:
                normalized_name = normalize_name(nameraw)
                result = statsapi.lookup_player(normalized_name)
                if result:
                    player_id = result[0]["id"]
                    player_game_stats.append({
                        "statType": statType,
                        "stat":item['label'],
                        "val":record,
                        "gamePk":gamePk,
                        "playerId":player_id
                    })
                    print(f"{item['label']} stored for {result[0]['fullName']}")
                else:
                    print(f"{normalized_name}: NOT FOUND")

        elif item['label'] == "Batters faced":
            statType = "pitching_player_game"
            raw = item['value']
            entries = [entry.strip() for entry in raw.strip('.').split(';')]
            for entry in entries:
                parts = entry.rsplit(" ", 1)
                if len(parts) == 2:
                    name_part, stat_part = parts
                    player_stats_temp.append((name_part, stat_part))
            for nameraw, stat in player_stats_temp:
                normalized_name = normalize_name(nameraw)
                result = statsapi.lookup_player(normalized_name)
                if result:
                    player_id = result[0]["id"]
                    player_game_stats.append({
                        "statType": statType,
                        "stat":item['label'],
                        "val":stat,
                        "gamePk":gamePk,
                        "playerId":player_id
                    })
                    print(f"{item['label']} stored for {result[0]['fullName']}")
                else:
                    print(f"{normalized_name}: NOT FOUND")
        elif item['label'] == "Groundouts-flyouts":
            player_stats_temp = []
            statType = "pitching_player_game"
            raw = item['value']
            entries = [entry.strip() for entry in raw.strip('.').split(';')]
            for entry in entries:
                parts = entry.rsplit(" ", 1)
                if len(parts) == 2:
                    nameraw, stat = parts
                    player_stats_temp.append((nameraw, stat))
                for nameraw, stat in player_stats_temp:
                    normalized_name = normalize_name(nameraw)
                    result = statsapi.lookup_player(normalized_name)
                    if result:
                        player_id = result[0]["id"]
                        player_game_stats.append({
                            "statType": statType,
                            "stat":item['label'],
                            "val":stat,
                            "gamePk":gamePk,
                            "playerId":player_id
                        })
                        print(f"{item['label']} stored for {result[0]['fullName']}")
                    else:
                        print(f"{normalized_name}: NOT FOUND")


Groundouts-flyouts stored for David Festa
Groundouts-flyouts stored for David Festa
Groundouts-flyouts stored for Justin Topa
Groundouts-flyouts stored for David Festa
Groundouts-flyouts stored for Justin Topa
Groundouts-flyouts stored for Kody Funderburk
Groundouts-flyouts stored for David Festa
Groundouts-flyouts stored for Justin Topa
Groundouts-flyouts stored for Kody Funderburk
Groundouts-flyouts stored for Eury Pérez
Groundouts-flyouts stored for David Festa
Groundouts-flyouts stored for Justin Topa
Groundouts-flyouts stored for Kody Funderburk
Groundouts-flyouts stored for Eury Pérez
Groundouts-flyouts stored for Cade Gibson
Groundouts-flyouts stored for David Festa
Groundouts-flyouts stored for Justin Topa
Groundouts-flyouts stored for Kody Funderburk
Groundouts-flyouts stored for Eury Pérez
Groundouts-flyouts stored for Cade Gibson
Groundouts-flyouts stored for Anthony Bender
Groundouts-flyouts stored for David Festa
Groundouts-flyouts stored for Justin Topa
Groundouts-flyouts

KeyboardInterrupt: 

In [None]:
team_game_stats = []
player_game_stats = []

#Get team stats for games today and player stats by game
for g in games:
    gamePk = g['gamePk']
    boxscore = statsapi.get("game_boxscore", {'gamePk': gamePk})
    for side in ["home", "away"]:
        #Add game stats for each team in todays games
        team_stats = boxscore["teams"][side]["teamStats"]
        # Create data structure {team_game_stats} to hold game data, team statistics and player statistics by season and game
        #Batting stats (KAV + Game structure)
        for stat, val in team_stats['batting'].items():
            team_game_stats.append({
                "statType": "batting_team_game", 
                "stat": stat, 
                "val": val, 
                "gamePk": gamePk,
                "playerId": None,
                "side": side
            })
        for stat, val in team_stats['pitching'].items():
            team_game_stats.append({
                "statType": "pitching_team_game", 
                "stat": stat, 
                "val": val, 
                "gamePk": gamePk,
                "playerId": None,
                "side": side
            })
        for stat, val in team_stats['fielding'].items():
            team_game_stats.append({
                "statType": "fielding_team_game", 
                "stat": stat, 
                "val": val, 
                "gamePk": gamePk,
                "playerId": None,
                "side": side
            })
        print(f"Loaded {side} team data from game: {boxscore['teams']['away']['team']['abbreviation']} vs. {boxscore['teams']['home']['team']['abbreviation']}")
        #Player stats (KAV + game + player ID) only if they played any positions
        
        for player_key, player_data in boxscore["teams"][side]["players"].items():
            player_id = player_data["person"]["id"]  # ✅ This is the integer ID
            url = f"https://statsapi.mlb.com/api/v1/people/{player_id}/stats"
            params = {"gamePk": gamePk, "stats": "gameLog"}
            response = requests.get(url, params=params)
            player_stats = response.json()
            player_name = player_data['person']['fullName']
            if len(player_stats['stats']) > 0:
                stat_group = player_stats['stats'][0]['group']
                print(stat_group)
                continue
                player_stats = player_stats['stats'][0]['splits'][0]
                if any(pos['code'] == 'P' for pos in player_stats['positionsPlayed']):
                    for stat, val in player_stats['stat']:
                        player_game_stats.append({
                            "stat_type": "pitching_player_game", 
                            "stat":stat, 
                            "val":val, 
                            "gamePk":gamePk, 
                            "playerId":player_id,
                            "side":side
                        })
                else:
                    for stat_val in player_stats['stat']:
                        player_game_stats.append({
                            "stat_type": "hitting_player_game", 
                            "stat":stat, 
                            "val":val, 
                            "gamePk":gamePk, 
                            "playerId":player_id,
                            "side":side
                        })
                print(f"Loaded player data for {player_name} in game: {boxscore['teams']['away']['team']['abbreviation']} vs. {boxscore['teams']['home']['team']['abbreviation']}")
                
                

                
            
with open(f"team_stats/{games[0]['gametime'][:10]}_game_stats.csv", mode="w", newline="", encoding="utf-8") as file:
    # Step 2: Create writer with header
    writer = csv.DictWriter(file, fieldnames=team_game_stats[0].keys())

    writer.writeheader()  # Step 3: Write header
    writer.writerows(team_game_stats)  # Step 4: Write rows  
    
with open(f"player_stats/{games[0]['gametime'][:10]}_game_stats.csv", mode="w", newline="", encoding="utf-8") as file:
    # Step 2: Create writer with header
    writer = csv.DictWriter(file, fieldnames=player_game_stats[0].keys())

    writer.writeheader()  # Step 3: Write header
    writer.writerows(player_game_stats)  # Step 4: Write rows      