In [3]:
import requests
from datetime import date, timedelta, datetime
import time
import sys
import statsapi
from pprint import pprint
import json
import os
import csv
# from google.cloud import bigquery



In [2]:
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.expanduser("~/Documents/jupyter/MLB/credentials/mlb-analysis-463501-869e729270f3.json")

# client = bigquery.Client()
# print("Project:", client.project)

Project: mlb-analysis-463501


In [4]:
# Notebook-level accumulators: collected game records and the ids of games already seen.
games, seen_games = list(), set()

In [5]:
# Reference dates used by the rest of the notebook.
today = date.today()                                  # datetime.date object
yesterday = (today - timedelta(days=1)).isoformat()   # 'YYYY-MM-DD' string
all_teams = '2025-07-03'

In [7]:
#Function for team season stats
def team_stat_data(team_id, group, season, stats='season'):
    """Fetch the first stat split for a team from the MLB Stats API.

    Args:
        team_id: Team identifier placed in the request path.
        group: Stat group sent as the ``group`` query parameter
            (the notebook calls this with "hitting", "pitching", "fielding").
        season: Season sent as the ``season`` query parameter.
        stats: Stat type sent as the ``stats`` query parameter.

    Returns:
        The first element of ``stats[0]["splits"]`` in the JSON response, or
        an empty dict when the response has no stats or no splits.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    url = f"https://statsapi.mlb.com/api/v1/teams/{team_id}/stats"
    params = {
        "season": season,
        "group": group,
        "stats": stats,
        # Reads the notebook-level `today`; NOTE(review): confirm the API
        # actually honours an "updateDate" parameter.
        "updateDate": today
    }
    # Without a timeout a stalled connection would block the notebook forever.
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    stat_blocks = r.json().get("stats") or []
    splits = stat_blocks[0].get("splits") if stat_blocks else None
    # An empty result is reported as {} instead of raising IndexError.
    return splits[0] if splits else {}

In [8]:
#Function for player season stats
def player_stat_data(player_id, group, season, stats='season'):
    """Fetch the first stat split for a player from the MLB Stats API.

    Args:
        player_id: Player identifier placed in the request path.
        group: Stat group sent as the ``group`` query parameter.
        season: Season sent as the ``season`` query parameter.
        stats: Stat type sent as the ``stats`` query parameter.

    Returns:
        The first element of ``stats[0]["splits"]`` in the JSON response, or
        an empty dict when the response has no stats or no splits.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    # The query string is built only from `params`; embedding it in the URL as
    # well would send stats/group/season twice.
    url = f"https://statsapi.mlb.com/api/v1/people/{player_id}/stats"
    params = {
        "season": season,
        "group": group,
        "stats": stats,
        # NOTE(review): the player is already identified by the path; confirm
        # whether this extra parameter is needed at all.
        "player_id": player_id,
        # Reads the notebook-level `today`.
        "updateDate": today
    }
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    stat_blocks = r.json().get("stats") or []
    splits = stat_blocks[0].get("splits") if stat_blocks else None
    # An empty result is reported as {} instead of raising IndexError.
    return splits[0] if splits else {}

In [None]:
#Normalize name for pulling special fields from boxscore
def normalize_name(name_raw):
    """Turn a "Last, First" name into "First Last".

    Only the first comma acts as the separator; both halves are trimmed.
    A name without a comma is returned exactly as given.
    """
    surname, comma, given = name_raw.partition(',')
    if not comma:
        return name_raw
    return " ".join((given.strip(), surname.strip())).strip()

In [28]:
#Get team season stats
# Builds one long-format row per (team, group, stat) and writes them to a CSV.
teams = statsapi.get("teams",{"sportId": 1})['teams']
currentSeason = 2025
team_season_stats = []
for team in teams:
    teamId = team['id']
    teamName = team['name']
    for group in ["hitting","pitching","fielding"]:
        season_stats = team_stat_data(teamId, group, currentSeason, stats='season')
        # Only store rows when the split actually carries a dict of stats
        if 'stat' in season_stats and isinstance(season_stats['stat'], dict):
            team_season_stats.extend(
                {
                    "domain": "team",
                    "range": "season",
                    "group": group,
                    "stat": key,
                    "val": val,
                    "season": currentSeason,
                    "teamId": teamId,
                    "updateDate": today
                }
                for key, val in season_stats['stat'].items()
            )
            print(f"Stored team {group} season stats for {teamName}")

if team_season_stats:
    # open() does not create missing folders, so make sure the target exists.
    os.makedirs("team_stats", exist_ok=True)
    with open(f"team_stats/{today}_season_stats_{currentSeason}.csv", mode="w", newline="", encoding="utf-8") as file:
        # The header is taken from the keys of the first row; all rows share them.
        writer = csv.DictWriter(file, fieldnames=list(team_season_stats[0].keys()))
        writer.writeheader()
        writer.writerows(team_season_stats)
else:
    # Indexing [0] on an empty list would raise; report instead.
    print("No team season stats collected; no file written.")
        

Stored team hitting season stats for Athletics
Stored team pitching season stats for Athletics
Stored team fielding season stats for Athletics
Stored team hitting season stats for Pittsburgh Pirates
Stored team pitching season stats for Pittsburgh Pirates
Stored team fielding season stats for Pittsburgh Pirates
Stored team hitting season stats for San Diego Padres
Stored team pitching season stats for San Diego Padres
Stored team fielding season stats for San Diego Padres
Stored team hitting season stats for Seattle Mariners
Stored team pitching season stats for Seattle Mariners
Stored team fielding season stats for Seattle Mariners
Stored team hitting season stats for San Francisco Giants
Stored team pitching season stats for San Francisco Giants
Stored team fielding season stats for San Francisco Giants
Stored team hitting season stats for St. Louis Cardinals
Stored team pitching season stats for St. Louis Cardinals
Stored team fielding season stats for St. Louis Cardinals
Stored tea

In [54]:
#PARAMETERS: start/end date
# Date window for the schedule query; the end of the window is the notebook-level `today`.
start_date = '2025-07-03'
end_date = today
print(start_date, end_date, sep="\n")

2025-07-03
2025-07-05


In [56]:
# Get schedule for start/end date
schedule = statsapi.schedule(start_date, end_date, sportId=1, leagueId=104)
games = []
seen_games = set()


#Pull available game data from schedule
for game in schedule:
    # Take the id from the schedule entry itself; relying on a `gamePk`
    # left over from another cell fails on a fresh kernel.
    gamePk = game['game_id']

    #Remove duplicates: keep only the first schedule entry for each game id
    if gamePk in seen_games:
        continue
    seen_games.add(gamePk)

    # Reset per game so a missing label cannot inherit the previous game's value.
    weather = None
    attendance = None
    venue = None

    #Get weather / attendance / venue from the boxscore info list
    boxscore = statsapi.get("game_boxscore", {'gamePk': gamePk})
    for item in boxscore.get('info', []):
        label = item.get('label')
        if label == "Weather":
            weather = item.get('value')
        elif label == "Att":
            attendance = item.get('value')
        elif label == 'Venue':
            venue = item.get('value')

    games.append({
        "gamePk": gamePk,
        "gametime": game['game_datetime'],
        "season": game['game_date'][:4],
        "awayId": game['away_id'],
        "awayName": game['away_name'],
        "homeId": game['home_id'],
        "homeName": game['home_name'],
        "weather": weather,
        # "attendance": attendance,
        "venue": venue
    })


# pprint(games)
if games:
    # open() does not create missing folders, so make sure the target exists.
    os.makedirs("games", exist_ok=True)
    # The file is named after the date part of the first game's gametime.
    with open(f"games/{games[0]['gametime'][:10]}_games.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=list(games[0].keys()))
        writer.writeheader()
        writer.writerows(games)
else:
    # Indexing games[0] on an empty schedule would raise; report instead.
    print(f"No games found between {start_date} and {end_date}; no file written.")

In [None]:
team_game_stats = []
player_game_stats = []

#Get team stats for games today and player stats by game
# Both lists hold long-format rows: one row per (game, side[, player], stat).
for g in games:
    gamePk = g['gamePk']
    boxscore = statsapi.get("game_boxscore", {'gamePk': gamePk})
    matchup = (
        f"{boxscore['teams']['away']['team']['abbreviation']} vs. "
        f"{boxscore['teams']['home']['team']['abbreviation']}"
    )
    for side in ["home", "away"]:
        #Add game stats for each team in todays games
        team_stats = boxscore["teams"][side]["teamStats"]
        # All three groups are team-level rows (batting was previously
        # mislabelled with domain "player").
        for stat_group in ("batting", "pitching", "fielding"):
            for stat, val in team_stats[stat_group].items():
                team_game_stats.append({
                    "domain": "team",
                    "group": stat_group,
                    "range": "game",
                    "stat": stat,
                    "val": val,
                    "gamePk": gamePk,
                    "side": side
                })
        print(f"Loaded {side} team data from game: {matchup}")

        #Player stats (KAV + game + player ID) only if they played any positions
        for player_data in boxscore["teams"][side]["players"].values():
            player_id = player_data["person"]["id"]
            player_name = player_data['person']['fullName']
            url = f"https://statsapi.mlb.com/api/v1/people/{player_id}/stats"
            params = {"gamePk": gamePk, "stats": "gameLog"}
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()

            stat_blocks = response.json().get('stats') or []
            if not stat_blocks:
                continue
            splits = stat_blocks[0].get('splits') or []
            if not splits:
                # Previously splits[0] raised IndexError for an empty game log.
                continue

            # NOTE(review): this takes the first gameLog split; confirm the
            # endpoint really restricts the log to `gamePk`.
            player_stats = splits[0]
            if 'positionsPlayed' not in player_stats:
                continue

            # Read the group from the response instead of a loop variable
            # leaked from another cell.
            # NOTE(review): assumes the block carries {"group": {"displayName": ...}};
            # yields None when it does not.
            player_group = (stat_blocks[0].get('group') or {}).get('displayName')

            for stat, val in player_stats['stat'].items():
                player_game_stats.append({
                    "domain": "player",
                    "group": player_group,
                    "range": "game",
                    "stat": stat,
                    "val": val,
                    "gamePk": gamePk,
                    "playerId": player_id,
                    "side": side,
                    "positionsPlayed": player_stats['positionsPlayed']
                })
            print(f"Loaded player data for {player_name} in game: {matchup}")


if games:
    # Output files are named after the date part of the first game's gametime.
    game_date = games[0]['gametime'].split('T')[0]

    if team_game_stats:
        os.makedirs("team_stats", exist_ok=True)
        with open(f"team_stats/{game_date}_team_game_stats.csv", mode="w", newline="", encoding="utf-8") as file:
            writer = csv.DictWriter(file, fieldnames=list(team_game_stats[0].keys()))
            writer.writeheader()
            writer.writerows(team_game_stats)
    else:
        print("No team game stats collected; no team file written.")

    if player_game_stats:
        os.makedirs("player_stats", exist_ok=True)
        with open(f"player_stats/{game_date}_player_game_stats.csv", mode="w", newline="", encoding="utf-8") as file:
            writer = csv.DictWriter(file, fieldnames=list(player_game_stats[0].keys()))
            writer.writeheader()
            writer.writerows(player_game_stats)
    else:
        print("No player game stats collected; no player file written.")
else:
    # Indexing games[0] on an empty list would raise; report instead.
    print("No games loaded; nothing to write.")

In [None]:
# #Get special stats for pitchers from schedule
# for game in schedule:
#     gamePk = game['game_id']
#     boxscore = statsapi.get("game_boxscore", {'gamePk': gamePk})
#     #pprint(boxscore['info'])
#     for item in boxscore['info']:
#         player_stats_temp = []
#         if item['label'] == "Pitches-Strikes":
#             statType = 'pitching_player_game'
#             raw = item['value']
#             entries = [entry.strip() for entry in raw.strip('.').split(';')]
#             for entry in entries:
#                 parts = entry.rsplit(" ", 1)
#                 if len(parts) == 2:
#                     name_part, stat_part = parts
#                     player_stats_temp.append((name_part, stat_part))
#             for nameraw, record in player_stats_temp:
#                 normalized_name = normalize_name(nameraw)
#                 result = statsapi.lookup_player(normalized_name)
#                 if result:
#                     player_id = result[0]["id"]
#                     player_game_stats.append({
#                         "statType": statType,
#                         "stat":item['label'],
#                         "val":record,
#                         "gamePk":gamePk,
#                         "playerId":player_id
#                     })
#                     print(f"{item['label']} stored for {result[0]['fullName']}")
#                 else:
#                     print(f"{normalized_name}: NOT FOUND")

#         elif item['label'] == "Batters faced":
#             statType = "pitching_player_game"
#             raw = item['value']
#             entries = [entry.strip() for entry in raw.strip('.').split(';')]
#             for entry in entries:
#                 parts = entry.rsplit(" ", 1)
#                 if len(parts) == 2:
#                     name_part, stat_part = parts
#                     player_stats_temp.append((name_part, stat_part))
#             for nameraw, stat in player_stats_temp:
#                 normalized_name = normalize_name(nameraw)
#                 result = statsapi.lookup_player(normalized_name)
#                 if result:
#                     player_id = result[0]["id"]
#                     player_game_stats.append({
#                         "statType": statType,
#                         "stat":item['label'],
#                         "val":stat,
#                         "gamePk":gamePk,
#                         "playerId":player_id
#                     })
#                     print(f"{item['label']} stored for {result[0]['fullName']}")
#                 else:
#                     print(f"{normalized_name}: NOT FOUND")
#         elif item['label'] == "Groundouts-flyouts":
#             player_stats_temp = []
#             statType = "pitching_player_game"
#             raw = item['value']
#             entries = [entry.strip() for entry in raw.strip('.').split(';')]
#             for entry in entries:
#                 parts = entry.rsplit(" ", 1)
#                 if len(parts) == 2:
#                     nameraw, stat = parts
#                     player_stats_temp.append((nameraw, stat))
#                 for nameraw, stat in player_stats_temp:
#                     normalized_name = normalize_name(nameraw)
#                     result = statsapi.lookup_player(normalized_name)
#                     if result:
#                         player_id = result[0]["id"]
#                         player_game_stats.append({
#                             "statType": statType,
#                             "stat":item['label'],
#                             "val":stat,
#                             "gamePk":gamePk,
#                             "playerId":player_id
#                         })
#                         print(f"{item['label']} stored for {result[0]['fullName']}")
#                     else:
#                         print(f"{normalized_name}: NOT FOUND")


In [97]:
#Get player season stats
def player_season_stats(player_id, group, season=2025):
    """Fetch a player's season stats from the MLB Stats API.

    Args:
        player_id: Player identifier placed in the request path.
        group: Accepted for signature compatibility; it is not sent to the
            API and does not influence the result.
        season: Season sent as the ``season`` query parameter.

    Returns:
        ``[stat_dict, {'season': season}]`` where ``stat_dict`` is the
        ``stat`` mapping of the first split, or ``None`` when the response
        contains no stats or no splits.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    url = f"https://statsapi.mlb.com/api/v1/people/{player_id}/stats"
    params = {
        "stats": "season",
        "season": season
    }
    response = requests.get(url, params=params, timeout=30)
    # Fail on HTTP errors like the other helpers in this notebook do.
    response.raise_for_status()
    data = response.json()

    # Extract the stats if available; a missing "stats" key counts as "none".
    stat_blocks = data.get("stats") or []
    splits = stat_blocks[0].get("splits") if stat_blocks else None
    if not splits:
        return None
    return [splits[0]["stat"], {'season': season}]

In [None]:
#print(player_game_stats[0].keys())
# Collect season stats once per player that appears in player_game_stats and
# write one CSV per player.
season_stats = []
seen_players = set()

for game_stats in player_game_stats:
    playerId = game_stats['playerId']
    if playerId in seen_players:
        continue
    # Mark immediately so players without data are not requested again.
    seen_players.add(playerId)

    # The helper ignores `group`; pass the season by keyword so it cannot
    # land in the `group` slot.
    data = player_season_stats(playerId, None, season=currentSeason)
    if data is None:
        print(f"No {currentSeason} season stats for player {playerId}")
        continue
    stats, season_info = data
    # The helper wraps the season in a dict; unwrap it so rows, messages and
    # file names carry the plain season value.
    season = season_info['season']

    # Get player info (basic details like name, team, position)
    player_info_url = f"https://statsapi.mlb.com/api/v1/people/{playerId}"
    player_info_resp = requests.get(player_info_url, timeout=30)
    player_info_resp.raise_for_status()
    people = player_info_resp.json().get("people")
    if not people:
        # Without player details there is no name to build the file name from.
        print(f"No player info for player {playerId}")
        continue

    person = people[0]
    player_id = person['id']
    playerName = person['fullName']
    primaryPosition = person['primaryPosition']['abbreviation']

    #Load stats into season_stats
    player_rows = [
        {
            "domain": "player",
            "season": season,
            "stat": stat,
            "val": val,
            "playerId": player_id,
            "primaryPosition": primaryPosition
        }
        for stat, val in stats.items()
    ]
    season_stats.extend(player_rows)
    print(f"Got {season} season stats for {playerName}")

    if player_rows:
        os.makedirs("player_stats", exist_ok=True)
        # Each player's file holds only that player's rows; season_stats keeps
        # the rows for all players.
        with open(f"player_stats/{season}_{playerName}_season_stats.csv", mode="w", newline="", encoding="utf-8") as file:
            writer = csv.DictWriter(file, fieldnames=list(player_rows[0].keys()))
            writer.writeheader()
            writer.writerows(player_rows)
    
