In [1]:
import requests
import pandas as pd
import time
import os
import json

In [2]:
# Function to retrieve roster update data
def get_update_data(id: int, timeout: float = 10.0):
    """Fetch a single roster update from the MLB The Show 25 API.

    Args:
        id: Roster update identifier, sent as the ``id`` query parameter.
            The name shadows the ``id`` builtin but is kept so existing
            keyword callers continue to work.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the server answers with a status code other than 200.
        Network-level errors raised by ``requests`` (including timeouts) are
        not caught here and propagate to the caller.
    """
    url = "https://mlb25.theshow.com/apis/roster_update.json"
    params = {"id": id}
    # requests applies no timeout unless one is passed, so a stalled
    # connection would otherwise block the notebook kernel indefinitely.
    response = requests.get(url, params=params, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data: {response.status_code}")
    return response.json()

# Function to get player data from MLBTS25
def get_player_data(uuid: str, timeout: float = 10.0):
    """Fetch one item (player card) from the MLB The Show 25 API.

    This is a best-effort call: every failure is reported with ``print`` and
    turned into ``None`` so that a loop over many UUIDs keeps going.

    Args:
        uuid: Item identifier, sent as the ``uuid`` query parameter.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The decoded JSON body on success, or ``None`` when the status code is
        not 200 or any exception is raised while requesting/decoding.
    """
    url = "https://mlb25.theshow.com/apis/item.json"
    params = {"uuid": uuid}
    try:
        # Without an explicit timeout a stalled connection would hang the
        # whole collection loop; a timeout surfaces as an exception below.
        response = requests.get(url, params=params, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed for UUID {uuid} — status {response.status_code}")
            return None
        return response.json()
    except Exception as e:
        # Deliberately broad: network errors and undecodable bodies are both
        # treated as "no data for this player".
        print(f"Exception while requesting UUID {uuid}: {e}")
        return None
    
def get_cached_player_data(uuid: str, cache_dir: str = "./cache-5-23", timeout: float = 10.0):
    """Return a player's item data, reading from an on-disk JSON cache first.

    Each player is stored as ``player_<uuid>.json`` inside ``cache_dir``. On a
    miss the item is requested from the MLB The Show 25 API and, when the
    request succeeds, written to the cache for later calls.

    Args:
        uuid: Item identifier; used both as the ``uuid`` query parameter and
            in the cache file name.
        cache_dir: Directory holding the cache files; created if missing.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The decoded JSON data, or ``None`` when the API answers with a status
        other than 200 or an exception occurs while fetching or caching.
    """
    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, f"player_{uuid}.json")

    # If cache exists, return from disk
    if os.path.exists(cache_path):
        try:
            with open(cache_path, "r") as f:
                cached = json.load(f)
        except (OSError, ValueError) as e:
            # An unreadable or truncated file (e.g. from an interrupted run)
            # must not poison the cache forever: fall through and refetch.
            print(f"[CACHE CORRUPT → REFETCH] {uuid}: {e}")
        else:
            print(f"[CACHE HIT] {uuid}")
            return cached

    print(f"[CACHE MISS → API] {uuid}")
    # Otherwise, fetch from API and cache it
    url = "https://mlb25.theshow.com/apis/item.json"
    params = {"uuid": uuid}
    try:
        # Explicit timeout so one stalled request cannot hang the whole loop.
        response = requests.get(url, params=params, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed for UUID {uuid} — status {response.status_code}")
            return None
        data = response.json()
        # Write to a sibling temp file and rename it into place so a crash
        # mid-write never leaves a half-written file under the real name.
        tmp_path = f"{cache_path}.tmp"
        with open(tmp_path, "w") as f:
            json.dump(data, f)
        os.replace(tmp_path, cache_path)
        return data
    except Exception as e:
        print(f"Exception while fetching UUID {uuid}: {e}")
        return None

### ***Collect data from the MLB The Show 25 API:***

In [3]:
# Roster update to analyse; change this one value to target another update.
ROSTER_UPDATE_ID = 8

# Column layout of df_changes. Passing it explicitly keeps the columns present
# even when the update contains no usable attribute changes.
ROSTER_COLUMNS = ["player_name", "player_id", "old_overall", "new_overall", "upgrade_label"]

players = []

update_data = get_update_data(ROSTER_UPDATE_ID)

# Extract roster update data from API
for player in update_data.get("attribute_changes", []):
    name = player.get("name")
    old_overall = player.get("old_rank")
    new_overall = player.get("current_rank")
    # Extract players UUID (the "item" entry may be absent or null)
    item = player.get("item") or {}
    uuid = item.get("uuid")

    # An entry without both ratings cannot be labelled, and comparing None
    # with a number raises TypeError, so report it and move on.
    if old_overall is None or new_overall is None:
        print(f"Skipping {name} - missing overall rating in roster update")
        continue

    # Add label if upgraded or not: 1 only for a strictly higher rating, so
    # unchanged and downgraded players share label 0.
    # NOTE(review): assumes both ranks are numeric - confirm against the API.
    label = 1 if new_overall > old_overall else 0

    # Append to our running list of players
    players.append({
        "player_name": name,
        "player_id": uuid,
        "old_overall": old_overall,
        "new_overall": new_overall,
        "upgrade_label": label
    })

# Convert to data frame
df_changes = pd.DataFrame(players, columns=ROSTER_COLUMNS)

# Pause between players so the API is not hit in a tight loop.
REQUEST_DELAY_SECONDS = 0.25

# Extract player attribute data from API
uuids = df_changes["player_id"].dropna().unique()

player_attributes = []

for uuid in uuids:
    try:
        data = get_cached_player_data(uuid)

        if not data:
            print(f"Skipping UUID {uuid} - no data returned")
            continue

        # Add this player's attributes to the in-memory list.
        # The pitching columns are named *_per_9 but are read from the API's
        # *_per_bf fields; the column names are kept for downstream code.
        player_attributes.append({
            # Key attributes
            "player_name": data.get("name"),
            "player_id": uuid,
            "overall_rating": data.get("ovr"),
            "is_hitter": data.get("is_hitter"),
            # Key hitting attributes
            "contact_left": data.get("contact_left"),
            "contact_right": data.get("contact_right"),
            "power_left": data.get("power_left"),
            "power_right": data.get("power_right"),
            "vision": data.get("plate_vision"),
            "discipline": data.get("plate_discipline"),
            # Key pitching attributes
            "hits_per_9": data.get("hits_per_bf"),
            "k_per_9": data.get("k_per_bf"),
            "bb_per_9": data.get("bb_per_bf"),
            "hr_per_9": data.get("hr_per_bf")
        })

    except Exception as e:
        print(f"Error fetching data for UUID {uuid}: {e}")

    finally:
        # In a `finally` so the `continue` above cannot bypass the pause:
        # failed fetches are exactly when the API should not be hammered.
        # NOTE(review): this also sleeps on cache hits, which never touch the
        # network; skipping it there needs a hit/miss signal from the cache.
        time.sleep(REQUEST_DELAY_SECONDS)

# Summarise instead of dumping every record into the cell output.
print(f"Collected attributes for {len(player_attributes)} of {len(uuids)} players")
print(df_changes.count())


[CACHE HIT] 3e67d1f24ebdbbbe125e7040442f6e84
[CACHE HIT] 514cce4a132d7b9e56401205f68d9c04
[CACHE HIT] 9f7525e81c88f596b6b62c68427d4df7
[CACHE HIT] 39bd515b8dce9d0de8565f8f6bc0463a
[CACHE HIT] b6c47a7beb6a9d82dd6149b6643935e4
[CACHE HIT] f99d32d586c4b1d5e120e25770fe5238
[CACHE HIT] b2585f509345e30749a913d76f462bc3
[CACHE HIT] 639c83a3e92c8214eb47a462844527f8
[CACHE HIT] 47cd0dad82d80e6545d96c0fe0fe8da1
[CACHE HIT] 8b04eba77f2245a80433843a3b8264a4
[CACHE HIT] 19f960c299756abbce4d8dbe2e096a1d
[CACHE HIT] 1c5d43a0f69492a369dba76354a3fe52
[CACHE HIT] 20c361dae26c548a297c7bd43b5f6ffa
[CACHE HIT] 8234cce56c28676c57fe7edfd27ead32
[CACHE HIT] ef09d35efd8dcac068723cec8ee2b9b2
[CACHE HIT] 3acb546257bdab68f5cef5ea5fb15785
[CACHE HIT] 0598165d8f7d7e39e209a26fe86d8d6a
[CACHE HIT] bbcb0a179f5728cd615974a511d38bd4
[CACHE HIT] e6b8cc4c9b4ebcb86c67831435057134
[CACHE HIT] 6201006ca73bfb4966550c5228932da6
[CACHE HIT] b1a47a251e71206f1308be13a3cdf43d
[CACHE HIT] 5a58eebb09702f888abeccdba79546a6
[CACHE HIT

### ***Split data into pitchers and hitters***

In [4]:
# Convert player_attributes to DataFrame
attributes_df = pd.DataFrame(player_attributes)

# Split up in game attribute data by pitchers and hitters.
# Explicit == True / == False (rather than ~mask) so rows whose is_hitter is
# missing land in neither frame.
hitters_df = attributes_df[attributes_df["is_hitter"] == True].copy()
pitchers_df = attributes_df[attributes_df["is_hitter"] == False].copy()

# Merge is_hitter into the roster update info.
# Any is_hitter column left by a previous run of this cell is dropped first;
# otherwise a rerun would produce is_hitter_x / is_hitter_y and the filters
# below would raise KeyError.
df_changes = (
    df_changes
    .drop(columns=["is_hitter"], errors="ignore")
    .merge(
        attributes_df[["player_id", "is_hitter"]],
        on="player_id",
        how="left",
        # Fail loudly if attributes_df ever holds a player twice, which would
        # silently duplicate roster rows.
        validate="many_to_one",
    )
)

# Split roster update data into pitchers and hitters.
# .copy() matches hitters_df / pitchers_df and makes these independent frames.
hitters_changes = df_changes[df_changes["is_hitter"] == True].copy()
pitchers_changes = df_changes[df_changes["is_hitter"] == False].copy()


In [5]:
# Report how many rows ended up in each split
split_sizes = [
    ("Hitters with attribute data:", hitters_df),
    ("Pitchers with attribute data:", pitchers_df),
    ("Hitters in roster update:", hitters_changes),
    ("Pitchers in roster update:", pitchers_changes),
]
for label, frame in split_sizes:
    print(label, len(frame))

# Optional: quick cross-check totals
n_total = len(df_changes)
n_split = len(hitters_changes) + len(pitchers_changes)
print("Total roster update entries:", n_total)
print("Sum of hitter + pitcher entries:", n_split)

# Breakdown of the is_hitter flag across the roster update
hitter_flag_counts = df_changes["is_hitter"].value_counts()
print(hitter_flag_counts)

Hitters with attribute data: 327
Pitchers with attribute data: 335
Hitters in roster update: 327
Pitchers in roster update: 335
Total roster update entries: 662
Sum of hitter + pitcher entries: 662
is_hitter
False    335
True     327
Name: count, dtype: int64


### ***Load real-life (IRL) stats from the FanGraphs leaderboard export***

In [None]:
# Location of the FanGraphs leaderboard export. Set the
# FANGRAPHS_LEADERBOARDS_CSV environment variable to run this notebook on
# another machine; the default is the original author's local path.
IRL_STATS_CSV = os.environ.get(
    "FANGRAPHS_LEADERBOARDS_CSV",
    r"C:\Users\mgams\Projects\mlb-market-app\prediction_model\data\fangraphs-leaderboards.csv",
)

irl_stats_df = pd.read_csv(IRL_STATS_CSV)
# Last expression of the cell, so the preview renders as a table.
irl_stats_df.head()

### ***Merge the roster changes with the IRL stats on player name***

In [None]:
def _clean_player_name(names):
    """Build a join key from a Series of names: ASCII-folded, lower-cased, trimmed.

    Decomposing to NFKD and dropping the non-ASCII code points removes
    diacritics, so accented and unaccented spellings of a name share one key.
    """
    return (
        names.str.normalize("NFKD")
        .str.encode("ascii", errors="ignore")
        .str.decode("ascii")
        .str.lower()
        .str.strip()
    )


df_changes["player_name_clean"] = _clean_player_name(df_changes["player_name"])
irl_stats_df["player_name_clean"] = _clean_player_name(irl_stats_df["Name"])

# A name that appears more than once in the stats file would make the left
# merge emit one row per match, duplicating that roster entry (and possibly
# placing the same player in both the train and the test split). Keep the
# first row per name and say which names were affected.
duplicate_mask = irl_stats_df.duplicated(subset="player_name_clean", keep="first")
if duplicate_mask.any():
    duplicate_names = sorted(
        irl_stats_df.loc[duplicate_mask, "player_name_clean"].dropna().unique()
    )
    print(
        f"Ignoring {int(duplicate_mask.sum())} duplicate FanGraphs rows for: {duplicate_names}"
    )
irl_stats_unique = irl_stats_df.loc[~duplicate_mask]

merged_df = df_changes.merge(
    irl_stats_unique,
    on="player_name_clean",
    how="left",
    validate="many_to_one",
)

training_df = merged_df[[
    "player_name", "player_id", "old_overall", "new_overall", "upgrade_label",
    "AVG", "OBP", "SLG", "HR"  # or whatever FanGraphs columns are present
]]

# Roster entries with no matching stats row carry NaN in the stat columns.
training_df = training_df.dropna(subset=["AVG", "OBP", "SLG", "HR"])
training_df.count()

In [None]:
# List of player UUIDs from your labeled dataset
uuids = training_df["player_id"].dropna().unique()

# NOTE: this rebinds `uuids` and `player_attributes` from the earlier
# collection cell; re-run that cell before re-running the hitter/pitcher split.
player_attributes = []

for uuid in uuids:
    try:
        # Go through the on-disk cache like the earlier collection cell, so
        # players that were already downloaded are not requested again.
        data = get_cached_player_data(uuid)

        if not data:
            print(f"Skipping UUID {uuid} - no data returned")
            continue

        # Pull key hitting attributes from API response. The keys match the
        # ones the earlier collection cell reads from the same endpoint
        # ("plate_vision", "plate_discipline", "ovr").
        player_attributes.append({
            "player_name": data.get("name"),
            "player_id": uuid,
            "contact_left": data.get("contact_left"),
            "contact_right": data.get("contact_right"),
            "power_left": data.get("power_left"),
            "power_right": data.get("power_right"),
            "vision": data.get("plate_vision"),
            "discipline": data.get("plate_discipline"),
            "overall_rating": data.get("ovr")
        })

    except Exception as e:
        print(f"Error fetching data for UUID {uuid}: {e}")

### ***Train a random forest to predict upgrades from IRL stats***

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

In [None]:
# Feature matrix and target: IRL batting stats in, upgrade flag out
feature_cols = ["AVG", "OBP", "SLG", "HR"]  # you can expand this later
X, y = training_df[feature_cols], training_df["upgrade_label"]

# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a baseline random forest; fit() returns the estimator itself
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

# Visualise how much each stat contributes to the forest's decisions
importances = clf.feature_importances_
plt.barh(feature_cols, importances)
plt.title("Random Forest: IRL Stats vs. SDS Upgrades")
plt.xlabel("Feature Importance")
plt.show()