In [3]:
import requests
import pandas as pd
import time
import os
import json

In [7]:
# Function to retrieve roster update data
def get_update_data(id: int, timeout: float = 10.0):
    """Fetch one roster update from the MLB The Show 25 API.

    Args:
        id: Roster update identifier, sent as the ``id`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which can hang the
            notebook on a stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the server answers with any status other than 200.
            Connection errors and timeouts raised by ``requests`` propagate
            unchanged.
    """
    url = "https://mlb25.theshow.com/apis/roster_update.json"
    response = requests.get(url, params={"id": id}, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data: {response.status_code}")
    return response.json()

# Function to get player data from MLBTS25
def get_player_data(uuid: str, timeout: float = 10.0):
    """Fetch a single item record from the MLB The Show 25 API.

    Failures are reported with ``print`` rather than raised, so a loop over
    many UUIDs can keep going after one bad request.

    Args:
        uuid: Item identifier, sent as the ``uuid`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The decoded JSON body, or ``None`` when the status is not 200 or the
        request/decoding raised an exception.
    """
    url = "https://mlb25.theshow.com/apis/item.json"
    try:
        response = requests.get(url, params={"uuid": uuid}, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed for UUID {uuid} — status {response.status_code}")
            return None
        return response.json()
    except Exception as e:
        print(f"Exception while requesting UUID {uuid}: {e}")
        return None
    
def get_cached_player_data(uuid: str, timeout: float = 10.0):
    """Return item data for ``uuid``, using ``./cache`` as a disk cache.

    A readable ``./cache/player_<uuid>.json`` is returned without touching
    the network. Otherwise the item is requested from the API, written to
    the cache, and returned.

    Args:
        uuid: Item identifier; used as the ``uuid`` query parameter and as
            part of the cache file name.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON data, or ``None`` when the status is not 200 or the
        request/decoding raised an exception. Problems are reported with
        ``print`` rather than raised.
    """
    cache_dir = "./cache"
    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, f"player_{uuid}.json")

    # Serve from disk when possible. An unreadable or truncated file (for
    # example from an interrupted earlier run) is treated as a cache miss
    # instead of raising on every later call.
    if os.path.exists(cache_path):
        try:
            with open(cache_path, "r") as f:
                return json.load(f)
        except (OSError, ValueError) as e:
            print(f"Ignoring unreadable cache for UUID {uuid}: {e}")

    # Otherwise, fetch from the API.
    url = "https://mlb25.theshow.com/apis/item.json"
    try:
        response = requests.get(url, params={"uuid": uuid}, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed for UUID {uuid} — status {response.status_code}")
            return None
        data = response.json()
    except Exception as e:
        print(f"Exception while fetching UUID {uuid}: {e}")
        return None

    # Write to a sibling temp file and rename it into place, so a crash
    # mid-write never leaves a half-written cache entry behind. Caching is
    # best effort: a failed write must not discard data already fetched.
    tmp_path = f"{cache_path}.tmp"
    try:
        with open(tmp_path, "w") as f:
            json.dump(data, f)
        os.replace(tmp_path, cache_path)
    except (OSError, TypeError, ValueError) as e:
        print(f"Could not cache UUID {uuid}: {e}")
    return data

Collect roster-update and player-attribute data from the MLB The Show 25 API:

In [None]:
# Roster update to pull; sent to the API as the `id` query parameter.
ROSTER_UPDATE_ID = 8

# Fixed column order, so df_changes has these columns even when the update
# contains no attribute changes (an empty list would otherwise give a
# column-less frame and break later `df_changes["player_id"]` lookups).
CHANGE_COLUMNS = ["player_name", "player_id", "old_overall", "new_overall", "upgrade_label"]

players = []

update_data = get_update_data(ROSTER_UPDATE_ID)

# Extract roster update data from API
for player in update_data.get("attribute_changes", []):
    name = player.get("name")
    old_overall = player.get("old_rank")
    new_overall = player.get("current_rank")
    # Extract players UUID (None when the entry carries no item)
    item = player.get("item") or {}
    uuid = item.get("uuid")
    # A label needs both ratings: comparing None with a number raises
    # TypeError, so entries with a missing rank are skipped, not labelled.
    # NOTE(review): assumes the ranks are numeric — confirm against the API.
    if old_overall is None or new_overall is None:
        print(f"Skipping {name} - missing old or new overall")
        continue
    # Add label if upgraded or not (unchanged and downgraded both map to 0)
    label = 1 if new_overall > old_overall else 0
    # Append to our running list of players
    players.append({
        "player_name": name,
        "player_id": uuid,
        "old_overall": old_overall,
        "new_overall": new_overall,
        "upgrade_label": label
    })

# Convert to data frame
df_changes = pd.DataFrame(players, columns=CHANGE_COLUMNS)

# Extract player attribute data from API
uuids = df_changes["player_id"].dropna().unique()

# Pause between players so the API is not hit in a tight loop.
REQUEST_DELAY_SECONDS = 0.25

player_attributes = []

for uuid in uuids:
    try:
        data = get_cached_player_data(uuid)

        if not data:
            print(f"Skipping UUID {uuid} - no data returned")
            continue

        # Pull key hitting attributes
        player_attributes.append({
            "player_name": data.get("name"),
            "player_id": uuid,
            "contact_left": data.get("contact_left"),
            "contact_right": data.get("contact_right"),
            "power_left": data.get("power_left"),
            "power_right": data.get("power_right"),
            "vision": data.get("plate_vision"),
            "discipline": data.get("plate_discipline"),
            "overall_rating": data.get("ovr")
        })

    except Exception as e:
        print(f"Error fetching data for UUID {uuid}: {e}")

    finally:
        # Runs on the `continue` path too: a failed request is exactly when
        # the next one should not be fired off immediately.
        time.sleep(REQUEST_DELAY_SECONDS)

# Summarise instead of dumping every record into the cell output.
print(f"Collected attributes for {len(player_attributes)} of {len(uuids)} players")
print(player_attributes[:3])
print(df_changes.count())


Load real-life (IRL) batting statistics from the FanGraphs leaderboard export:

In [None]:
# Location of the FanGraphs leaderboard export. Set the FANGRAPHS_CSV
# environment variable to read a copy stored elsewhere; the default is the
# path this notebook was originally written against.
FANGRAPHS_CSV = os.environ.get(
    "FANGRAPHS_CSV",
    r"C:\Users\mgams\Projects\mlb-market-app\prediction_model\data\fangraphs-leaderboards.csv",
)
irl_stats_df = pd.read_csv(FANGRAPHS_CSV)
# Last expression of the cell, so the preview renders as a table.
irl_stats_df.head()

Merge the roster-update labels with the real-life stats on a normalized player name:

In [None]:
# Join key: lower-cased, whitespace-trimmed player name on both tables.
df_changes["player_name_clean"] = df_changes["player_name"].str.lower().str.strip()
irl_stats_df["player_name_clean"] = irl_stats_df["Name"].str.lower().str.strip()

# With a name-only join, a name that appears on several FanGraphs rows cannot
# be attributed to one roster entry, and the left join would emit one training
# row per duplicate. Rows with a missing name are excluded too, because merge
# matches NaN keys with each other. Both kinds are left out of the lookup.
fg_key = irl_stats_df["player_name_clean"]
unusable = fg_key.isna() | fg_key.duplicated(keep=False)
if unusable.any():
    print(f"Ignoring {int(unusable.sum())} FanGraphs rows with missing or non-unique names")
irl_stats_unique = irl_stats_df.loc[~unusable]

merged_df = df_changes.merge(
    irl_stats_unique,
    on="player_name_clean",
    how="left",
    validate="many_to_one",  # asserts each roster row matches at most one stat line
)

label_cols = ["player_name", "player_id", "old_overall", "new_overall", "upgrade_label"]
stat_cols = ["AVG", "OBP", "SLG", "HR"]  # or whatever FanGraphs columns are present

# Keep only players that matched a stat line; .copy() makes training_df an
# independent frame rather than a slice of merged_df.
training_df = merged_df[label_cols + stat_cols].dropna(subset=stat_cols).copy()
training_df.count()

In [None]:
# List of player UUIDs from your labeled dataset
uuids = training_df["player_id"].dropna().unique()

player_attributes = []

for uuid in uuids:
    try:
        data = get_player_data(uuid)

        # Pull key hitting attributes from API response
        player_name = data.get("name")
        contact_left = data.get("contact_left")
        contact_right = data.get("contact_right")
        power_left = data.get("power_left")
        power_right = data.get("power_right")
        vision = data.get("vision")
        discipline = data.get("discipline")
        overall = data.get("overall")

        player_attributes.append({
            "player_name": player_name,
            "player_id": uuid,
            "contact_left": contact_left,
            "contact_right": contact_right,
            "power_left": power_left,
            "power_right": power_right,
            "vision": vision,
            "discipline": discipline,
            "overall_rating": overall
        })

    except Exception as e:
        print(f"Error fetching data for UUID {uuid}: {e}")

Train a random-forest classifier to predict rating upgrades from the real-life stats:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

In [None]:
# Inputs are the real-life batting stats; the target is the upgrade flag.
feature_cols = ["AVG", "OBP", "SLG", "HR"]  # you can expand this later
X = training_df.loc[:, feature_cols]
y = training_df.loc[:, "upgrade_label"]

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Fit a baseline random forest; fit() returns the estimator itself.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Horizontal bar chart of how much each stat contributes to the forest.
importances = clf.feature_importances_
ax = plt.gca()
ax.barh(feature_cols, importances)
ax.set_xlabel("Feature Importance")
ax.set_title("Random Forest: IRL Stats vs. SDS Upgrades")
plt.show()