In [24]:
import requests
import pandas as pd
import time
import os
import sys
import json

In [None]:
# Make the project root (the parent of this notebook's working directory)
# importable; point this further up if the notebook is nested deeper.
project_root = os.path.abspath("..")
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [29]:
%load_ext autoreload
%autoreload 2

In [None]:
def get_update_data(id:int, timeout: float = 10.0):
    """Fetch one roster update from the MLB The Show 25 API.

    Args:
        id: Roster update identifier, sent as the ``id`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the server answers with a status other than 200.
        requests.exceptions.RequestException: If the request cannot be
            completed (connection error, timeout).
    """
    url = "https://mlb25.theshow.com/apis/roster_update.json"
    # requests has no default timeout; without one a stalled connection
    # blocks the notebook cell indefinitely.
    response = requests.get(url, params={"id": id}, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data: {response.status_code}")
    return response.json()

def get_player_data(uuid:str, timeout: float = 10.0):
    """Fetch a single item (player card) from the MLB The Show 25 API.

    This is a best-effort call: every failure is reported with ``print`` and
    turned into a ``None`` return instead of an exception.

    Args:
        uuid: Item identifier, sent as the ``uuid`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body, or ``None`` when the status is not 200 or the
        request/decoding raised.
    """
    url = "https://mlb25.theshow.com/apis/item.json"
    try:
        # requests has no default timeout; a stalled connection would
        # otherwise hang the caller's loop forever.
        response = requests.get(url, params={"uuid": uuid}, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed for UUID {uuid} — status {response.status_code}")
            return None
        return response.json()
    except Exception as e:
        print(f"Exception while requesting UUID {uuid}: {e}")
        return None
    
def get_cached_player_data(uuid: str, cache_dir: str = "./cache-5-23", timeout: float = 10.0):
    """Return item data for ``uuid``, reading from a JSON file cache when possible.

    The cache holds one file per item, ``player_<uuid>.json``, inside
    ``cache_dir`` (created if missing). On a miss the item is fetched from the
    MLB The Show 25 API and written to the cache.

    Args:
        uuid: Item identifier, sent as the ``uuid`` query parameter.
        cache_dir: Directory holding the cache files.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The decoded JSON payload, or ``None`` when the API answered with a
        status other than 200 or the request raised.
    """
    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, f"player_{uuid}.json")

    if os.path.exists(cache_path):
        try:
            with open(cache_path, "r", encoding="utf-8") as f:
                cached = json.load(f)
        except (OSError, ValueError) as e:
            # A truncated or corrupt file (e.g. an interrupted earlier run)
            # is treated as a miss instead of failing every later call.
            print(f"[CACHE CORRUPT → API] {uuid}: {e}")
        else:
            print(f"[CACHE HIT] {uuid}")
            return cached
    else:
        print(f"[CACHE MISS → API] {uuid}")

    url = "https://mlb25.theshow.com/apis/item.json"
    try:
        response = requests.get(url, params={"uuid": uuid}, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed for UUID {uuid} — status {response.status_code}")
            return None
        data = response.json()
    except Exception as e:
        print(f"Exception while fetching UUID {uuid}: {e}")
        return None

    # Write to a sibling temp file and rename it into place so an interrupted
    # write can never leave a half-written cache entry behind.
    tmp_path = f"{cache_path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f)
        os.replace(tmp_path, cache_path)
    except OSError as e:
        # Caching is an optimisation; the fetched data is still returned.
        print(f"Could not write cache for UUID {uuid}: {e}")
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return data

### ***Collect data from MLB API:***

In [None]:
# Roster update to pull from the API.
ROSTER_UPDATE_ID = 8

# Fixed column order; also guarantees the columns exist when no rows are collected,
# so later lookups such as df_changes["player_id"] do not raise KeyError.
CHANGE_COLUMNS = ["player_name", "player_id", "old_overall", "new_overall", "upgrade_label"]

update_data = get_update_data(ROSTER_UPDATE_ID)

# Extract roster update data from API
players = []
for player in update_data.get("attribute_changes") or []:
    name = player.get("name")
    old_overall = player.get("old_rank")
    new_overall = player.get("current_rank")

    # Without both ratings there is no label to compute, and comparing None
    # with an int would raise TypeError and abort the whole cell.
    if old_overall is None or new_overall is None:
        print(f"Skipping {name} - missing overall rating")
        continue

    # "item" may be absent or null; either way the UUID is then unknown.
    item = player.get("item") or {}
    uuid = item.get("uuid")

    # Label: 1 when the overall rating went up, 0 otherwise.
    label = 1 if new_overall > old_overall else 0

    players.append({
        "player_name": name,
        "player_id": uuid,
        "old_overall": old_overall,
        "new_overall": new_overall,
        "upgrade_label": label
    })

# Convert to data frame
df_changes = pd.DataFrame(players, columns=CHANGE_COLUMNS)

# Extract player attribute data from API
uuids = df_changes["player_id"].dropna().unique()

# Output column -> key read from the item payload. The pitching columns keep
# this notebook's "_per_9" names although the payload keys end in "_per_bf".
ATTRIBUTE_KEYS = {
    "overall_rating": "ovr",
    "is_hitter": "is_hitter",
    # Hitting attributes
    "contact_left": "contact_left",
    "contact_right": "contact_right",
    "power_left": "power_left",
    "power_right": "power_right",
    "vision": "plate_vision",
    "discipline": "plate_discipline",
    # Pitching attributes
    "hits_per_9": "hits_per_bf",
    "k_per_9": "k_per_bf",
    "bb_per_9": "bb_per_bf",
    "hr_per_9": "hr_per_bf",
}

player_attributes = []

for uuid in uuids:
    try:
        data = get_cached_player_data(uuid)

        if not data:
            print(f"Skipping UUID {uuid} - no data returned")
            continue

        # One flat record per player; player_name and player_id lead the columns.
        record = {"player_name": data.get("name"), "player_id": uuid}
        for column, key in ATTRIBUTE_KEYS.items():
            record[column] = data.get(key)
        player_attributes.append(record)

    except Exception as e:
        print(f"Error fetching data for UUID {uuid}: {e}")

    finally:
        # Throttle on every iteration. Previously `continue` skipped this
        # pause after a failed request, which is when backing off matters most.
        time.sleep(0.25)

# Summarise instead of dumping every record into the cell output.
print(f"Collected attributes for {len(player_attributes)} of {len(uuids)} players")
print(df_changes.count())


In [43]:
# Load roster update 12 through the project's data_loader helper and preview
# the first rows. Judging by the recorded output, the frame has the same
# columns as df_changes built above — TODO confirm against data_loader.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from prediction_model.data_loader import load_roster_update_data
roster_update_data = load_roster_update_data(12)
roster_update_data.head()

Unnamed: 0,player_name,player_id,old_overall,new_overall,upgrade_label
0,José Ramírez,9f7525e81c88f596b6b62c68427d4df7,91,92,1
1,Tarik Skubal,b2585f509345e30749a913d76f462bc3,89,91,1
2,Ketel Marte,8b04eba77f2245a80433843a3b8264a4,89,91,1
3,Shohei Ohtani,2ad8d7208c18f493011f8cf0426d0fad,91,91,0
4,Bobby Witt Jr.,39bd515b8dce9d0de8565f8f6bc0463a,91,91,0


In [36]:
# Load attributes for every distinct, non-null player_id in roster_update_data.
# The second argument is a cache directory path relative to the notebook's
# working directory (the recorded output shows cache misses followed by API
# calls) — exact caching semantics live in data_loader; TODO confirm.
from prediction_model.data_loader import load_player_attributes
uuids = roster_update_data["player_id"].dropna().unique().tolist()
attributes_df = load_player_attributes(uuids, "../data/caches/post-ru-6-13")


[Cache miss] 9f7525e81c88f596b6b62c68427d4df7
Calling API for UUID 9f7525e81c88f596b6b62c68427d4df7
[Cache miss] b2585f509345e30749a913d76f462bc3
Calling API for UUID b2585f509345e30749a913d76f462bc3
[Cache miss] 8b04eba77f2245a80433843a3b8264a4
Calling API for UUID 8b04eba77f2245a80433843a3b8264a4
[Cache miss] 2ad8d7208c18f493011f8cf0426d0fad
Calling API for UUID 2ad8d7208c18f493011f8cf0426d0fad
[Cache miss] 39bd515b8dce9d0de8565f8f6bc0463a
Calling API for UUID 39bd515b8dce9d0de8565f8f6bc0463a
[Cache miss] 514cce4a132d7b9e56401205f68d9c04
Calling API for UUID 514cce4a132d7b9e56401205f68d9c04
[Cache miss] 8234cce56c28676c57fe7edfd27ead32
Calling API for UUID 8234cce56c28676c57fe7edfd27ead32
[Cache miss] 5a58eebb09702f888abeccdba79546a6
Calling API for UUID 5a58eebb09702f888abeccdba79546a6
[Cache miss] 1c5d43a0f69492a369dba76354a3fe52
Calling API for UUID 1c5d43a0f69492a369dba76354a3fe52
[Cache miss] 47cd0dad82d80e6545d96c0fe0fe8da1
Calling API for UUID 47cd0dad82d80e6545d96c0fe0fe8da1


In [37]:
# Non-null count per column, as a quick completeness check of attributes_df.
attributes_df.count()

player_name       631
player_id         631
overall_rating    631
is_hitter         631
contact_left      631
contact_right     631
power_left        631
power_right       631
vision            631
discipline        631
hits_per_9        631
k_per_9           631
bb_per_9          631
hr_per_9          631
dtype: int64

In [None]:
# Collect live-series UUIDs via the project's data_loader helper and report how
# many were found. The recorded output suggests the helper pages through
# listings 25 at a time and returns a set — TODO confirm against data_loader.
from prediction_model.data_loader import get_live_series_uuids_from_listings
live_series_uuids = get_live_series_uuids_from_listings()
print(f"Total live series UUIDs found: {len(live_series_uuids)}")


✅ Page 1 fetched. Total UUIDs so far: 25
✅ Page 2 fetched. Total UUIDs so far: 50
✅ Page 3 fetched. Total UUIDs so far: 75
✅ Page 4 fetched. Total UUIDs so far: 100
✅ Page 5 fetched. Total UUIDs so far: 125
✅ Page 6 fetched. Total UUIDs so far: 150
✅ Page 7 fetched. Total UUIDs so far: 175
✅ Page 8 fetched. Total UUIDs so far: 200
✅ Page 9 fetched. Total UUIDs so far: 225
✅ Page 10 fetched. Total UUIDs so far: 250
✅ Page 11 fetched. Total UUIDs so far: 275
✅ Page 12 fetched. Total UUIDs so far: 300
✅ Page 13 fetched. Total UUIDs so far: 325
✅ Page 14 fetched. Total UUIDs so far: 350
✅ Page 15 fetched. Total UUIDs so far: 375
✅ Page 16 fetched. Total UUIDs so far: 400
✅ Page 17 fetched. Total UUIDs so far: 425
✅ Page 18 fetched. Total UUIDs so far: 450
✅ Page 19 fetched. Total UUIDs so far: 475
✅ Page 20 fetched. Total UUIDs so far: 500
✅ Page 21 fetched. Total UUIDs so far: 525
✅ Page 22 fetched. Total UUIDs so far: 550
✅ Page 23 fetched. Total UUIDs so far: 575
✅ Page 24 fetched. Tota

{'1000bdd9dc4f3e5599ebcca8e966d5a2', '459983f19ebdf6261bdca3292b360382', '5ab6a5ddc40b5cec25ec18811e43a3e7', 'e05ada5cfb652fec8f14335270bf3785', '289319b9e51a79f61087c4e66a73659d', 'e5ee16ec09671af24141c35df4fb9991', '5e946476809f30de607f5b0b47ac49a4', 'c48f5a2d585c74f35c5abee9a8182559', '632cd7b2a37e8945bddb243a0423fde3', 'f5be2367840a312d1d6db3aa9c5f4740', '9147d9126ccf1b3261a9014b9ce98c81', 'f42523df3fafe56f111474dc70679dcf', '14e4cee178d88fb9aa346dbcc11f2873', '5247fbfc9eb76e05dd0e1b4912fab4f4', 'fb55274ed339c7f48dc377f5ce1e3e9d', 'bfe339860b048949369fc6945cea504d', '4d20fd028bfd46edd5968bd62adf4f73', '9c1ffce10234eb3b15bf318f2f5ec588', 'ff40ff2de9282e8d350635d54fc0bdfc', '4d66dccf027bcd3c73c457a062275639', 'f900c61208841a8347dd4ddbea47b173', '912a6a926b8dcba4d0e28cc3d3d288f4', 'cce4c4d7b34f965073ad5c8e70e090a1', '7c237fddfd9f6c39ab306c4cc7289746', 'e0f1a741eaa73fdf40972a4472d02acb', 'f20c1bc2bdf4a87617d7324d7b788759', '19388c9ff528bfad4e1528a79e97daa8', 'b1183560ea6301d6e151c959b6

### ***Split data into pitchers and hitters***

In [None]:
# Convert player_attributes to DataFrame
# NOTE(review): this rebinds attributes_df, replacing any frame loaded earlier
# through load_player_attributes.
attributes_df = pd.DataFrame(player_attributes)

# Split up in game attribute data by pitchers and hitters.
# Rows whose is_hitter is missing land in neither frame.
hitters_df = attributes_df[attributes_df["is_hitter"] == True].copy()
pitchers_df = attributes_df[attributes_df["is_hitter"] == False].copy()

# Merge is_hitter into the roster update info.
# Dropping a previously merged is_hitter first keeps this cell re-runnable:
# otherwise a second run yields is_hitter_x / is_hitter_y and the lookups
# below raise KeyError. One row per player_id on the right-hand side keeps
# the left join from multiplying roster rows.
df_changes = df_changes.drop(columns=["is_hitter"], errors="ignore").merge(
    attributes_df[["player_id", "is_hitter"]].drop_duplicates(subset="player_id"),
    on="player_id",
    how="left"
)

# Split roster update data into pitchers and hitters
hitters_changes = df_changes[df_changes["is_hitter"] == True]
pitchers_changes = df_changes[df_changes["is_hitter"] == False]


In [None]:
# Row counts for each hitter/pitcher split.
for caption, frame in (
    ("Hitters with attribute data:", hitters_df),
    ("Pitchers with attribute data:", pitchers_df),
    ("Hitters in roster update:", hitters_changes),
    ("Pitchers in roster update:", pitchers_changes),
):
    print(caption, len(frame))

# Cross-check: the two roster splits should add up to the total unless some
# rows have no is_hitter value.
split_total = len(hitters_changes) + len(pitchers_changes)
print("Total roster update entries:", len(df_changes))
print("Sum of hitter + pitcher entries:", split_total)

print(df_changes["is_hitter"].value_counts())

### ***Load real-life (IRL) stats from the FanGraphs CSV:***

In [None]:
# Location of the FanGraphs leaderboard export. The default is the original
# author's machine-specific path; set the FANGRAPHS_CSV environment variable to
# point at the file on any other machine instead of editing this cell.
fangraphs_csv = os.environ.get(
    "FANGRAPHS_CSV",
    r"C:\Users\mgams\Projects\mlb-market-app\prediction_model\data\fangraphs-leaderboards.csv",
)
irl_stats_df = pd.read_csv(fangraphs_csv)
print(irl_stats_df.head())

### ***Merge roster update labels with IRL stats by player name:***

In [None]:
# Normalised join key on both sides: lower-cased, surrounding whitespace removed.
df_changes["player_name_clean"] = df_changes["player_name"].str.lower().str.strip()
irl_stats_df["player_name_clean"] = irl_stats_df["Name"].str.lower().str.strip()

# FanGraphs columns used as model features ("or whatever FanGraphs columns are present").
stat_cols = ["AVG", "OBP", "SLG", "HR"]

# A name that appears more than once in the stats cannot be matched reliably
# by name alone: joining on it would duplicate roster rows and attach the
# wrong player's stats. Leave those names unmatched so dropna removes them
# below. Missing names are excluded too, because pandas matches NaN keys to
# each other in a merge.
ambiguous = irl_stats_df["player_name_clean"].duplicated(keep=False)
if ambiguous.any():
    print(f"Excluding {int(ambiguous.sum())} FanGraphs rows with ambiguous (duplicated) names")
irl_unique = irl_stats_df.loc[~ambiguous & irl_stats_df["player_name_clean"].notna()]

merged_df = df_changes.merge(
    irl_unique,
    on="player_name_clean",
    how="left"
)

# Keep labels plus features, and only players that found a stats match.
# .copy() makes training_df an independent frame rather than a slice of merged_df.
training_df = merged_df[[
    "player_name", "player_id", "old_overall", "new_overall", "upgrade_label",
    *stat_cols
]].dropna(subset=stat_cols).copy()
training_df.count()

In [None]:
# List of player UUIDs from your labeled dataset
uuids = training_df["player_id"].dropna().unique()

player_attributes = []

for uuid in uuids:
    try:
        data = get_player_data(uuid)

        # get_player_data returns None on any failure and has already printed
        # the reason; skip explicitly rather than tripping over None.get.
        if not data:
            print(f"Skipping UUID {uuid} - no data returned")
            continue

        # Pull key hitting attributes from API response. Vision, discipline and
        # overall use the same payload keys as the cached collection loop
        # earlier in this notebook ("plate_vision", "plate_discipline", "ovr").
        player_attributes.append({
            "player_name": data.get("name"),
            "player_id": uuid,
            "contact_left": data.get("contact_left"),
            "contact_right": data.get("contact_right"),
            "power_left": data.get("power_left"),
            "power_right": data.get("power_right"),
            "vision": data.get("plate_vision"),
            "discipline": data.get("plate_discipline"),
            "overall_rating": data.get("ovr")
        })

    except Exception as e:
        print(f"Error fetching data for UUID {uuid}: {e}")

    finally:
        # These requests are uncached, so pause between them like the cached loop does.
        time.sleep(0.25)

### ***Train the model:***

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

In [None]:
# Feature matrix (IRL batting stats) and binary target (upgraded or not).
feature_cols = ["AVG", "OBP", "SLG", "HR"]  # you can expand this later
X = training_df[feature_cols]
y = training_df["upgrade_label"]

# Hold out 20% of the rows for evaluation; fixed seed for repeatable splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline model: 100-tree random forest, seeded.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Precision / recall / F1 on the held-out rows.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Horizontal bar chart of how much each feature contributed to the forest.
importances = clf.feature_importances_
ax = plt.gca()
ax.barh(feature_cols, importances)
ax.set_xlabel("Feature Importance")
ax.set_title("Random Forest: IRL Stats vs. SDS Upgrades")
plt.show()