In [None]:
%pip install requests pandas plotly scikit-learn tqdm
import requests, pandas as pd, json
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [21]:
BASE   = "https://site.api.espn.com/apis/site/v2/sports"
LEAGUE = "basketball/nba"
start  = datetime(2024,10,22)
end    = datetime.today()

# Build your date list
dates = [(start + timedelta(days=i)).strftime("%Y%m%d")
         for i in range((end-start).days+1)]

# Create a Session with retry logic
session = requests.Session()
retry_strategy = Retry(
    total=5,                
    backoff_factor=1,       
    status_forcelist=[429,500,502,503,504],
    allowed_methods=["GET"]
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
session.mount("http://", adapter)

In [22]:
game_ids = []
for d in tqdm(dates, desc="Scoreboards"):
    try:
        resp = session.get(f"{BASE}/{LEAGUE}/scoreboard",
                           params={"dates":d},
                           timeout=5)
        resp.raise_for_status()
        for ev in resp.json().get("events", []):
            if ev["status"]["type"]["completed"]:
                game_ids.append(ev["id"])
    except Exception as e:
        # log and skip this date if it fails repeatedly
        print(f"⚠️  Failed date {d}: {e}")

print(f"Total games fetched: {len(game_ids)}")


Scoreboards: 100%|██████████| 196/196 [00:41<00:00,  4.69it/s]

Total games fetched: 1286





In [23]:
def fetch_stats(gid):
    try:
        summary = session.get(f"{BASE}/{LEAGUE}/summary",
                              params={"event":gid},
                              timeout=5).json()
        out = []
        for team in summary["boxscore"]["players"]:
            blk  = team["statistics"][0]
            keys = blk["keys"]
            for ath in blk["athletes"]:
                row = dict(zip(keys, ath["stats"]),
                           game_id=gid,
                           player=ath["athlete"]["displayName"])
                out.append(row)
        return out
    except Exception as e:
        print(f"⚠️  Failed game {gid}: {e}")
        return []
    
    rows = []
# reduce workers to 5 to ease the load
with ThreadPoolExecutor(max_workers=5) as pool:
    futures = {pool.submit(fetch_stats, gid): gid for gid in game_ids}
    for f in tqdm(as_completed(futures), total=len(futures), desc="Boxscores"):
        rows.extend(f.result())

df = pd.DataFrame(rows)
print("Total player-game rows:", len(df))
df.head(20)


Boxscores: 100%|██████████| 1286/1286 [00:54<00:00, 23.50it/s]

Total player-game rows: 65456





Unnamed: 0,minutes,fieldGoalsMade-fieldGoalsAttempted,threePointFieldGoalsMade-threePointFieldGoalsAttempted,freeThrowsMade-freeThrowsAttempted,offensiveRebounds,defensiveRebounds,rebounds,assists,steals,blocks,turnovers,fouls,plusMinus,points,game_id,player
0,24.0,2-7,1-4,1-2,2.0,1.0,3.0,3.0,0.0,1.0,2.0,0.0,-5.0,6.0,401704635,Patrick Williams
1,31.0,8-12,1-2,4-5,2.0,9.0,11.0,1.0,0.0,0.0,1.0,3.0,-14.0,21.0,401704635,Nikola Vucevic
2,35.0,4-13,0-5,2-2,0.0,8.0,8.0,6.0,1.0,1.0,4.0,1.0,-6.0,10.0,401704635,Coby White
3,37.0,10-17,5-8,2-3,1.0,4.0,5.0,3.0,0.0,0.0,7.0,4.0,-15.0,27.0,401704635,Zach LaVine
4,30.0,5-11,0-1,4-7,1.0,4.0,5.0,3.0,0.0,0.0,3.0,1.0,-22.0,14.0,401704635,Josh Giddey
5,17.0,5-8,1-3,4-4,3.0,2.0,5.0,0.0,1.0,0.0,0.0,2.0,2.0,15.0,401704635,Jalen Smith
6,2.0,0-0,0-0,0-0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,401704635,Dalen Terry
7,5.0,0-3,0-3,0-0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,1.0,8.0,0.0,401704635,Matas Buzelis
8,12.0,1-3,0-2,0-0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,-9.0,2.0,401704635,Julian Phillips
9,8.0,0-1,0-0,0-2,0.0,2.0,2.0,1.0,0.0,0.0,1.0,0.0,4.0,0.0,401704635,Talen Horton-Tucker


In [25]:
numeric_cols = [
    "minutes","points","rebounds","assists",
    "steals","blocks","turnovers","threePointFieldGoalsMade"
]
for c in numeric_cols:
    if c in df:
        df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)

# Prop stats to build lines for
CATS = [
    "points",
    "rebounds",
    "assists",
    "threePointFieldGoalsMade",
    "steals",
    "blocks"
]

In [32]:
top10 = {}
for cat in CATS:
    if cat not in df:
        continue
    avg = df.groupby("player")[cat].mean()
    top10[cat] = avg.nlargest(10).index.tolist()
    print(f"Top 10 for {cat}: {top10[cat][:3]} …")

Top 10 for points: ['Shai Gilgeous-Alexander', 'Nikola Jokic', 'Giannis Antetokounmpo'] …
Top 10 for rebounds: ['Domantas Sabonis', 'Nikola Jokic', 'Ivica Zubac'] …
Top 10 for assists: ['Trae Young', 'Nikola Jokic', 'Tyrese Haliburton'] …
Top 10 for steals: ['Dyson Daniels', 'Dejounte Murray', 'Nikola Jokic'] …
Top 10 for blocks: ['Victor Wembanyama', 'Chet Holmgren', 'Walker Kessler'] …


In [33]:
dist_records = []
for cat, players in top10.items():
    for player in players:
        stats = df.loc[df.player == player, cat]
        dist_records.append({
            "player": player,
            "stat":    cat,
            "mean":    float(stats.mean()),
            "std":     float(stats.std()),
            "q25":     float(stats.quantile(0.25)),
            "q50":     float(stats.quantile(0.50)),
            "q75":     float(stats.quantile(0.75))
        })

dist_df = pd.DataFrame(dist_records)
# only the columns we care about now
dist_df = dist_df[["player","stat","mean","std","q25","q50","q75"]]
dist_df.head(10)

Unnamed: 0,player,stat,mean,std,q25,q50,q75
0,Shai Gilgeous-Alexander,points,31.365854,9.080434,27.0,31.0,36.0
1,Nikola Jokic,points,28.832215,10.276845,23.0,28.0,35.0
2,Giannis Antetokounmpo,points,28.808219,9.883679,24.25,30.0,35.0
3,Luka Doncic,points,28.039216,8.076272,21.0,29.5,33.75
4,Anthony Edwards,points,26.791411,9.877305,21.0,27.0,33.0
5,Paolo Banchero,points,26.092784,8.892978,20.0,25.0,32.0
6,Kevin Durant,points,25.796875,7.93116,22.0,26.0,30.25
7,Devin Booker,points,25.761905,8.788484,18.0,26.0,33.0
8,Jalen Brunson,points,25.262411,10.149346,20.0,25.0,32.0
9,LaMelo Ball,points,25.191489,9.290157,19.25,25.0,31.0


In [None]:
'''
import plotly.express as px
fig = px.bar(
    dist_df[dist_df.stat=="points"].nlargest(10, "mean"),
    x="player", y=["mean","q75"],
    barmode="group",
    title="Top 10 Scorers: Mean vs 75th Percentile"
)
fig.update_layout(xaxis_tickangle=-45)
fig.show()

'''

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [34]:
dist_records = []
for cat, players in top10.items():
    for player in players:
        stats = df.loc[df.player == player, cat]
        dist_records.append({
            "player": player,
            "stat":    cat,
            "mean":    float(stats.mean()),
            "std":     float(stats.std()),
            "q25":     float(stats.quantile(0.25)),
            "q50":     float(stats.quantile(0.50)),
            "q75":     float(stats.quantile(0.75))
        })

dist_df = pd.DataFrame(dist_records)
# only the columns we care about now
dist_df = dist_df[["player","stat","mean","std","q25","q50","q75"]]
dist_df.head(10)

Unnamed: 0,player,stat,mean,std,q25,q50,q75
0,Shai Gilgeous-Alexander,points,31.365854,9.080434,27.0,31.0,36.0
1,Nikola Jokic,points,28.832215,10.276845,23.0,28.0,35.0
2,Giannis Antetokounmpo,points,28.808219,9.883679,24.25,30.0,35.0
3,Luka Doncic,points,28.039216,8.076272,21.0,29.5,33.75
4,Anthony Edwards,points,26.791411,9.877305,21.0,27.0,33.0
5,Paolo Banchero,points,26.092784,8.892978,20.0,25.0,32.0
6,Kevin Durant,points,25.796875,7.93116,22.0,26.0,30.25
7,Devin Booker,points,25.761905,8.788484,18.0,26.0,33.0
8,Jalen Brunson,points,25.262411,10.149346,20.0,25.0,32.0
9,LaMelo Ball,points,25.191489,9.290157,19.25,25.0,31.0


In [36]:
# 9) Build LLM prompt & write to file
snippet = dist_df.to_dict(orient="records")
prompt = f"""
You are an expert sportsbook AI. Below is each star’s full-season distribution for key stat categories:

{json.dumps(snippet, indent=2)}

For each player+stat, propose an OVER/UNDER prop line (to the nearest 0.5)
and explain your reasoning in one sentence (e.g. usage, consistency, upside).

Format:
Player (Stat):
 • Points OVER/UNDER X.Y — Reason...
 • Rebounds OVER/UNDER X.Y — Reason...
 • Assists OVER/UNDER X.Y — Reason...
 • 3PM OVER/UNDER X.Y — Reason...
 • Steals OVER/UNDER X.Y — Reason...
 • Blocks OVER/UNDER X.Y — Reason...
"""
with open("prompt.txt","w") as f:
    f.write(prompt)

print("🔑 Prompt written to prompt.txt. Next, run locally:")
print("   ollama run mist/mistral-7b-v0.1 --prompt-file prompt.txt")

🔑 Prompt written to prompt.txt. Next, run locally:
   ollama run mist/mistral-7b-v0.1 --prompt-file prompt.txt


In [37]:
import re

# 1. Load the raw LLM text
with open("prompt.txt", "r") as f:
    raw = subprocess.run(
        ["cat", "prompt.txt"], capture_output=True, text=True
    ).stdout  # or paste your ollama stdout here

# 2. Parse each “Player (Stat): …” block
lines = []
pattern = re.compile(r"^\s*\d+\.\s*(.+?)\s*\((.+?)\):\s*[-–]\s*(OVER|UNDER)\s*([\d\.]+)\s*—\s*(.+)$", re.MULTILINE)
for match in pattern.finditer(raw):
    player, stat, side, value, reason = match.groups()
    lines.append({
        "player": player.strip(),
        "stat": stat.strip(),
        "side": side,
        "line": float(value),
        "reason": reason.strip()
    })

recs_df = pd.DataFrame(lines)
recs_df


In [38]:
final = recs_df.merge(
    dist_df[["player","stat","q50","q75"]],
    on=["player","stat"],
    how="left"
)
final = final[["player","stat","side","line","q50","q75","reason"]]
final


KeyError: 'player'