# From Draft to Stardom: Predicting NBA Success from College and Combine Stats

### Goal: Predict whether a drafted NBA player becomes an All-Star, starter, or bench player using pre-draft data.

In [None]:
## NBA endpoints needed
# --> /DraftHistory
# --> /DraftCombineStats
# I also need raw player stats from college games, which will be web-scraped.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.basketball-reference.com/draft/NBA_2025.html'

# Fetch the draft page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors (for example
# a rate-limit response) here instead of as a confusing parse failure below.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Find the draft table and fail with a clear message if the page layout changed.
table = soup.find('table', {'id': 'stats'})
if table is None:
    raise ValueError(f"Draft table with id 'stats' not found at {url}")

# Column headers: the text of every <th> inside <thead>, minus the first one
# (the rank header), because the row loop below only reads <td> cells.
# NOTE(review): if <thead> contains more than one header row, <th> cells from
# all of those rows are collected here -- confirm the columns line up.
headers = [th.get_text() for th in table.find('thead').find_all('th')][1:]

# Extract all body rows
rows = table.find('tbody').find_all('tr')

data = []
for row in rows:
    # Skip header rows repeated inside the table body
    if 'thead' in row.get('class', []):
        continue

    row_data = []
    player_link = None

    for cell in row.find_all('td'):
        text = cell.get_text(strip=True)
        if cell.get('data-stat') == 'player':
            a_tag = cell.find('a')
            if a_tag is not None and a_tag.get('href'):
                player_link = 'https://www.basketball-reference.com' + a_tag['href']
            # Keep the player's name even when the cell has no profile link;
            # an empty cell is still recorded as None.
            row_data.append(text or None)
        else:
            row_data.append(text)

    if len(row_data) < len(headers):
        # Pad short rows with None so every row matches the header count
        row_data += [None] * (len(headers) - len(row_data))

    row_data.append(player_link)
    data.append(row_data)

# Add 'Player_Link' column (appended after the loop so the padding above is
# computed against the scraped headers only)
headers.append('Player_Link')

# Create DataFrame
df = pd.DataFrame(data, columns=headers)

# Save to CSV
df.to_csv('nba_draft_2025.csv', index=False)

print(df.head())


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

# Columns carried forward into the college-stats scrape
keep_cols = ['Pk', 'Tm', 'Player', 'College', 'Player_Link']

# Read the cleaned draft table (with player links) and keep only those columns.
# NOTE(review): the scraping cell writes 'nba_draft_2025.csv', while this cell
# reads 'nba_draft_25.csv' -- confirm this is a separately cleaned file.
df = pd.read_csv("nba_draft_25.csv").loc[:, keep_cols]
df

In [None]:

import requests
import time
import random
from bs4 import BeautifulSoup, Comment

# Pool of User-Agent header values to rotate through;
# get_college_career_stats picks one at random for each request.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (X11; Linux x86_64)",
    "Mozilla/5.0 (Windows NT 10.0; rv:114.0) Gecko/20100101 Firefox/114.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko)"
]

# Career-row columns (matched against each cell's data-stat attribute) to extract.
_COLLEGE_STAT_KEYS = (
    "fg_pct", "fg3_pct", "ft_pct", "mp_per_g", "pts_per_g", "trb_per_g", "ast_per_g",
)


def _find_college_table(soup):
    """Return the table with id 'all_college_stats' from *soup*, or None."""
    table = soup.find('table', {'id': 'all_college_stats'})
    if table is not None:
        return table

    # Fallback: the table markup may be embedded inside an HTML comment.
    # Keep scanning until a comment actually yields the table instead of
    # giving up on the first comment that merely mentions the id.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if 'all_college_stats' in comment:
            comment_soup = BeautifulSoup(comment, 'html.parser')
            table = comment_soup.find('table', {'id': 'all_college_stats'})
            if table is not None:
                return table
    return None


def _find_career_row(table):
    """Return the first row whose leading <th> reads 'career', or None."""
    for row in table.find_all('tr'):
        th = row.find('th')
        if th is not None and th.text.strip().lower() == "career":
            return row
    return None


def get_college_career_stats(player_url, *, delay_range=(15, 20), timeout=30):
    """Scrape the 'Career' row of a player's college stats table.

    Args:
        player_url: URL of the player's page. Anything that is not a
            non-blank string (e.g. None or a NaN read back from a CSV) is
            rejected immediately, without sleeping or making a request.
        delay_range: (low, high) bounds in seconds for the random pause taken
            before the request.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict mapping each of fg_pct, fg3_pct, ft_pct, mp_per_g, pts_per_g,
        trb_per_g and ast_per_g to the stripped text of the matching cell
        (None when the Career row has no such cell). An empty dict is
        returned when the link is missing, the table or Career row cannot be
        found, or any error occurs; failures are printed, never raised.
    """
    if not isinstance(player_url, str) or not player_url.strip():
        print(f"[X] No player link to scrape: {player_url!r}")
        return {}

    try:
        # Random delay before each request
        delay = random.uniform(*delay_range)
        print(f"Sleeping for {delay:.2f} seconds...")
        time.sleep(delay)

        # Pick a random user-agent
        request_headers = {"User-Agent": random.choice(user_agents)}

        res = requests.get(player_url, headers=request_headers, timeout=timeout)
        # Report HTTP errors (such as rate limiting) as errors rather than
        # letting an error page be parsed and misreported as "no table found".
        res.raise_for_status()
        soup = BeautifulSoup(res.content, 'html.parser')

        table = _find_college_table(soup)
        if table is None:
            print(f"[X] No college stats table found for {player_url}")
            return {}

        career_row = _find_career_row(table)
        if career_row is None:
            print(f"[X] No 'Career' row found in table for {player_url}")
            return {}

        # Start every stat at None so the result always has the same keys
        stats = dict.fromkeys(_COLLEGE_STAT_KEYS)
        for td in career_row.find_all('td'):
            stat_name = td.get('data-stat')
            if stat_name in stats:
                stats[stat_name] = td.text.strip()

        return stats

    except Exception as e:
        # Deliberate best-effort boundary: one bad page must not abort the
        # whole scraping run, so log the failure and return an empty result.
        print(f"[ERROR] Failed for {player_url}: {e}")
        return {}

In [None]:
# One career-stats dict per player, in the same order as the rows of df
college_stats = []

for player_name, player_url in zip(df["Player"], df["Player_Link"]):
    print(f"Scraping: {player_name} -> {player_url}")
    stats = get_college_career_stats(player_url)
    print(f"Stats for {player_name}: {stats}")
    college_stats.append(stats)
    time.sleep(1)  # to avoid rate-limiting

# Turn the scraped dicts into a table and place it beside the draft columns;
# both sides get a fresh 0..n-1 index so the rows pair up by position.
stats_df = pd.DataFrame(college_stats)
draft_part = df.reset_index(drop=True)
stats_part = stats_df.reset_index(drop=True)
final_df = pd.concat([draft_part, stats_part], axis=1)
final_df


In [None]:
# Save the merged draft + college-stats DataFrame to CSV (without the index
# column); the cleaning cells reload this file.
final_df.to_csv('nba_draft_2025_with_college_statsv1.csv', index=False)

In [None]:
import pandas as pd
 
# Reload the scraped table from disk and discard the link column, which is
# not used by the cleaning steps.
nba_df = pd.read_csv('nba_draft_2025_with_college_statsv1.csv').drop(columns="Player_Link")
nba_df.head()


In [None]:
# Show the last rows of the reloaded table.
nba_df.tail()

In [None]:
# College stat columns produced by the scrape
stat_cols = ["fg_pct", "fg3_pct", "ft_pct", "mp_per_g", "pts_per_g", "trb_per_g", "ast_per_g"]

# Keep a player only if at least one of the stat columns has a value, i.e.
# remove rows where every stat is missing.
has_any_stat = nba_df[stat_cols].notna().any(axis=1)
nba_df = nba_df[has_any_stat]
nba_df.tail()

In [None]:
# Count the missing values remaining in each column.
nba_df.isna().sum()

In [None]:
# Replace missing College values with the label "International".
# NOTE(review): assumes a missing College means an international player -- confirm.
nba_df["College"] = nba_df["College"].fillna("International")

In [None]:
# Write the cleaned table to its final CSV, without the index column.
nba_df.to_csv('nba_draft_2025_final.csv', index=False)