In [None]:
##ChatGPT usage: used to generate initial code structure and logic

%pip install nba_api pandas scikit-learn numpy 

from nba_api.stats.static import players, teams
from nba_api.stats.endpoints import playercareerstats, teamyearbyyearstats, commonplayerinfo
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score
import os

Note: you may need to restart the kernel to use updated packages.


In [None]:

def fetch_all_team_totals(start_season='2015-16', end_season='2024-25'):
    """Collect per-season team totals for every team returned by ``teams.get_teams()``.

    Args:
        start_season: First season label to keep (compared as a string against
            the ``YEAR`` column, e.g. ``'2015-16'``).
        end_season: Last season label to keep (inclusive, same comparison).

    Returns:
        dict: ``{season: {team_abbreviation: {'MIN', 'FGA', 'FTA', 'TOV', 'FGM'}}}``.
        ``MIN`` is derived as ``5 * 48 * games_played`` rather than read from the API.
        Teams whose request raises are reported with ``print`` and left out.
    """
    all_teams = teams.get_teams()
    totals_by_season = {}

    for franchise in all_teams:
        franchise_id = franchise['id']
        abbr = franchise['abbreviation']

        # Pause before every request (presumably to stay under the API rate limit).
        time.sleep(0.6)
        try:
            yearly = teamyearbyyearstats.TeamYearByYearStats(team_id=franchise_id).get_data_frames()[0]
        except Exception as e:
            print(f"Failed to fetch data for team {abbr} ({franchise_id}): {e}")
            continue

        # Season labels are compared lexicographically.
        in_window = (yearly['YEAR'] >= start_season) & (yearly['YEAR'] <= end_season)

        for _, season_row in yearly[in_window].iterrows():
            # Prefer the reported games played; otherwise rebuild it from the record.
            if 'GP' in season_row and not pd.isna(season_row['GP']):
                gp = season_row['GP']
            else:
                gp = season_row['WINS'] + season_row['LOSSES']

            totals_by_season.setdefault(season_row['YEAR'], {})[abbr] = {
                'MIN': 5 * 48 * gp,
                'FGA': season_row['FGA'],
                'FTA': season_row['FTA'],
                'TOV': season_row['TOV'],
                'FGM': season_row['FGM'],
            }

    return totals_by_season

def fetch_player_stats(player_name, start_season='2015-16'):
    """Fetch a player's per-36 season rows and attach age, height and weight.

    Args:
        player_name: Name passed to ``players.find_players_by_full_name``; the
            first match is used.
        start_season: Earliest ``SEASON_ID`` to keep (string comparison,
            e.g. ``'2015-16'``).

    Returns:
        pandas.DataFrame: the ``PlayerCareerStats`` (``per_mode36='Per36'``)
        rows from ``start_season`` on, with added columns ``AGE`` (age on
        October 1 of the season's first year), ``HEIGHT_IN``, ``WEIGHT_LB`` and
        ``PLAYER_NAME``. ``PLAYER_NAME`` is the name exactly as passed in,
        because other cells filter the exported CSV on that same string.

    Raises:
        ValueError: if no player matches ``player_name``.
    """
    matches = players.find_players_by_full_name(player_name)
    if not matches:
        # Previously this surfaced as a bare IndexError on `[0]`.
        raise ValueError(f"No NBA player found matching name: {player_name!r}")
    player_id = matches[0]['id']

    career_stats = playercareerstats.PlayerCareerStats(player_id=player_id, per_mode36='Per36')
    season_rows = career_stats.get_data_frames()[0]
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of the API result (avoids SettingWithCopyWarning).
    df = season_rows[season_rows['SEASON_ID'] >= start_season].copy()
    # NOTE(review): the sample output shows several rows for one season
    # (2024-25), presumably one per team plus a combined row after a trade --
    # confirm how downstream lag features should treat those.

    info = commonplayerinfo.CommonPlayerInfo(player_id=player_id)
    bio = info.get_data_frames()[0].iloc[0]

    birth_date = pd.to_datetime(bio['BIRTHDATE'])

    # Height arrives as "feet-inches" (e.g. "6-6"); anything else becomes None.
    height_str = bio['HEIGHT']
    height_in = None
    if isinstance(height_str, str) and '-' in height_str:
        try:
            feet, inches = map(int, height_str.split('-'))
            height_in = feet * 12 + inches
        except ValueError:
            height_in = None

    # Stored numerically so the column has the same type before and after a CSV round trip.
    weight = pd.to_numeric(bio['WEIGHT'], errors='coerce')

    def compute_age(season_id):
        """Whole years of age on October 1 of the season's starting year."""
        if pd.isna(birth_date):
            return None
        start_year = int(season_id[:4])
        # Calendar arithmetic instead of `days // 365`, which drifts by the
        # accumulated leap days and can report an age one year too high.
        had_birthday = (birth_date.month, birth_date.day) <= (10, 1)
        return start_year - birth_date.year - (0 if had_birthday else 1)

    df['AGE'] = df['SEASON_ID'].apply(compute_age)
    df['HEIGHT_IN'] = height_in
    df['WEIGHT_LB'] = weight
    df['PLAYER_NAME'] = player_name

    return df

def calculate_advanced_stats(df, team_totals, per_minutes=36):
    """Add ``TS_PCT``, ``EFG_PCT`` and ``USG_PCT`` columns to a player frame.

    The columns are added to ``df`` in place and ``df`` is also returned.

    Args:
        df: Player-season rows with ``PTS``, ``FGA``, ``FTA``, ``FGM``, ``FG3M``,
            ``TOV``, ``MIN``, ``SEASON_ID`` and ``TEAM_ABBREVIATION`` columns.
        team_totals: ``{season: {team_abbreviation: {'MIN', 'FGA', 'FTA', 'TOV', ...}}}``
            holding season totals for each team.
        per_minutes: Minute basis of the player's counting stats. The default
            of 36 matches the per-36 rows produced by ``fetch_player_stats``.
            Pass ``None`` when ``df`` holds season totals, in which case each
            row's own ``MIN`` is used.

    Returns:
        The same DataFrame with the three columns added. ``USG_PCT`` is on a
        0-100 scale and is 0 when the row's season/team is missing from
        ``team_totals``.
    """
    # True Shooting Percentage
    df['TS_PCT'] = df['PTS'] / (2 * (df['FGA'] + 0.44 * df['FTA']))
    df['TS_PCT'] = df['TS_PCT'].fillna(0)

    # Effective FG%
    df['EFG_PCT'] = (df['FGM'] + 0.5 * df['FG3M']) / df['FGA']
    df['EFG_PCT'] = df['EFG_PCT'].fillna(0)

    def usage_pct(row):
        """Usage percentage for one row against its team's season totals."""
        team_season = team_totals.get(row['SEASON_ID'], {}).get(row['TEAM_ABBREVIATION'])
        if team_season is None:
            return 0

        # The player's possessions-used terms must be divided by the minutes
        # they were accumulated over. For per-36 rates that is 36, not the
        # season-total MIN; dividing by season MIN made the result roughly
        # MIN/36 times too small (about 0.5 instead of about 30).
        player_minutes = row['MIN'] if per_minutes is None else per_minutes

        numerator = (row['FGA'] + 0.44 * row['FTA'] + row['TOV']) * (team_season['MIN'] / 5)
        denominator = player_minutes * (
            team_season['FGA'] + 0.44 * team_season['FTA'] + team_season['TOV']
        )
        if denominator > 0:
            return 100 * numerator / denominator
        return 0

    df['USG_PCT'] = df.apply(usage_pct, axis=1)
    return df


# Demo run: pull team totals once, then build and show one player's table.
team_totals = fetch_all_team_totals(start_season='2015-16', end_season='2024-25')
player_df = calculate_advanced_stats(
    fetch_player_stats('Luka Doncic', start_season='2018-19'),
    team_totals,
)

print(player_df[[
    'SEASON_ID', 'PLAYER_NAME',
    'PTS', 'AST', 'REB', 'STL', 'BLK',
    'TS_PCT', 'EFG_PCT', 'USG_PCT',
    'AGE', 'HEIGHT_IN', 'WEIGHT_LB',
]])



  SEASON_ID  PLAYER_NAME   PTS  AST   REB  STL  BLK    TS_PCT   EFG_PCT  \
0   2018-19  Luka Doncic  23.7  6.7   8.7  1.2  0.4  0.546083  0.500000   
1   2019-20  Luka Doncic  30.9  9.5  10.1  1.1  0.2  0.583989  0.529412   
2   2020-21  Luka Doncic  29.1  9.0   8.4  1.0  0.6  0.586694  0.551163   
3   2021-22  Luka Doncic  28.9  8.9   9.3  1.2  0.6  0.569167  0.525000   
4   2022-23  Luka Doncic  32.2  8.0   8.6  1.4  0.5  0.609387  0.559633   
5   2023-24  Luka Doncic  32.5  9.4   8.9  1.4  0.5  0.616651  0.570485   
6   2024-25  Luka Doncic  28.4  7.9   8.4  2.0  0.4  0.583210  0.542056   
7   2024-25  Luka Doncic  28.9  7.7   8.3  1.6  0.4  0.589700  0.529268   
8   2024-25  Luka Doncic  28.7  7.8   8.3  1.8  0.4  0.587633  0.535885   

    USG_PCT  AGE  HEIGHT_IN WEIGHT_LB  
0  0.470521   19         78       230  
1  0.641705   20         78       230  
2  0.571976   21         78       230  
3  0.584762   22         78       230  
4  0.560545   23         78       230  
5  0.4938

In [None]:
def get_season_range(start='2015-16', end='2024-25'):
    """List season labels from ``start`` to ``end`` inclusive.

    Only the first four characters (the starting year) of each argument are
    read; every label is formatted as ``"<year>-<last two digits of year+1>"``.
    """
    first_year, last_year = int(start[:4]), int(end[:4])

    seasons = []
    for season_year in range(first_year, last_year + 1):
        suffix = str(season_year + 1)[-2:]
        seasons.append(f"{season_year}-{suffix}")
    return seasons


def export_player_to_csv_fast(player_name, team_totals, filename='NBAplayer_stats.csv', deduplicate=True):
    """Fetch one player's seasons (2015-16 onward) and merge them into a CSV.

    Args:
        player_name: Name handed to ``fetch_player_stats``; also the value
            stored in the ``PLAYER_NAME`` column of the new rows.
        team_totals: Team season totals passed to ``calculate_advanced_stats``.
        filename: CSV to create or update. The whole file is rewritten when it
            already has content.
        deduplicate: When true, rows already in the file for the same player
            and the same seasons are dropped before the new rows are appended.

    Returns:
        None. Prints a confirmation line when the file has been written.
    """
    player_df = fetch_player_stats(player_name, start_season='2015-16')
    player_df = calculate_advanced_stats(player_df, team_totals)

    keep_cols = ['SEASON_ID', 'PLAYER_NAME', 'AGE', 'HEIGHT_IN', 'WEIGHT_LB', 'PTS', 'AST', 'REB', 'STL', 'BLK', 'TS_PCT', 'EFG_PCT', 'USG_PCT']
    player_df = player_df[keep_cols]

    # A missing or zero-byte file is treated as "nothing to merge with".
    has_existing_rows = os.path.isfile(filename) and os.path.getsize(filename) > 0

    if has_existing_rows:
        existing_df = pd.read_csv(filename)

        if deduplicate:
            # Match the player ignoring case and surrounding whitespace. An
            # exact comparison let "giannis antetokounmpo" and
            # "Giannis Antetokounmpo" coexist as duplicate rows in the file.
            wanted = player_name.strip().casefold()
            stored_names = existing_df['PLAYER_NAME'].astype(str).str.strip().str.casefold()
            same_player = stored_names == wanted
            same_season = existing_df['SEASON_ID'].isin(player_df['SEASON_ID'])
            existing_df = existing_df[~(same_player & same_season)]

        full_df = pd.concat([existing_df, player_df], ignore_index=True)
        full_df.to_csv(filename, index=False)
    else:
        # First write or file empty
        player_df.to_csv(filename, index=False)

    print(f"Added {player_name} to {filename}")

def _is_timeout_error(exc):
    """Return True when ``exc`` is a timeout, judged by type or class name."""
    if isinstance(exc, TimeoutError):
        return True
    # Matched by class name anywhere in the MRO so no extra import is needed;
    # presumably the API client surfaces the HTTP library's ReadTimeout here.
    timeout_names = {'ReadTimeout', 'ConnectTimeout', 'Timeout'}
    return any(cls.__name__ in timeout_names for cls in type(exc).__mro__)


def fetch_with_retry(player_name, team_totals, filename, retries=3, delay=10):
    """Export one player to ``filename``, retrying when the request times out.

    Args:
        player_name: Player to export via ``export_player_to_csv_fast``.
        team_totals: Team season totals forwarded to the export.
        filename: Target CSV path.
        retries: Maximum number of attempts.
        delay: Seconds to wait between attempts after a timeout.

    Returns:
        bool: True if the export succeeded, False if it was skipped because of
        a non-timeout error or because every attempt timed out.
    """
    # The original `except ReadTimeout:` referenced a name that is never
    # imported in this notebook, so any failure ended in a NameError instead
    # of being retried or skipped.
    for attempt in range(1, retries + 1):
        try:
            export_player_to_csv_fast(player_name, team_totals, filename)
        except Exception as e:
            if not _is_timeout_error(e):
                print(f"Skipped {player_name} due to error: {e}")
                return False
            if attempt < retries:
                print(f"Timeout on {player_name}, retrying {attempt}/{retries} after {delay} seconds...")
                time.sleep(delay)
        else:
            print(f"✓ Exported {player_name}")
            return True

    print(f"Skipped {player_name}: timed out on all {retries} attempts")
    return False

# Players to (re-)export into the shared CSV; each one is retried on timeout.
player_list = [
    'Al Horford',
]

for player_name in player_list:
    fetch_with_retry(
        player_name,
        team_totals,
        filename='NBAplayer_stats.csv',
    )

✅ Added Al Horford to NBAplayer_stats.csv
✓ Exported Al Horford


In [None]:


# Load the player-season table written by export_player_to_csv_fast. The path
# is the same relative file the export cell writes, not a machine-specific
# absolute path, so the notebook runs from any checkout.
nba_data = pd.read_csv('NBAplayer_stats.csv')

# Order each player's seasons chronologically so shift() means previous/next season.
nba_data_sorted = nba_data.sort_values(by=["PLAYER_NAME", "SEASON_ID"])
# NOTE(review): the export can contain several rows per player-season and the
# same player under differently-cased names; both distort the lags below --
# confirm whether the CSV should be cleaned before training.

# Previous-season value of every stat, per player.
lag_stats = ["PTS", "AST", "REB", "STL", "BLK", "TS_PCT", "EFG_PCT", "USG_PCT"]
for stat in lag_stats:
    nba_data_sorted[f"{stat}_PREV"] = nba_data_sorted.groupby("PLAYER_NAME")[stat].shift(1)

# Next-season box-score stats are the prediction targets.
shifted_targets = nba_data_sorted.groupby("PLAYER_NAME")[["PTS", "AST", "REB", "STL", "BLK"]].shift(-1)
nba_data_sorted[["PTS_NEXT", "AST_NEXT", "REB_NEXT", "STL_NEXT", "BLK_NEXT"]] = shifted_targets

# Keep only rows that have both a previous and a next season.
nba_data_shifted = nba_data_sorted.dropna(subset=[f"{stat}_PREV" for stat in lag_stats] + ["PTS_NEXT", "AST_NEXT", "REB_NEXT", "STL_NEXT", "BLK_NEXT"])

# Feature and target columns
feature_cols = [
    "AGE", "HEIGHT_IN", "WEIGHT_LB",
    "TS_PCT", "EFG_PCT", "USG_PCT",
    "PTS_PREV", "AST_PREV", "REB_PREV", "STL_PREV", "BLK_PREV",
    "TS_PCT_PREV", "EFG_PCT_PREV", "USG_PCT_PREV"
]

target_cols = ["PTS_NEXT", "AST_NEXT", "REB_NEXT", "STL_NEXT", "BLK_NEXT"]

X = nba_data_shifted[feature_cols]
y = nba_data_shifted[target_cols]

# Row-level random split: seasons of one player can land on both sides.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One regressor per target for each model family.
rf_model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
gb_model = MultiOutputRegressor(HistGradientBoostingRegressor(random_state=42))

rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)

rf_preds = rf_model.predict(X_test)
gb_preds = gb_model.predict(X_test)

# Simple average of the two models.
ensemble_preds = (rf_preds + gb_preds) / 2

# Evaluate (both metrics are computed over all five targets together).
rmse_ensemble = np.sqrt(mean_squared_error(y_test, ensemble_preds))
r2_ensemble = r2_score(y_test, ensemble_preds)

print(f"Ensemble RMSE: {rmse_ensemble:.2f}")
print(f"Ensemble R²: {r2_ensemble:.2f}")

results_df = X_test.copy()
results_df[["Actual_PTS", "Actual_AST", "Actual_REB", "Actual_STL", "Actual_BLK"]] = y_test.to_numpy()
results_df[["Pred_PTS", "Pred_AST", "Pred_REB", "Pred_STL", "Pred_BLK"]] = ensemble_preds

# Attach player names by row label. X_test keeps the labels of
# nba_data_shifted, so this is an exact one-to-one lookup. Merging on the
# float feature columns instead duplicated any test row whose features
# matched more than one source row.
results_df["PLAYER_NAME"] = nba_data_shifted.loc[X_test.index, "PLAYER_NAME"]
results_df = results_df.reset_index(drop=True)

sample_results = results_df[[
    "PLAYER_NAME", "Actual_PTS", "Pred_PTS",
    "Actual_AST", "Pred_AST", "Actual_REB", "Pred_REB",
    "Actual_STL", "Pred_STL", "Actual_BLK", "Pred_BLK"
]].head(10)

print(sample_results)

Ensemble RMSE: 1.58
Ensemble R²: 0.73
             PLAYER_NAME  Actual_PTS   Pred_PTS  Actual_AST  Pred_AST  \
0        Kelly Oubre Jr.        15.7  20.370569         1.9  1.322703   
1  Giannis Antetokounmpo        31.2  31.464800         6.7  6.444017   
2  giannis antetokounmpo        31.2  31.464800         6.7  6.444017   
3        Harrison Barnes        15.1  15.937787         1.5  3.168163   
4          Royce O'Neale         8.0   9.609646         2.9  3.690527   
5         Doug McDermott        15.2  13.987091         1.8  1.944354   
6           Monte Morris        14.5  16.169539         4.5  5.451094   
7          Derrick White        18.7  18.932048         4.3  5.965962   
8           Jusuf Nurkic        17.3  21.071177         5.1  5.429870   
9             Chris Paul        11.4  15.474114         9.5  9.212586   

   Actual_REB   Pred_REB  Actual_STL  Pred_STL  Actual_BLK  Pred_BLK  
0         6.4   6.011718         1.6  1.518838         0.5  0.527166  
1        11.8  1

In [None]:
def predict_next_season(player_name, team_totals, model, feature_cols, filename='NBAplayer_stats.csv'):
    """Predict a player's next-season PTS/AST/REB/STL/BLK from their latest season.

    Args:
        player_name: Player to refresh in ``filename`` and predict for; rows
            are selected by exact match on the ``PLAYER_NAME`` column.
        team_totals: Team season totals forwarded to ``export_player_to_csv_fast``.
        model: A fitted estimator with ``predict``, or a list/tuple of such
            estimators whose predictions are averaged. Any other value (for
            example an array of earlier predictions) falls back to the
            notebook-level ``rf_model`` and ``gb_model``, which is what this
            function always used before.
        feature_cols: Feature column names, in the order the model(s) were fitted on.
        filename: CSV maintained by ``export_player_to_csv_fast``.

    Returns:
        pandas.DataFrame with one row and columns ``PTS, AST, REB, STL, BLK``,
        or ``None`` (after printing a message) when the player does not have
        enough usable seasons.
    """
    # Step 1: Fetch and update data
    export_player_to_csv_fast(player_name, team_totals, filename)

    # Step 2: Load data and add lag features
    df = pd.read_csv(filename)
    df = df[df['PLAYER_NAME'] == player_name].sort_values("SEASON_ID")

    if len(df) < 2:
        print("Not enough seasons of data to predict.")
        return None

    # Add lag features (PREV season)
    df_shifted = df.copy()
    for col in ['PTS', 'AST', 'REB', 'STL', 'BLK', 'TS_PCT', 'EFG_PCT', 'USG_PCT']:
        df_shifted[f"{col}_PREV"] = df_shifted[col].shift(1)

    # Only the model inputs have to be complete. A blanket dropna() also
    # discarded rows for gaps in unused columns and could leave nothing.
    feature_cols = list(feature_cols)
    df_shifted = df_shifted.dropna(subset=feature_cols).reset_index(drop=True)
    if df_shifted.empty:
        print("Not enough seasons of data to predict.")
        return None

    # Step 3: Get latest season row for prediction. Slicing with [[-1]] keeps a
    # one-row DataFrame with the original numeric dtypes.
    input_row = df_shifted[feature_cols].iloc[[-1]]

    # Step 4: Predict using the supplied model(s)
    if hasattr(model, 'predict'):
        models = [model]
    elif isinstance(model, (list, tuple)) and model and all(hasattr(m, 'predict') for m in model):
        models = list(model)
    else:
        # Backward-compatible fallback to the globally trained ensemble.
        models = [rf_model, gb_model]

    ensemble_pred = np.mean([np.asarray(m.predict(input_row)) for m in models], axis=0)

    # Step 5: Return as readable DataFrame
    predicted_df = pd.DataFrame(ensemble_pred, columns=["PTS", "AST", "REB", "STL", "BLK"])
    return predicted_df


In [None]:
if __name__ == "__main__":
    # Interactive entry point: ask for a player, predict, append to the 'Output' file.
    player = input("Enter player name: ")
    # Pass the fitted estimators; `ensemble_preds` is an array of test-set
    # predictions, not a model.
    prediction = predict_next_season(player, team_totals, model=(rf_model, gb_model), feature_cols=feature_cols)

    if prediction is None:
        # predict_next_season returns None when the player lacks enough seasons;
        # calling .to_csv on it raised AttributeError.
        print(f"No prediction available for {player}.")
    else:
        output_path = 'Output'
        # header='false' is a non-empty (truthy) string, so a header line was
        # written on every append. Write it only when the file is new or empty.
        write_header = not os.path.isfile(output_path) or os.path.getsize(output_path) == 0
        prediction.to_csv(output_path, mode='a', header=write_header, index=False)
        print(prediction)


✅ Added Alex Sarr to NBAplayer_stats.csv
❌ Not enough seasons of data to predict.


AttributeError: 'NoneType' object has no attribute 'to_csv'