In [1]:
#########################################################################################################################
# CALCUATE PREDICTION FOR NEXT GAME DAY #

# Script 3 of 4
# This script Calculates game predictions for the next NBA game day using historical data, rolling averages, and machine learning models,
# and outputs results with probabilities.

# Ensure `_2. 03012025_get_data_next_game_day.ipynb` is executed before running this script.
#########################################################################################################################

In [2]:
ROLLING_WINDOW_SIZE = 7
current_season = 2025

In [3]:
import pandas as pd
import datetime
import numpy as np
import lightgbm as lgb
import os

import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import glob
import datetime
from datetime import datetime, timedelta

import subprocess
import shutil


In [4]:
today = (datetime.now()- timedelta(days=0)).strftime("%Y-%m-%d")


In [5]:
# Constants
target_folder = "D:\\1. Python\\1. NBA Script\\2025\\Gathering_Data\\Next_Game\\"
STAT_DIR = "D:\\1. Python\\1. NBA Script\\2025\\Gathering_Data\\Whole_Statistic\\"

df_path = os.path.join(STAT_DIR, f"nba_games_{today}.csv")

directory_path = r"D:\1. Python\1. NBA Script\2025\LightGBM\1. 2025_Prediction"
dst_dir = r'D:\_Laufwerk C\11. Sorare\NBA\2025\LightGBM'

open_office_path = "C:\Program Files (x86)\OpenOffice 4\program/scalc"


In [6]:
# Define directory and date format
# Check if file exists
file_path = f"{target_folder}games_df_{today}.csv"
if not os.path.exists(file_path):
    # List files and pick the latest one
    files = sorted(glob.glob(f"{target_folder}games_df_*.csv"))
    if files:
        file_path = files[-1]  # Use the latest available file
        print(f"Using the latest file: {file_path}")
    else:
        print("No files found in the directory.")
        exit()

# Proceed to read the file
games_df = pd.read_csv(file_path, index_col=0)
print(games_df.head(60).to_string(index=False))


home_team away_team  game_date
      CHO       POR 2025-01-24
      PHI       CLE 2025-01-24
      MEM       NOP 2025-01-24


In [7]:
# Function to find the most recent file in the directory if the desired one is not available
def get_latest_available_file(target_folder, prefix="nba_games_", extension=".csv"):
    """Returns the latest available CSV file matching the pattern."""
    available_files = [f for f in os.listdir(target_folder) if f.startswith(prefix) and f.endswith(extension)]
    if available_files:
        latest_file = max(available_files, key=lambda f: os.path.getctime(os.path.join(target_folder, f)))
        return os.path.join(target_folder, latest_file)
    return None

# Check if the specific file for today exists; if not, fallback to the most recent available file
if not os.path.exists(df_path):
    print(f"File for {today} not found. Searching for the latest available file...")
    df_path = get_latest_available_file(DST_DIR)
    if df_path:
        print(f"Using the latest available file: {df_path}")
    else:
        raise FileNotFoundError(f"No suitable file found in the directory: {DST_DIR}")

# Proceed with loading the data
df = pd.read_csv(df_path, index_col=0)
print(df)#.tail())  # Display a portion of the data

# Function to add a target column
def add_target(group):
    """Adds a target column to the DataFrame group based on the 'won' column."""
    group['target'] = group['won'].shift(-1)
    return group

def preprocess_nba_data():
    # Load the data
    df = pd.read_csv(df_path, index_col=0)
    
    # Sort by date
    df = df.sort_values("date")

    # Apply the preprocessing function to each team group
    df = df.groupby('team').apply(add_target)

    # Handle missing values
    df['target'].fillna(2, inplace=True)
    df['target'] = df['target'].astype(int)

    # Identify and remove columns with null values
    nulls = pd.isnull(df).sum()
    nulls = nulls[nulls > 0]
    valid_columns = df.columns[~df.columns.isin(nulls.index)]
    df = df[valid_columns].copy()

    return df

if __name__ == "__main__":
    df = preprocess_nba_data()

    # Columns to be excluded from scaling
    removed_columns = ["season", "date", "won", "target", "team", "team_opp"]

    # Selecting columns that are not in the 'removed_columns' list
    selected_columns = df.columns[~df.columns.isin(removed_columns)]

    # Initialize the MinMaxScaler
    scaler = MinMaxScaler()

    # Scale the selected columns and update the DataFrame
    df[selected_columns] = scaler.fit_transform(df[selected_columns])

    #df.to_csv("D:\\1. Python\\1. NBA Script\\2025\\Gathering_Data\\Whole_Statistic\\df_orig.csv", index=False)
    

         fg   fga      fg%    3p   3pa      3p%    ft   fta      ft%   orb  \
mp.1                                                                         
240.0  37.0  96.0  385.000  12.0  29.0  414.000  20.0  26.0  769.000  23.0   
240.0  37.0  82.0  451.000   8.0  27.0  296.000  12.0  15.0    0.800   7.0   
240.0  38.0  94.0  404.000   9.0  29.0    0.310  10.0  17.0  588.000  11.0   
240.0  37.0  87.0  425.000   7.0  19.0  368.000  16.0  23.0  696.000   7.0   
240.0  35.0  83.0  422.000   6.0  18.0  333.000  19.0  27.0  704.000   8.0   
...     ...   ...      ...   ...   ...      ...   ...   ...      ...   ...   
NaN    46.0  98.0    0.469  18.0  40.0    0.450  15.0  20.0    0.750  11.0   
NaN    39.0  74.0    0.527  14.0  31.0    0.452  29.0  36.0    0.806   8.0   
NaN    42.0  88.0    0.477  15.0  37.0    0.405  16.0  22.0    0.727   8.0   
NaN    40.0  82.0    0.488   9.0  28.0    0.321  12.0  19.0    0.632   8.0   
NaN    26.0  76.0    0.342   6.0  25.0    0.240  21.0  31.0    0

In [8]:
df.groupby(["home"]).apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])

#print(df_rolling.head(60).to_string(index=False))

home
0.0    0.431337
1.0    0.568663
dtype: float64

In [9]:
####################################################################################################
# CALCULATE THE AVERAGE FOR THE PREVIOUS SEASONS WITH THE ROLLING WINDOW OF 7 FOR LEARNING THE MODEL #
####################################################################################################

# Filter out the games from the current season
df_rolling = df[list(selected_columns) + ["won", "team", "season"]]
#df_rolling = df_rolling[df_rolling['season'] != current_season].copy()

#print(df_rolling.columns)
def find_team_averages(team):
    numeric_columns = team.select_dtypes(include=[np.number])  # Select only numeric columns
    rolling = numeric_columns.rolling(ROLLING_WINDOW_SIZE, min_periods=1).mean()  # Calculate rolling mean
    #rolling[['team', 'season']] = team[['team', 'season']]  # Retain 'team' and 'season' columns in the result
    return rolling

# Apply rolling average
df_rolling.reset_index(drop=True, inplace=True)
df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)


# Renaming columns with _7 suffix for numeric columns only
rolling_cols = {col: f"{col}_7" for col in df_rolling.columns if col not in ['team', 'season']} #, 'season','season_rolling','season_original','target']}


# Rename the columns
df_rolling.rename(columns=rolling_cols, inplace=True)


In [10]:
df = df.reset_index(drop=True)
df_rolling = df_rolling.reset_index(drop=True)

df = pd.concat([df, df_rolling], axis=1)

#df.to_csv("D:\\1. Python\\1. NBA Script\\2025\\Gathering_Data\\Whole_Statistic\\df_pd.concat.csv", index=False)


df = df.dropna()

print(df)

target_2_rows = df[df['target'] == 2]['target']
print(target_2_rows)

             fg       fga       fg%        3p       3pa       3p%        ft  \
0      0.391304  0.323529  0.656339  0.275862  0.348485  0.351544  0.272727   
1      0.500000  0.338235  0.736429  0.344828  0.303030  0.495249  0.409091   
2      0.369565  0.338235  0.631584  0.275862  0.287879  0.413302  0.386364   
3      0.391304  0.411765  0.000208  0.241379  0.378788  0.286223  0.295455   
4      0.391304  0.441176  0.598091  0.241379  0.257576  0.395487  0.386364   
...         ...       ...       ...       ...       ...       ...       ...   
23443  0.413043  0.382353  0.000240  0.758621  0.621212  0.000581  0.363636   
23444  0.413043  0.382353  0.000240  0.758621  0.621212  0.000581  0.363636   
23445  0.282609  0.426471  0.000121  0.344828  0.575758  0.000283  0.590909   
23446  0.326087  0.514706  0.000118  0.379310  0.590909  0.000304  0.204545   
23447  0.282609  0.500000  0.000092  0.379310  0.636364  0.000284  0.409091   

            fta       orb       drb  ...  ast%_max_

In [11]:
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    # Ensure the 'team' column is not part of the index and is correctly formatted
    if 'team' in df.columns:
        return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))
    else:
        raise KeyError("The 'team' column is missing or not properly formatted in the DataFrame.")

# Ensure the 'team' column exists and is not part of the index
if 'team' not in df.columns:
    print("The 'team' column is missing. Ensure the column is present in your DataFrame.")

# Reset the index to avoid potential issues with multi-indexing
df = df.reset_index(drop=True)

# Add shifted columns for "home", "team_opp", and "date"
df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

# Drop rows where any of the next columns contain NaN values (optional)
#df = df.dropna(subset=["home_next", "team_opp_next", "date_next"])

# Optionally, save the DataFrame to a CSV file
#df.to_csv("D:\\1. Python\\1. NBA Script\\2025\\Gathering_Data\\Whole_Statistic\\df_dropna_target_2.csv", index=False)
#df.to_csv("D:\\1. Python\\1. NBA Script\\2025\\Gathering_Data\\Whole_Statistic\\df.csv", index=False)


# Display the first few rows to check the output
#print(df.head())

target_2_rows = df[df['target'] == 2]['target']
print(target_2_rows)


776      2
1649     2
2408     2
3151     2
3877     2
4681     2
5474     2
6288     2
7022     2
7869     2
8659     2
9438     2
10234    2
11008    2
11778    2
12601    2
13415    2
14178    2
14931    2
15685    2
16471    2
17220    2
18018    2
18801    2
19582    2
20322    2
21092    2
21906    2
22687    2
23447    2
Name: target, dtype: int32


In [12]:
for _, game in games_df.iterrows():
    home_team = game['home_team']
    away_team = game['away_team']
    game_day = game['game_date']
    
    print(home_team)
    print(away_team)
    print(game_day)


    last_home_team_index = df.loc[df['team'] == home_team].iloc[::-1].index[0]
    
    df.loc[last_home_team_index, 'team_opp_next'] = away_team
    df.loc[last_home_team_index, 'home_next'] = 1
    df.loc[last_home_team_index, 'date_next'] = game_day
    

    last_away_team_index = df.loc[df['team'] == away_team].iloc[::-1].index[0]
    
    df.loc[last_away_team_index, 'team_opp_next'] = home_team
    df.loc[last_away_team_index, 'home_next'] = 0
    df.loc[last_away_team_index, 'date_next'] = game_day


CHO
POR
2025-01-24
PHI
CLE
2025-01-24
MEM
NOP
2025-01-24


In [13]:
# Merging DataFrames
# Convert rolling_cols dictionary keys to a list and add other columns
full = df.merge(df[list(rolling_cols.keys()) + ["team_opp_next", "date_next", "team"]], 
                left_on=["team", "date_next"], 
                right_on=["team_opp_next", "date_next"])


# Save the merged DataFrame
output_path = "D:\\1. Python\\1. NBA Script\\2025\\Gathering_Data\\Whole_Statistic\\full_new.csv"
full.to_csv(output_path, index=False)
#print(f"Merged data saved to: {output_path}")

# Display basic info and first few rows of the merged DataFrame
print("Full DataFrame Info:")
print(full.info())
print("\nFirst few rows of the merged DataFrame:")
print(full.head())

# Print number of rows in the merged DataFrame
num_rows = full.shape[0]
print(f"Number of rows in 'full' DataFrame: {num_rows}")

# Extract and print rows with target == 2
target_2_rows = full[full['target'] == 2]['target']
print("\nRows where 'target' == 2:")
print(target_2_rows)


Full DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23620 entries, 0 to 23619
Columns: 402 entries, fg_x to team_y
dtypes: bool(1), float64(392), int32(1), int64(1), object(7)
memory usage: 72.2+ MB
None

First few rows of the merged DataFrame:
       fg_x     fga_x     fg%_x      3p_x     3pa_x     3p%_x      ft_x  \
0  0.391304  0.323529  0.656339  0.275862  0.348485  0.351544  0.272727   
1  0.500000  0.338235  0.736429  0.344828  0.303030  0.495249  0.409091   
2  0.369565  0.338235  0.631584  0.275862  0.287879  0.413302  0.386364   
3  0.391304  0.411765  0.000208  0.241379  0.378788  0.286223  0.295455   
4  0.391304  0.441176  0.598091  0.241379  0.257576  0.395487  0.386364   

      fta_x     orb_x     drb_x  ...  stl%_max_opp_y  blk%_max_opp_y  \
0  0.234375  0.241379  0.386364  ...           0.034           0.068   
1  0.406250  0.241379  0.363636  ...           0.061           0.079   
2  0.343750  0.275862  0.477273  ...           0.047           0.045 

In [14]:
mask = full['date_next'] == game_day
filtered_df = full.loc[mask, ['team_x', 'team_opp_next_x', 'team_y', 'team_opp_next_y', 'date_next', 'home_next']]

print(filtered_df)

      team_x team_opp_next_x team_y team_opp_next_y   date_next  home_next
3908     CHO             POR    POR             CHO  2025-01-24        1.0
4717     CLE             PHI    PHI             CLE  2025-01-24        0.0
11871    MEM             NOP    NOP             MEM  2025-01-24        1.0
15044    NOP             MEM    MEM             NOP  2025-01-24        0.0
18144    PHI             CLE    CLE             PHI  2025-01-24        1.0
19723    POR             CHO    CHO             POR  2025-01-24        0.0


In [15]:
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns


In [16]:
selected_columns = full.columns[~full.columns.isin(removed_columns)]
selected_features = selected_columns.unique()

selected_features

Index(['fg_x', 'fga_x', 'fg%_x', '3p_x', '3pa_x', '3p%_x', 'ft_x', 'fta_x',
       'orb_x', 'drb_x',
       ...
       'trb%_max_opp_y', 'ast%_max_opp_y', 'stl%_max_opp_y', 'blk%_max_opp_y',
       'tov%_max_opp_y', 'usg%_max_opp_y', 'ortg_max_opp_y', 'drtg_max_opp_y',
       'total_opp_y', 'home_opp_y'],
      dtype='object', length=391)

In [17]:
full_train = full[full["target"] != 2]
full_pred = full[full["target"] == 2]

print(full_pred)

X = full_train[selected_features].values
y = full_train["target"].values


           fg_x     fga_x     fg%_x      3p_x     3pa_x     3p%_x      ft_x  \
3908   0.478261  0.382353  0.000291  0.379310  0.409091  0.000422  0.613636   
4717   0.478261  0.544118  0.000213  0.448276  0.560606  0.000376  0.295455   
11871  0.652174  0.558824  0.000325  0.724138  0.651515  0.000531  0.295455   
15044  0.478261  0.470588  0.000246  0.517241  0.560606  0.000435  0.590909   
18144  0.543478  0.529412  0.000264  0.517241  0.515152  0.000469  0.136364   
19723  0.456522  0.323529  0.000307  0.310345  0.363636  0.000381  0.272727   

          fta_x     orb_x     drb_x  ...  stl%_max_opp_y  blk%_max_opp_y  \
3908   0.500000  0.379310  0.136364  ...           0.082           0.077   
4717   0.296875  0.689655  0.204545  ...           0.066           0.161   
11871  0.218750  0.448276  0.250000  ...           0.047           0.079   
15044  0.453125  0.206897  0.522727  ...           0.108           0.123   
18144  0.125000  0.275862  0.022727  ...           0.075          

In [18]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'num_leaves': [10, 20, 30],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 5, 10]
}

# Create a LightGBM classifier
base_estimator = lgb.LGBMClassifier(objective='binary',
                                     metric='auc',
                                     boosting_type='gbdt',
                                     verbosity=-1,
                                     random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=base_estimator,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

# Perform grid search
#grid_search.fit(X_train, y_train)

# Print the best parameters
#print("Best parameters found:", grid_search.best_params_)


In [20]:
#Best parameters found: {'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 5, 'num_leaves': 10}

params = {
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 10,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'bagging_freq': 10,
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'random_state': 42,
    'lambda_l1': 0.5,
    'lambda_l2': 0.5,
    'max_depth': 7,
    'min_child_weight': 5
}

model = lgb.LGBMClassifier(**params)



In [21]:
# Train the model using X_train and y_train
model.fit(X_train, y_train)

# Predict the target values for the test set X_test
y_pred = model.predict(X_test)

# Check the accuracy of the model using the test set
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 61.72%


In [22]:
importances = model.feature_importances_

# create a dictionary to store feature importances with column names
feat_importances = dict(zip(selected_columns, importances))

# sort the dictionary by importance score in descending order
sorted_feat_importances = sorted(feat_importances.items(), key=lambda x: x[1], reverse=True)


# Print the sorted feature importances
for feature, importance in sorted_feat_importances[:30]:
    print("{}: {}".format(feature, importance))


drtg_7: 24
home_next: 20
ortg_7: 19
pts_max_7: 15
usg%_max_7: 13
usg%_max_y: 13
fta_max_7: 12
ft_opp_7: 11
blk_opp_7: 11
ortg_y: 11
trb%_max_7: 10
3p%_opp_7: 10
drtg_y: 10
pts_max_y: 10
3pa_7: 9
pts_opp_7: 9
drtg_max_opp_7: 9
blk%_x: 8
ortg_max_7: 8
fg_opp_7: 8
3pa_opp_7: 8
3par_max_y: 8
orb%_max_y: 8
pts_7: 7
orb%_7: 7
drtg_max_7: 7
stl%_y: 7
blk%_y: 7
ft_max_y: 7
fg_opp_y: 7


In [23]:
# predict on new data
full_pred["proba"] = model.predict_proba(full_pred[selected_features])[:,1]
full_pred["proba"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_pred["proba"] = model.predict_proba(full_pred[selected_features])[:,1]


3908     0.609328
4717     0.557961
11871    0.480711
15044    0.531160
18144    0.458300
19723    0.341123
Name: proba, dtype: float64

In [24]:
home_teams_prob = list(games_df['home_team'])
away_teams_prob = list(games_df['away_team'])

#print(home_teams_prob)
#print(away_teams_prob)

# Filter the rows where team_x is a home team
full_pred_prob = full_pred['team_x'].isin(home_teams_prob)
#print(full_pred_prob)

#full_pred_prob = full_pred['team_x'].isin(home_teams_prob)
full_pred[full_pred_prob]['proba']


3908     0.609328
11871    0.480711
18144    0.458300
Name: proba, dtype: float64

In [25]:
# Filter rows where full_pred_prob is True

team_x = full_pred.loc[full_pred_prob, 'team_x']
team_y = full_pred.loc[full_pred_prob, 'team_y']
#print(team_x)
#print(team_y)

team_pairs = pd.concat([team_x, team_y], axis=1)


print(team_pairs)

      team_x team_y
3908     CHO    POR
11871    MEM    NOP
18144    PHI    CLE


In [26]:
# Filter rows where full_pred_prob is True
home_team_preds = full_pred.loc[full_pred_prob, ['team_x', 'team_y', 'proba']]

# Rename the columns to match the format of the games_df dataframe
home_team_preds.columns = ['home_team', 'away_team', 'home_team_prob']#, 'odds', 'result']
home_team_preds['odds 1'] = 0
home_team_preds['odds 2'] = 0

home_team_preds['result'] = 0
#home_team_preds['hit'] = 0
home_team_preds['date'] = game_day

# Print the resulting dataframe
print((home_team_preds).to_string(index=False))

#print(combined_df.head(80).to_string(index=False))



home_team away_team  home_team_prob  odds 1  odds 2  result       date
      CHO       POR        0.609328       0       0       0 2025-01-24
      MEM       NOP        0.480711       0       0       0 2025-01-24
      PHI       CLE        0.458300       0       0       0 2025-01-24


In [27]:
# Specify the directory where you want to save the file
file_name = "nba_games_predict_" + today + ".csv"
print(file_name)

# Construct the full file path
full_path = os.path.join(directory_path, file_name)

# Check if the file already exists
if os.path.isfile(full_path):
    print(f"A file with the name '{file_name}' already exists in the directory '{directory_path}'.")
else:
    # If the file does not exist, save the DataFrame to the CSV file
    # Make sure 'home_team_preds' is your actual DataFrame variable
    # home_team_preds.to_csv(full_path, index=False)
    print(f"The file '{file_name}' does not exist and can be created.")
    home_team_preds.to_csv(f"{directory_path}\\{file_name}", index=False)


nba_games_predict_2025-01-24.csv
The file 'nba_games_predict_2025-01-24.csv' does not exist and can be created.


In [28]:
# Open folder using subprocess on Windows
if os.name == 'nt':
    subprocess.Popen(f'explorer {directory_path}')
print(directory_path)
file_path = directory_path + "/" + file_name

src_files = set(os.listdir(directory_path))
dst_files = set(os.listdir(dst_dir))

diff = src_files - dst_files

print('Files in source but not in destination:')


D:\1. Python\1. NBA Script\2025\LightGBM\1. 2025_Prediction
Files in source but not in destination:


In [29]:
if diff:
    file_to_copy = os.path.join(directory_path, diff.pop())
    shutil.copy2(file_to_copy, dst_dir)
    print('File copied successfully')
    print(dst_dir)
else:
    print('No files to copy')

File copied successfully
D:\_Laufwerk C\11. Sorare\NBA\2025\LightGBM
