In [1]:
#########################################################################################################################
# CALCUATE THE STATISTICS OF BETTING #

# Script 4 of 5
#Purpose: Merges actual outcomes with predicted results to evaluate betting performance.
#Output: Calculates overall and subset accuracies (e.g., home-favored vs. away-favored), updating a combined CSV.

# Ensure `_3. 03012025 lightgbm` is executed before running this script.
#########################################################################################################################

In [2]:
current_season = 2025

In [3]:
import pandas as pd
import os
import glob
import numpy as np
import logging
from datetime import datetime, timedelta
from itertools import product


In [4]:
# Current date in the format 'YYYYMMDD' as it seems to be the expected format in the filename
MAX_DAYS_BACK = 120  # Configurable range for searching files

today = datetime.now() - timedelta(days=0)
today_str = (today).strftime("%Y-%m-%d")
print(today_str)

# Calculate the date with a specific number of days back
days_back = 1  # Example: go back 1 day
date_str = (today - timedelta(days=days_back)).strftime("%Y-%m-%d")

# Output the formatted date
print(f"The calculated date string is: {date_str}")

yesterday = today - timedelta(days=1)
print(yesterday)



2025-03-09
The calculated date string is: 2025-03-08
2025-03-08 14:18:24.115476


In [5]:
directory_path = r'D:\1. Python\1. NBA Script\2025\LightGBM\1. 2025_Prediction'
print(directory_path)

#directory_path = "D:\\1. Python\\1. NBA Script\\2025\\Gathering_Data\\Whole_Statistic"
STAT_DIR = r'D:\1. Python\1. NBA Script\2025\Gathering_Data\Whole_Statistic'

# Use a wildcard to find all files in the directory
files = glob.glob(os.path.join(directory_path, "*"))  # "*" will match any file in the directory

target_folder = r'D:\1. Python\1. NBA Script\2025\Gathering_Data\Next_Game'


D:\1. Python\1. NBA Script\2025\LightGBM\1. 2025_Prediction


In [6]:
# Function to check if file exists
def file_exists(date_str, file_path):
    filename = f"nba_games_predict_{date_str}.csv"
    return os.path.isfile(os.path.join(file_path, filename))

# Example variables (make sure they are defined before the loop)
MAX_DAYS_BACK = 120
#today = datetime.now()  # e.g. 2025-02-20
days_back = 0
file_found = False

while not file_found and days_back <= MAX_DAYS_BACK:
    # Recalculate the date string on *every* loop iteration
    date_to_check = yesterday - timedelta(days=days_back)
    date_str = date_to_check.strftime("%Y-%m-%d")

    print(f"Checking date: {date_str}")

    if file_exists(date_str, directory_path):
        file_found = True
        print(f"The file for {date_str} exists.")
        predict_file = glob.glob(os.path.join(directory_path, f'nba_games_predict_{date_str}.csv'))
        break  # Exit the while loop
    else:
        days_back += 1

if not file_found:
    print("No file found in the last 120 days.")

if predict_file:
    predict_file_path = predict_file[0]  # Get the first (and presumably only) file path
    last_prediction = (yesterday - timedelta(days=days_back)).strftime("%Y-%m-%d")
    
    # Further processing with predict_file_path
    print(f"File found: {predict_file_path}")
else:
    print(f"No prediction file found for {yesterday}")

#print(predict_file)

Checking date: 2025-03-08
The file for 2025-03-08 exists.
File found: D:\1. Python\1. NBA Script\2025\LightGBM\1. 2025_Prediction\nba_games_predict_2025-03-08.csv


In [7]:
# # Function to check if file exists
# def file_exists(date_str, file_path):
#     filename = f"nba_games_predict_{date_str}.csv"
#     return os.path.isfile(os.path.join(file_path, filename))

# file_found = False

# # Loop to find the most recent file
# while not file_found and days_back <= MAX_DAYS_BACK:  # Limit the search to 120 days back
#     print(date_str)
#     print(directory_path)

#     if file_exists(date_str, directory_path):
#         file_found = True
#         print(f"The file for {date_str} exists.")
#         predict_file = glob.glob(os.path.join(directory_path, f'nba_games_predict_{date_str}.csv'))
#         print(predict_file)

#     else:
#         days_back += 1

# if not file_found:
#     print("No file found in the last 120 days.")

# if predict_file:
#     predict_file_path = predict_file[0]  # Get the first (and presumably only) file path
#     last_prediction = (today - timedelta(days=days_back)).strftime("%Y-%m-%d")
    
#     # Further processing with predict_file_path
#     print(f"File found: {predict_file_path}")
# else:
#     print(f"No prediction file found for {yesterday}")

# #print(predict_file)

In [8]:
# Process the statistics file
if predict_file:
    # Read prediction file
    predict_file_df = pd.read_csv(predict_file[0], encoding="utf-7", decimal=",")

    # Assuming predict_file_df is your DataFrame
    columns_to_display = ['home_team', 'away_team', 'home_team_prob', 'odds 1', 'odds 2', 'result', 'date']
    #print(predict_file_df[columns_to_display])

    # Convert 'odds 1' and 'odds 2' from comma as decimal separator to period
    predict_file_df['odds 1'] = predict_file_df['odds 1'].astype(str).str.replace(',', '.').astype(float)
    predict_file_df['odds 2'] = predict_file_df['odds 2'].astype(str).str.replace(',', '.').astype(float)

    # File path for combined data
    combined_file_path = os.path.join(directory_path, f'combined_nba_predictions_acc_{last_prediction}.csv')

    try:
        # Attempt to read the combined file
        #combined_df = pd.read_csv(combined_file_path)
        combined_df = pd.read_csv(combined_file_path, encoding="utf-7", decimal=",")

    except FileNotFoundError:
        # If not found, initialize an empty DataFrame
        combined_df = pd.DataFrame()

    # Append new data to the combined DataFrame
    predict_file_df['accuracy'] = np.nan  # Add 'accuracy' column with NaN
    combined_df = pd.concat([combined_df, predict_file_df], ignore_index=True)

     # Sort the DataFrame by date
    combined_df = combined_df.sort_values(by='date', ascending=False)
    #print(combined_df)

    # Select only the desired columns
    combined_df = combined_df[columns_to_display]
    print(combined_df.head(10))
    
    print(f"Combined predictions updated")
else:
    print(f"No prediction file found for {yesterday}.")


    home_team away_team       home_team_prob odds 1 odds 2 result        date
948       TOR       WAS   0.5528273083194465   1.61    2.4      0  2025-03-08
947       MIL       ORL   0.6704824567006863   1.37    3.2      0  2025-03-08
946       MIA       CHI   0.5453130023651898   1.54   2.55      0  2025-03-08
945       HOU       NOP   0.5131277356992314    1.3   3.65      0  2025-03-08
944       GSW       DET   0.6909174968238082   1.41    3.0      0  2025-03-08
943       CHO       BRK  0.40611853047885654   2.55   1.54      0  2025-03-08
942       BOS       LAL   0.6372584757732348   1.36   3.25      0  2025-03-08
941       ATL       IND    0.565257434407426   2.25   1.67      0  2025-03-08
1         SAC       SAS     0.65396766972042   1.38    3.1    SAC  2025-03-07
0         TOR       UTA   0.5753971743553145   1.44   2.85    TOR  2025-03-07
Combined predictions updated


In [9]:
# Check for the most actual file with the game statistics
print(date_str)

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Function to check if file exists
def file_exists(date_str):
    """Check if a specific file exists based on the date string."""
    filename = f"nba_games_{date_str}.csv"
    return os.path.isfile(os.path.join(STAT_DIR, filename))

# Function to find the most recent file
def find_most_recent_file(max_days=MAX_DAYS_BACK):
    """Find the most recent file within a specified number of days."""
    days_back = 0
    #today = datetime.date.today()

    while days_back <= max_days:
        most_recent_date = (today - timedelta(days=days_back)).strftime("%Y-%m-%d")
        if file_exists(most_recent_date):
            logging.info(f"The file for {most_recent_date} exists.")
            return most_recent_date
        else:
            days_back += 1
            #return days_back

    logging.warning("No file found within the specified range.")
    return None

# Main script execution
try:
    most_recent_date = find_most_recent_file()
    print(most_recent_date)

except FileNotFoundError:
    logging.error("File or directory not found.")
except IOError:
    logging.error("Error accessing file.")
except Exception as e:
    logging.error(f"An unexpected error occurred: {e}")


2025-03-09 14:18:24,461 - INFO - The file for 2025-03-09 exists.


2025-03-08
2025-03-09


In [10]:
# Update the betting overview and check the accuracy of prediction
season_2025_df = combined_df.copy()  

# Assuming last_prediction is a string in 'YYYY-MM-DD' format
print(most_recent_date)
#print(season_2025_df['date'].head(50))

daily_games_df = pd.read_csv(os.path.join(STAT_DIR, f"nba_games_{most_recent_date}.csv"))
#print(daily_games_df)

# Filter data for the 2025 season
season_2025_df = combined_df.copy()
daily_games_df = daily_games_df[daily_games_df['season'] == current_season].copy()

season_2025_df['date'] = pd.to_datetime(season_2025_df['date'], errors='coerce')
daily_games_df['date'] = pd.to_datetime(daily_games_df['date'], errors='coerce')


# Iterate over each row in the daily game data and update the result column in the combined data
for _, row in daily_games_df.iterrows():
    date = row['date']
    #print(date)
    winning_team = row['team'] if row['won'] == 1 else None
    #print(winning_team)
    
    # Update the 'result' column for the corresponding date and teams in the 2024 season
    if winning_team:
        mask = (season_2025_df['date'] == date) & (
            (season_2025_df['home_team'] == winning_team) | (season_2025_df['away_team'] == winning_team)
        )
        
        season_2025_df.loc[mask, 'result'] = winning_team
        #print(season_2025_df)
        
# Ensure that 'home_team_prob' is numeric
#print(season_2025_df.head(20))
season_2025_df['home_team_prob'] = pd.to_numeric(season_2025_df['home_team_prob'], errors='coerce')

# Check for any invalid values after conversion
if season_2025_df['home_team_prob'].isnull().any():
    print("Warning: Some values in 'home_team_prob' could not be converted to numeric and have been set to NaN.")

# Ensure 'result', 'home_team', and 'away_team' columns are strings for comparison
season_2025_df['result'] = season_2025_df['result'].astype(str)
season_2025_df['home_team'] = season_2025_df['home_team'].astype(str)
season_2025_df['away_team'] = season_2025_df['away_team'].astype(str)

# Create conditions for correct predictions
home_team_correct = (season_2025_df['home_team_prob'] >= 0.5) & (season_2025_df['result'] == season_2025_df['home_team'])
away_team_correct = (season_2025_df['home_team_prob'] < 0.5) & (season_2025_df['result'] == season_2025_df['away_team'])

# Calculate accuracy for each row
season_2025_df['accuracy'] = (home_team_correct | away_team_correct).astype(int)

# Overall Accuracy
overall_accuracy = season_2025_df['accuracy'].mean()
print(f'Overall Accuracy: {overall_accuracy:.2%}')

# Filter the DataFrame for specific subsets
subset_df = season_2025_df[(season_2025_df['home_team_prob'] <= 0.400)]
subset_df_home = season_2025_df[(season_2025_df['home_team_prob'] > 0.6)]

# Calculate accuracy for the subsets
subset_accuracy = subset_df['accuracy'].mean()
subset_accuracy_home = subset_df_home['accuracy'].mean()

print(f'Accuracy for home_team_prob above 0.60: {subset_accuracy_home:.2%}')
print(f'Accuracy for home_team_prob under 0.40 (away team wins): {subset_accuracy:.2%}')


# Save the updated DataFrame
#today = datetime.strptime(date_str, "%Y-%m-%d")  # Ensure `today` is in the correct datetime format
#print(date_str)

save_file_path = os.path.join(directory_path, f'combined_nba_predictions_acc_{today_str}.csv')
print(save_file_path)

# Drop unnecessary columns if they exist
season_2025_df.drop(columns=['Unnamed: 8'], errors='ignore', inplace=True)
season_2025_df.dropna(inplace=True)

# Save the final DataFrame
season_2025_df.to_csv(save_file_path, index=False)


2025-03-09
Overall Accuracy: 59.96%
Accuracy for home_team_prob above 0.60: 63.91%
Accuracy for home_team_prob under 0.40 (away team wins): 67.36%
D:\1. Python\1. NBA Script\2025\LightGBM\1. 2025_Prediction\combined_nba_predictions_acc_2025-03-09.csv


In [11]:
###############################################################################################################################################

In [12]:
###############################################################################################################################################