In [31]:
import pandas as pd
import numpy as np
import warnings
import json
import requests
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import time
from selenium.webdriver.common.by import By
from IPython.display import Javascript
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 5000})'''))
#from skopt import BayesSearchCV
import os

def download_current_data(date=None):
    """Download the BigDataBall NBA season team feed workbook for one date.

    The workbook is stored in the current working directory as
    ``<date>-nba-season-team-feed.xlsx``. When that file already exists no
    request is made.

    :param date: Date string inserted verbatim into the remote filename. When
        ``None``, yesterday's date (relative to ``datetime.now()``) formatted
        as ``%m-%d-%Y`` is used.
    :return: The bare filename (not the full path). The name is returned even
        when the server answers with a non-200 status, in which case no file
        is written; callers should be prepared for the file to be missing.
    """
    # Define the base URL and parameters
    # NOTE(review): the account id and list token are hard-coded in source;
    # consider moving them to configuration.
    base_url = "https://www.bigdataball.com/wp-admin/admin-ajax.php?action=outofthebox-download"
    account_id = "dbid:AADL0JM6TbjOPoH-7_QmtAYk4iT4-vis0Tk"
    listtoken = "5a58bb7418a59d0ec0a5558a510e959d"

    # Default to yesterday's date in the format the remote filenames use
    if date is None:
        date = (datetime.now() - timedelta(1)).strftime("%m-%d-%Y")
    filename = f"{date}-nba-season-team-feed.xlsx"
    outofthebox_path = f"%2F{filename}"

    # Construct the full URL
    full_url = f"{base_url}&OutoftheBoxpath={outofthebox_path}&lastpath=%2F&account_id={account_id}&listtoken={listtoken}&dl=1"

    # Directory to save the file
    save_dir = "./"
    save_path = os.path.join(save_dir, filename)
    print(save_path)

    # don't redownload if we already have it
    if os.path.exists(save_path):
        return filename

    # Stream the body to a temporary name and rename it only once complete, so
    # an interrupted download is never mistaken for a cached file next time.
    # The timeout keeps a stalled server from hanging the notebook forever.
    partial_path = save_path + ".part"
    with requests.get(full_url, stream=True, timeout=60) as response:
        print(response.status_code)
        if response.status_code == 200:
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=65536):
                    f.write(chunk)
            os.replace(partial_path, save_path)
    return filename

# Named column lists and lookup values used throughout this notebook. The dict
# is dumped to column_mappings.json and read back further down, so every value
# has to stay JSON-serializable (lists of strings, a string, a str -> int map).
column_mappings = {
    "COLS": ['GAME-ID', 'DATE', 'TEAM', '1Q', '2Q', '3Q', '4Q', 'F', 'MIN', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'OR', 'DR', 'TOT', 'A', 'PF', 'ST', 'TO', 'BL', 'PTS', 'POSS', 'PACE', 'OEFF', 'DEFF', 'TEAM_REST_DAYS', 'MAIN REF', 'CREW', 'OPENING ODDS', 'OPENING SPREAD', 'OPENING TOTAL', 'CLOSING_ODDS', 'CLOSING_SPREAD', 'CLOSING_TOTAL', 'MONEYLINE', 'HALFTIME'],
    # NOTE(review): 'cleaned_cols' spells the opening-line columns with spaces
    # ('OPENING ODDS') while 't_cleaned_cols' uses underscores ('OPENING_ODDS');
    # confirm which spelling each consumer expects.
    "cleaned_cols": ['Dataset', 'GAME-ID', 'DATE', 'TEAM', 'VENUE', '1Q', '2Q', '3Q', '4Q', 'OT1', 'OT2', 'OT3', 'OT4', 'OT5', 'F', 'MIN', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'OR', 'DR', 'TOT', 'A', 'PF', 'ST', 'TO', 'TO_TO', 'BL', 'PTS', 'POSS', 'PACE', 'OEFF', 'DEFF', 'TEAM_REST_DAYS', 'STARTER_1', 'STARTER_2', 'STARTER_3', 'STARTER_4', 'STARTER_5', 'MAIN REF', 'CREW', 'OPENING ODDS', 'OPENING SPREAD', 'OPENING TOTAL', 'LINE_MOVEMENT_1', 'LINE_MOVEMENT_2', 'LINE_MOVEMENT_3', 'CLOSING_ODDS', 'CLOSING_SPREAD', 'CLOSING_TOTAL', 'MONEYLINE', 'HALFTIME', 'BOX_SCORE_URL', 'ODDS_URL', 'BIGDATABALL_DATASET', 'FULL_GAME_ODDS_URL', 'CREW_CHIEF', 'REFEREE_UMPIRE'],
    "t_cleaned_cols": ['BIGDATABALL_DATASET', 'GAME-ID', 'DATE', 'TEAM', 'VENUE', '1Q', '2Q', '3Q', '4Q', 'OT1', 'OT2', 'OT3', 'OT4', 'OT5', 'F', 'MIN', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'OR', 'DR', 'TOT', 'A', 'PF', 'ST', 'TO', 'TO_TO', 'BL', 'PTS', 'POSS', 'PACE', 'OEFF', 'DEFF', 'TEAM_REST_DAYS', 'STARTER_1', 'STARTER_2', 'STARTER_3', 'STARTER_4', 'STARTER_5', 'MAIN REF', 'CREW', 'OPENING_ODDS', 'OPENING_SPREAD', 'OPENING_TOTAL', 'LINE_MOVEMENT_1', 'LINE_MOVEMENT_2', 'LINE_MOVEMENT_3', 'CLOSING_ODDS', 'CLOSING_SPREAD', 'CLOSING_TOTAL', 'MONEYLINE', 'HALFTIME', 'BOX_SCORE_URL', 'FULL_GAME_ODDS_URL'],
    "TRAIN_COLS": ['Offensive_Rating', 'Defensive_Rating', 'Opp_Offensive_Rating', 
                   'Opp_Defensive_Rating','Offensive_Var', 'Defensive_Var', 
                   'Opp_Offensive_Var', 'Opp_Defensive_Var', 'Opp_Elo', 
                   'Opp_Momentum', 'Opp_Avg_3_game_DEFF', 'Opp_Avg_5_game_DEFF', 
                   'Opp_Season_Avg_DEFF', 'Opp_Avg_3_game_OEFF', 'Opp_Avg_5_game_OEFF', 
                   'Opp_Season_Avg_OEFF', 'Opp_Avg_3_game_PACE', 'Opp_Avg_5_game_PACE', 
                   'Opp_Season_Avg_PACE', 'Opp_Avg_3_game_POSS', 'Opp_Avg_5_game_POSS', 
                   'Opp_Season_Avg_POSS', 'Avg_3_game_DEFF', 'Avg_5_game_DEFF', 'Season_Avg_DEFF', 
                   'Avg_3_game_OEFF', 'Avg_5_game_OEFF', 'Season_Avg_OEFF', 'Avg_3_game_PACE', 
                   'Avg_5_game_PACE', 'Season_Avg_PACE', 'Avg_3_game_POSS', 'Avg_5_game_POSS', 
                   'Season_Avg_POSS', 'Avg_3_game_OR', 'Avg_5_game_OR', 'Season_Avg_OR', 
                   'Avg_3_game_3P', 'Avg_5_game_3P', 'Season_Avg_3P', 'Avg_3_game_3PA', 
                   'Avg_5_game_3PA', 'Season_Avg_3PA', 'Avg_3_game_TO', 'Avg_5_game_TO', 
                   'Season_Avg_TO', 'Avg_3_game_FT', 'Avg_5_game_FT', 'Season_Avg_FT', 
                   'CLOSING_SPREAD', 'CLOSING_TOTAL', 'MONEYLINE', 'Avg_3_game_PTS', 
                   'Avg_5_game_PTS', 'Season_Avg_PTS', 'Last_ML_1', 'Last_ML_2', 'Last_ML_3',
                     'VENUE', 'TEAM', 'Opponent', 'Win_Loss_Diff', 'Elo_Rating', 
                     'Momentum'],

    "today_mappings": ['Last_ML_1'],
    "TARGET": 'Result',
    # Dataset label -> integer year. NOTE(review): regular seasons appear to
    # map to the year the season ends in; confirm before adding new entries.
    "SEASON_MAP": {'NBA 2021-2022 Regular Season': 2022, 'NBA 2020-2021 Regular Season': 2021, 'NBA 2019-2020 Regular Season': 2020, 'NBA 2022-2023 Regular Season': 2023, 'NBA 2020 Playoffs': 2020, 'NBA 2021 Play-in': 2021, 'NBA 2021 Playoffs': 2021, 'NBA 2023 Play-In': 2023, 'NBA 2022 Play-In': 2022, 'NBA 2023 Playoffs': 2023, 'NBA 2022 Playoffs': 2022, 'NBA 2023-2024 Regular Season': 2024, 'NBA 2023 In-Season Tournament': 2024},
    # NOTE(review): 't_train_cols' and 'train_cols_final' appear to list the
    # same columns in the same order; confirm whether both are still needed.
    "t_train_cols": ['Opp_Elo', 'Opp_Momentum', 'SPREAD_LINE_MOVEMENT_1', 'SPREAD_LINE_MOVEMENT_2', 'SPREAD_LINE_MOVEMENT_3', 'TOTAL_LINE_MOVEMENT_1', 'TOTAL_LINE_MOVEMENT_2', 'TOTAL_LINE_MOVEMENT_3', 'CREW', 'Opp_Avg_3_game_DEFF', 'Opp_Avg_5_game_DEFF', 'Opp_Season_Avg_DEFF', 'Opp_Avg_3_game_OEFF', 'Opp_Avg_5_game_OEFF', 'Opp_Season_Avg_OEFF', 'Opp_Avg_3_game_PACE', 'Opp_Avg_5_game_PACE', 'Opp_Season_Avg_PACE', 'Opp_Avg_3_game_POSS', 'Opp_Avg_5_game_POSS', 'Opp_Season_Avg_POSS','Avg_3_game_DEFF', 'Avg_5_game_DEFF', 'Season_Avg_DEFF', 'Avg_3_game_OEFF', 'Avg_5_game_OEFF', 'Season_Avg_OEFF', 'Avg_3_game_PACE', 'Avg_5_game_PACE', 'Season_Avg_PACE', 'Avg_3_game_POSS', 'Avg_5_game_POSS', 'Season_Avg_POSS', 'CLOSING_SPREAD', 'CLOSING_TOTAL', 'MONEYLINE', 'Avg_3_game_PTS', 'Avg_5_game_PTS', 'Season_Avg_PTS', 'Streak', 'Last_ML_1', 'Last_ML_2', 'Last_ML_3', 'VENUE', 'TEAM', 'Opponent', 'Win_Loss_Diff', 'HOME TEAM WIN%', 'HOME TEAM POINTS DIFFERENTIAL', 'TOTAL POINTS PER GAME', 'CALLED FOULS PER GAME', 'FOUL% AGAINST ROAD TEAMS', 'FOUL% AGAINST HOME TEAMS', 'FOUL DIFFERENTIAL (Against Road Team) - (Against Home Team)', 'Elo_Rating', 'Momentum', 'MAIN REF', 'TEAM_REST_DAYS'],
    "train_cols_final": ['Opp_Elo', 'Opp_Momentum', 'SPREAD_LINE_MOVEMENT_1', 'SPREAD_LINE_MOVEMENT_2', 'SPREAD_LINE_MOVEMENT_3', 'TOTAL_LINE_MOVEMENT_1', 'TOTAL_LINE_MOVEMENT_2', 'TOTAL_LINE_MOVEMENT_3', 'CREW', 'Opp_Avg_3_game_DEFF', 'Opp_Avg_5_game_DEFF', 'Opp_Season_Avg_DEFF', 'Opp_Avg_3_game_OEFF', 'Opp_Avg_5_game_OEFF', 'Opp_Season_Avg_OEFF', 'Opp_Avg_3_game_PACE', 'Opp_Avg_5_game_PACE', 'Opp_Season_Avg_PACE', 'Opp_Avg_3_game_POSS', 'Opp_Avg_5_game_POSS', 'Opp_Season_Avg_POSS','Avg_3_game_DEFF', 'Avg_5_game_DEFF', 'Season_Avg_DEFF', 'Avg_3_game_OEFF', 'Avg_5_game_OEFF', 'Season_Avg_OEFF', 'Avg_3_game_PACE', 'Avg_5_game_PACE', 'Season_Avg_PACE', 'Avg_3_game_POSS', 'Avg_5_game_POSS', 'Season_Avg_POSS', 'CLOSING_SPREAD', 'CLOSING_TOTAL', 'MONEYLINE', 'Avg_3_game_PTS', 'Avg_5_game_PTS', 'Season_Avg_PTS', 'Streak', 'Last_ML_1', 'Last_ML_2', 'Last_ML_3', 'VENUE', 'TEAM', 'Opponent', 'Win_Loss_Diff', 'HOME TEAM WIN%', 'HOME TEAM POINTS DIFFERENTIAL', 'TOTAL POINTS PER GAME', 'CALLED FOULS PER GAME', 'FOUL% AGAINST ROAD TEAMS', 'FOUL% AGAINST HOME TEAMS', 'FOUL DIFFERENTIAL (Against Road Team) - (Against Home Team)', 'Elo_Rating', 'Momentum', 'MAIN REF', 'TEAM_REST_DAYS']
}

# Team nickname -> city/market label for the 30 NBA franchises.
# NOTE(review): the values presumably match the TEAM labels used by the data
# feed (e.g. 'LA Clippers', 'LA Lakers'); confirm against the feed.
team_map = {
    'Hawks': 'Atlanta',
    'Nets': 'Brooklyn',
    'Celtics': 'Boston',
    'Hornets': 'Charlotte',
    'Bulls': 'Chicago',
    'Cavaliers': 'Cleveland',
    'Mavericks': 'Dallas',
    'Nuggets': 'Denver',
    'Pistons': 'Detroit',
    'Warriors': 'Golden State',
    'Rockets': 'Houston',
    'Pacers': 'Indiana',
    'Clippers': 'LA Clippers',
    'Lakers': 'LA Lakers',
    'Grizzlies': 'Memphis',
    'Heat': 'Miami',
    'Bucks': 'Milwaukee',
    'Timberwolves': 'Minnesota',
    'Pelicans': 'New Orleans',
    'Knicks': 'New York',
    'Thunder': 'Oklahoma City',
    'Magic': 'Orlando',
    '76ers': 'Philadelphia',
    'Suns': 'Phoenix',
    'Trail Blazers': 'Portland',
    'Kings': 'Sacramento',
    'Spurs': 'San Antonio',
    'Raptors': 'Toronto',
    'Jazz': 'Utah',
    'Wizards': 'Washington'
}
# Persist the mappings to column_mappings.json, then load them straight back so
# the in-memory dict holds exactly what a later session would read from disk.
with open('column_mappings.json', 'w') as file:
    file.write(json.dumps(column_mappings))

with open('column_mappings.json', 'r') as file:
    column_mappings = json.loads(file.read())

# Expose each entry of the reloaded dict under its own module-level name.
(
    COLS,
    cleaned_cols,
    t_cleaned_cols,
    TRAIN_COLS,
    today_mappings,
    TARGET,
    SEASON_MAP,
    t_train_cols,
    train_cols_final,
) = (
    column_mappings[key]
    for key in (
        'COLS',
        'cleaned_cols',
        't_cleaned_cols',
        'TRAIN_COLS',
        'today_mappings',
        'TARGET',
        'SEASON_MAP',
        't_train_cols',
        'train_cols_final',
    )
)

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

<IPython.core.display.Javascript object>

In [20]:
def american_odds_to_probability(odds):
    """Return the implied win probability for an American odds price.

    Positive prices use ``100 / (odds + 100)``; zero and negative prices use
    ``-odds / (-odds + 100)``. The bookmaker margin is not removed.

    :param odds: American odds, e.g. ``+150`` or ``-110``.
    :return: Implied probability as a float.
    """
    if odds > 0:
        return 100 / (odds + 100)

    # Favourite (or zero) price: the stake needed to win 100 is -odds.
    risk = -odds
    return risk / (risk + 100)

def calculate_profit(odds, size):
    """Return the profit of a winning bet of ``size`` at American ``odds``.

    :param odds: American odds. Positive prices pay ``odds / 100`` per unit
        staked; zero and negative prices pay ``100 / -odds`` per unit, with a
        tiny epsilon added to ``odds`` so a price of 0 does not divide by zero.
    :param size: Stake amount.
    :return: Profit (excluding the returned stake).
    """
    if odds > 0:
        payout_ratio = odds / 100
    else:
        # Epsilon keeps odds == 0 from raising ZeroDivisionError.
        payout_ratio = 100 / -(odds + 0.0000001)
    return payout_ratio * size

def kelly_criterion(bankroll, probability, odds, temper=1):
    """
    Calculate the recommended bet size using the Kelly Criterion.

    :param bankroll: Total amount of money you have to bet with.
    :param probability: The probability of the bet winning (from 0 to 1).
    :param odds: The odds being offered on the bet, in American format
        (e.g. ``+150`` or ``-110``). Every non-zero value is converted to
        decimal odds internally, so decimal odds must NOT be passed here.
    :param temper: Multiplier applied to the Kelly fraction (e.g. ``0.5`` for
        half-Kelly). Defaults to ``1`` (full Kelly).
    :return: The recommended bet size. It is negative when the bet has
        negative expected value, and ``0.0`` when ``odds`` is zero or NaN.
    """
    # Convert American odds to decimal odds
    if odds > 0:
        decimal_odds = (odds / 100) + 1
    elif odds < 0:
        decimal_odds = (100 / -odds) + 1
    else:
        # Zero (or NaN) is not a usable price. Left unconverted it would give
        # b = -1 and a Kelly fraction of 1, i.e. "stake the whole bankroll".
        return 0.0

    # Calculate the Kelly bet fraction
    b = decimal_odds - 1  # Net payout per unit staked
    q = 1 - probability  # Probability of losing
    kelly_fraction = (b * probability - q) / b

    # Scale by the tempering factor and the bankroll
    return (temper * kelly_fraction) * bankroll

def combine_parlay_odds(odds_list):
    """Combine several American odds prices into one parlay price.

    Each leg is converted to a decimal multiplier, the multipliers are
    multiplied together, and the product is converted back to American odds.
    Small epsilons are added to the divisors so that a leg priced at 0 or a
    combined multiplier of exactly 1 does not divide by zero.

    :param odds_list: Iterable of American odds, one per leg.
    :return: The parlay price in American odds as an unrounded number.
    """
    decimal_product = 1
    for leg in odds_list:
        if leg > 0:  # Underdog leg
            leg_multiplier = (leg / 100) + 1
        else:  # Favourite (or zero) leg
            leg_multiplier = 1 - (100 / (leg + 0.0000001))
        decimal_product *= leg_multiplier

    # A combined multiplier of at least 2 is a plus price, otherwise a minus price.
    net_multiplier = decimal_product - 1
    if decimal_product >= 2:
        return net_multiplier * 100
    return -100 / (net_multiplier + 0.00000001)

def get_top_bottom_features_with_scores(pred_contributions, feature_names):
    """Rank features by the magnitude of their contribution to a prediction.

    :param pred_contributions: Iterable of signed contribution values, aligned
        with ``feature_names``.
    :param feature_names: Iterable of feature names.
    :return: ``(top_5, bottom_5)``, each a list of ``(name, contribution)``
        tuples that keep the original sign. ``top_5`` holds the five largest
        absolute contributions and ``bottom_5`` the five smallest, both in
        descending order of absolute value. With fewer than ten features the
        two lists overlap.
    """
    # Pair feature names with their contributions (zip stops at the shorter input)
    feature_contributions = zip(feature_names, pred_contributions)

    # Sort by the absolute value of contributions but keep the original sign
    sorted_features = sorted(feature_contributions, key=lambda pair: abs(pair[1]), reverse=True)

    return sorted_features[:5], sorted_features[-5:]
def print_wrapper(func):
    """Wrap a print-like callable so it accepts an optional ``color`` keyword.

    When ``color`` names one of the known styles, every positional argument is
    converted to a string wrapped in that style's ANSI escape code plus a
    reset code. The ``color`` keyword is always removed before ``func`` is
    called; an unrecognised value leaves the positional arguments untouched.

    :param func: The callable to wrap (typically the built-in ``print``).
    :return: A wrapper with the same call signature plus ``color=...``.
    """
    import functools  # local import keeps this block self-contained

    ansi_reset = '\033[0m'
    ansi_codes = {
        "black": '\033[90m',
        "red": '\033[91m',
        "green": '\033[92m',
        "yellow": '\033[93m',
        "blue": '\033[94m',
        "pink": '\033[95m',
        "teal": '\033[96m',
        "gray": '\033[97m',
        "warning": '\033[31;1;4m',
        "error": '\033[31;100m',
    }

    @functools.wraps(func)
    def wrapped_func(*args, **kwargs):
        # Work on a copy so the caller's kwargs dict is never modified.
        new_kwargs = dict(kwargs)
        if "color" not in new_kwargs:
            return func(*args, **new_kwargs)

        # `color` is consumed here; it is never forwarded to func.
        requested = new_kwargs.pop("color")
        code = ansi_codes.get(requested) if isinstance(requested, str) else None
        if code is None:
            # Unknown color name: print the arguments unstyled.
            return func(*args, **new_kwargs)

        # Apply the ANSI escape codes to each non-keyword argument
        styled_args = tuple(f"{code}{arg}{ansi_reset}" for arg in args)
        return func(*styled_args, **new_kwargs)

    return wrapped_func

# Rebind the module-level name `print` to the wrapped version so that later
# print(...) calls in this module accept an optional color="..." keyword.
# NOTE: executing this line again wraps the already-wrapped function once more.
print = print_wrapper(print) # Apply the wrapper to the print() function

def probability_to_american_odds(probability):
    """Convert a win probability to the equivalent (fair) American odds.

    :param probability: Win probability, strictly between 0 and 1. The
        endpoints have no finite American price and are rejected.
    :return: American odds as an int, rounded to the nearest whole number.
        Negative for probabilities above 0.5, positive below, ``100`` at 0.5.
    :raises ValueError: If ``probability`` is not strictly between 0 and 1
        (this also rejects NaN).
    """
    if not 0 < probability < 1:
        raise ValueError("Probability must be between 0 and 1 (exclusive)")

    if probability == 0.5:
        return 100  # Even odds

    # Round instead of truncating: floating-point error otherwise turns
    # e.g. p=0.6 (-149.99999...) into -149 rather than -150.
    if probability > 0.5:
        return int(round(-100 * (probability / (1 - probability))))
    return int(round(100 * ((1 - probability) / probability)))

def odds_to_str(odds):
    """Format odds for display by prefixing positive values with ``+``.

    :param odds: Odds value.
    :return: ``'+<odds>'`` as a string when ``odds`` is positive. Zero and
        negative values are returned unchanged (NOT converted to ``str``).
    """
    return odds if odds <= 0 else f'+{odds}'
# Module-level state for the Bayesian rating update, keyed by team name.
# update_bayesian_elo_momentum() lazily adds a {'mu', 'sigma2'} entry to
# team_strengths and a float to team_momentum the first time a team is seen.
team_strengths = {}
team_momentum = {}

def update_off_def_ratings(row, ratings, momentum_scores):
    """Apply one game's result to offensive/defensive ratings and momentum.

    ``ratings`` and ``momentum_scores`` are modified in place. Only the
    momentum of ``row['TEAM']`` is changed; the opponent's momentum is read
    and returned as-is.

    :param row: Mapping with keys ``'TEAM'``, ``'Opponent'``,
        ``'actual_points_scored'`` and ``'actual_points_allowed'``.
    :param ratings: ``{team: {'offense': number, 'defense': number}}``.
    :param momentum_scores: ``{team: number}``.
    :return: ``(ratings[team], ratings[opponent], momentum_scores[team],
        momentum_scores[opponent])`` after the update.
    """
    k_factor = 20          # K-factor for Elo adjustment
    loss_damping = 1.2     # Momentum factor applied on the "else" branch
    momentum_rate = 0.1    # Decay factor for momentum

    team, opponent = row['TEAM'], row['Opponent']
    team_entry, opponent_entry = ratings[team], ratings[opponent]

    # Snapshot the pre-game values; the expectations below must not see the
    # updates made further down.
    pre_team_offense = team_entry['offense']
    pre_team_defense = team_entry['defense']
    pre_opponent_offense = opponent_entry['offense']
    pre_opponent_defense = opponent_entry['defense']
    # Both momentum entries are looked up before anything is mutated.
    momentum_before, _opponent_momentum = momentum_scores[team], momentum_scores[opponent]

    # Expected points are the mean of one side's offense and the other's defense.
    expected_scored = (pre_team_offense + pre_opponent_defense) / 2
    expected_allowed = (pre_opponent_offense + pre_team_defense) / 2

    scored = row['actual_points_scored']
    allowed = row['actual_points_allowed']

    # The same shift is applied to the team's offense and the opponent's defense.
    offense_shift = k_factor * (scored - expected_scored) / 100
    team_entry['offense'] += offense_shift
    opponent_entry['defense'] += offense_shift

    # NOTE(review): this shift uses (expected - actual), the opposite sign
    # convention to the offensive shift above, and is also added to the
    # opponent's offense. Confirm the signs are intended.
    defense_shift = k_factor * (expected_allowed - allowed) / 100
    team_entry['defense'] += defense_shift
    opponent_entry['offense'] += defense_shift

    # Momentum uses the gap between the two *updated* offensive ratings.
    offense_gap = abs(ratings[team]['offense'] - ratings[opponent]['offense'])
    if scored > expected_scored:
        momentum_scores[team] += momentum_rate * (momentum_before + offense_gap)
    else:
        momentum_scores[team] -= momentum_rate * (momentum_before - (offense_gap / loss_damping))

    return ratings[team], ratings[opponent], momentum_scores[team], momentum_scores[opponent]

def update_bayesian_off_def(row):
    """Update per-team offensive/defensive rating estimates from one game row.

    Reads ``row['TEAM']``, ``row['Opponent']``, ``row['F']`` (points scored by
    the team) and ``row['actual_points_allowed']`` (points scored by the
    opponent). Mutates the module-level ``team_ratings`` and ``team_momentum``
    dicts in place.

    NOTE(review): ``team_ratings`` is not defined in the visible part of this
    file (only ``team_strengths`` and ``team_momentum`` are); it must exist as
    a module-level dict before this function is called.

    :return: ``(team offense, team defense, team momentum, opponent offense,
        opponent defense, opponent momentum)`` where each offense/defense item
        is the ``{'mu', 'sigma2'}`` dict stored in ``team_ratings``.
    """
    # Parameters
    sigma_prior = 200.0  # Initial standard deviation for ratings
    sigma_obs = 15.0     # Standard deviation of observed points
    momentum_decay = 0.9  # Decay factor for momentum

    team = row['TEAM']
    opponent = row['Opponent']

    # Initialize offensive and defensive ratings if not already done
    if team not in team_ratings:
        team_ratings[team] = {
            'offense': {'mu': 1500.0, 'sigma2': sigma_prior ** 2},
            'defense': {'mu': 1500.0, 'sigma2': sigma_prior ** 2},
        }
        team_momentum[team] = 0.0
    if opponent not in team_ratings:
        team_ratings[opponent] = {
            'offense': {'mu': 1500.0, 'sigma2': sigma_prior ** 2},
            'defense': {'mu': 1500.0, 'sigma2': sigma_prior ** 2},
        }
        team_momentum[opponent] = 0.0

    # Get offensive and defensive strengths
    # (snapshotted here; all deltas below are computed from these pre-game values)
    team_off_mu, team_off_sigma2 = team_ratings[team]['offense']['mu'], team_ratings[team]['offense']['sigma2']
    team_def_mu, team_def_sigma2 = team_ratings[team]['defense']['mu'], team_ratings[team]['defense']['sigma2']
    opp_off_mu, opp_off_sigma2 = team_ratings[opponent]['offense']['mu'], team_ratings[opponent]['offense']['sigma2']
    opp_def_mu, opp_def_sigma2 = team_ratings[opponent]['defense']['mu'], team_ratings[opponent]['defense']['sigma2']

    # Observed points scored by each team
    y_team_scored = row['F']
    y_opp_scored = row['actual_points_allowed']

    # Offensive update: points scored vs opponent defense
    # Prior mean is the average of the two ratings involved.
    mu_team_scored_prior = (team_off_mu + opp_def_mu) / 2
    sigma2_team_scored_prior = (team_off_sigma2 + opp_def_sigma2 + sigma_obs ** 2) / 2

    # Precision-weighted average of the prior mean and the observed score.
    mu_team_scored_post = (
        (mu_team_scored_prior / sigma2_team_scored_prior + y_team_scored / sigma_obs ** 2)
        / (1 / sigma2_team_scored_prior + 1 / sigma_obs ** 2)
    )
    sigma2_team_scored_post = 1 / (1 / sigma2_team_scored_prior + 1 / sigma_obs ** 2)

    delta_off = mu_team_scored_post - mu_team_scored_prior

    # Split the shift between the two ratings in proportion to their variances;
    # both ratings move in the same direction.
    team_ratings[team]['offense']['mu'] += delta_off * (team_off_sigma2 / (team_off_sigma2 + opp_def_sigma2))
    team_ratings[opponent]['defense']['mu'] += delta_off * (opp_def_sigma2 / (team_off_sigma2 + opp_def_sigma2))

    # Defensive update: points allowed vs opponent offense
    mu_team_allowed_prior = (opp_off_mu + team_def_mu) / 2
    sigma2_team_allowed_prior = (opp_off_sigma2 + team_def_sigma2 + sigma_obs ** 2) / 2

    mu_team_allowed_post = (
        (mu_team_allowed_prior / sigma2_team_allowed_prior + y_opp_scored / sigma_obs ** 2)
        / (1 / sigma2_team_allowed_prior + 1 / sigma_obs ** 2)
    )
    sigma2_team_allowed_post = 1 / (1 / sigma2_team_allowed_prior + 1 / sigma_obs ** 2)

    delta_def = mu_team_allowed_post - mu_team_allowed_prior

    team_ratings[team]['defense']['mu'] += delta_def * (team_def_sigma2 / (team_def_sigma2 + opp_off_sigma2))
    team_ratings[opponent]['offense']['mu'] += delta_def * (opp_off_sigma2 / (team_def_sigma2 + opp_off_sigma2))

    # Update variances
    # Each rating's variance is overwritten with half of the matching posterior
    # variance, regardless of what that rating's own variance was before.
    team_ratings[team]['offense']['sigma2'] = sigma2_team_scored_post / 2
    team_ratings[opponent]['defense']['sigma2'] = sigma2_team_scored_post / 2
    team_ratings[team]['defense']['sigma2'] = sigma2_team_allowed_post / 2
    team_ratings[opponent]['offense']['sigma2'] = sigma2_team_allowed_post / 2

    # Update momentum
    # Both teams receive the same increment: |delta_off + delta_def|.
    team_momentum[team] = momentum_decay * team_momentum[team] + abs(delta_off + delta_def)
    team_momentum[opponent] = momentum_decay * team_momentum[opponent] + abs(delta_off + delta_def)

    return (
        team_ratings[team]['offense'], team_ratings[team]['defense'], team_momentum[team],
        team_ratings[opponent]['offense'], team_ratings[opponent]['defense'], team_momentum[opponent]
    )



def update_bayesian_elo_momentum(row):
    """Update team strength estimates and momentum from one game row.

    The opponent is the other row of the module-level ``infer_df`` that shares
    ``row['GAME-ID']``. The observed outcome is the point differential
    ``row['F'] - opponent 'F'``. The module-level ``team_strengths`` and
    ``team_momentum`` dicts are mutated in place; unseen teams start at
    ``mu=1500`` with variance ``200 ** 2`` and momentum ``0``.

    :param row: Row with ``'TEAM'``, ``'GAME-ID'`` and ``'F'``.
    :return: ``(team mu, team sigma2, team momentum, opponent mu,
        opponent sigma2, opponent momentum)`` after the update.
    :raises IndexError: If ``infer_df`` has no other team for this game id.
    """
    df = infer_df

    # Parameters
    sigma_prior = 200.0  # Initial standard deviation of team strengths
    sigma_obs = 15.0     # Standard deviation of the observed point differential
    momentum_decay = 0.9  # Decay factor for momentum

    team = row['TEAM']
    # Scan the frame once and reuse the match for both the opponent's name and
    # score (previously the same boolean filter was evaluated twice per row).
    opponent_rows = df[(df['GAME-ID'] == row['GAME-ID']) & (df['TEAM'] != team)]
    opponent = opponent_rows['TEAM'].values[0]

    # Initialize team strengths if not already done
    for name in (team, opponent):
        if name not in team_strengths:
            team_strengths[name] = {'mu': 1500.0, 'sigma2': sigma_prior ** 2}
            team_momentum[name] = 0.0

    # Get team strengths
    team_mu, team_sigma2 = team_strengths[team]['mu'], team_strengths[team]['sigma2']
    opponent_mu, opponent_sigma2 = team_strengths[opponent]['mu'], team_strengths[opponent]['sigma2']

    # Observed outcome (point differential)
    y = row['F'] - opponent_rows['F'].values[0]

    # Prior difference in strengths
    obs_var = sigma_obs ** 2
    mu_diff_prior = team_mu - opponent_mu
    sigma2_diff_prior = team_sigma2 + opponent_sigma2 + obs_var

    # Bayesian update: precision-weighted combination of prior and observation
    posterior_precision = 1 / sigma2_diff_prior + 1 / obs_var
    mu_diff_post = (mu_diff_prior / sigma2_diff_prior + y / obs_var) / posterior_precision
    sigma2_diff_post = 1 / posterior_precision

    # Compute the update amount
    delta_mu = mu_diff_post - mu_diff_prior

    # Update team strengths (split the update between both teams by variance share)
    total_sigma2 = team_sigma2 + opponent_sigma2
    team_strengths[team]['mu'] += delta_mu * (team_sigma2 / total_sigma2)
    team_strengths[opponent]['mu'] -= delta_mu * (opponent_sigma2 / total_sigma2)

    # Update variances (simplified: both teams get half the posterior variance)
    team_strengths[team]['sigma2'] = sigma2_diff_post / 2
    team_strengths[opponent]['sigma2'] = sigma2_diff_post / 2

    # Update momentum (both teams get the same |delta_mu| increment)
    team_momentum[team] = momentum_decay * team_momentum[team] + abs(delta_mu)
    team_momentum[opponent] = momentum_decay * team_momentum[opponent] + abs(delta_mu)

    return (team_strengths[team]['mu'], team_strengths[team]['sigma2'], team_momentum[team],
            team_strengths[opponent]['mu'], team_strengths[opponent]['sigma2'], team_momentum[opponent])


def update_elo_momentum(row):
    """Update one team's Elo rating and momentum from a game row.

    The opponent is the other row of the module-level ``infer_df`` with the
    same ``'GAME-ID'``. Only the entries for ``row['TEAM']`` in the
    module-level ``elo_ratings`` and ``momentum_scores`` are modified; the
    opponent's entries are read and returned unchanged.

    :param row: Row with ``'TEAM'``, ``'GAME-ID'`` and ``'spread_result'``
        (used as the actual outcome; a value equal to 1 counts as a win).
    :return: ``(team elo, team momentum, opponent elo, opponent momentum)``.
    """
    games = infer_df

    k_factor = 20          # K-factor in Elo rating
    loss_damping = 1.2     # m factor in momentum
    momentum_rate = 0.1    # Decay factor for momentum

    team = row['TEAM']
    same_game = games['GAME-ID'] == row['GAME-ID']
    other_team = games['TEAM'] != team
    opponent = games[same_game & other_team]['TEAM'].values[0]

    # Pre-game values; the opponent's are never written, so these snapshots
    # are also what gets returned for the opponent.
    team_elo, opponent_elo = elo_ratings[team], elo_ratings[opponent]
    team_momentum_before, opponent_momentum = momentum_scores[team], momentum_scores[opponent]

    # Expected outcome under the standard Elo logistic curve
    expected_team = 1 / (1 + 10 ** ((opponent_elo - team_elo) / 400))

    # Actual outcome
    actual_team = row['spread_result']

    # Update the team's Elo rating only
    elo_ratings[team] = team_elo + k_factor * (actual_team - expected_team)

    # Momentum moves with the pre-game rating gap
    elo_gap = abs(opponent_elo - team_elo)
    if actual_team == 1:
        momentum_scores[team] = momentum_rate * (team_momentum_before + elo_gap)
    else:
        momentum_scores[team] = momentum_rate * (team_momentum_before - (elo_gap / loss_damping))

    return elo_ratings[team], momentum_scores[team], opponent_elo, opponent_momentum

def parse_referee_data(html_content):
    """Extract referee crews per team from an HTML assignments table.

    The first ``<table class="table">`` in the document is read. Its first
    row is skipped as a header. For every remaining row with at least four
    cells, the first cell is the game (``"<team> @ <team>"``) and the next
    three are crew chief, referee and umpire.

    :param html_content: HTML markup to parse.
    :return: Dict mapping each team string of each game to
        ``[crew_chief, referee, umpire]``. Empty when the page has no matching
        table (previously this case raised ``AttributeError``).
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table', class_='table')
    if table is None:
        # No assignments table on the page: nothing to report.
        return {}

    ref_data = {}
    for row in table.find_all('tr')[1:]:  # Skipping the header row
        columns = row.find_all('td')
        if len(columns) < 4:
            continue  # Skip rows that don't have enough columns

        game, crew_chief, referee, umpire = (
            cell.get_text(strip=True) for cell in columns[:4]
        )

        # Split the game into two teams
        teams = game.split(' @ ')
        if len(teams) != 2:
            continue  # Skip if format is not as expected

        # Each team gets its own list so callers can modify one independently.
        for team_name in teams:
            ref_data[team_name] = [crew_chief, referee, umpire]

    return ref_data

def remove_ref_keys(data_dict):
    """Return a copy of ``data_dict`` without entries whose first item is ``'REF'``.

    :param data_dict: Mapping whose values are indexable sequences.
    :return: New dict containing only entries where ``value[0] != 'REF'``.
    """
    kept = {}
    for key, value in data_dict.items():
        if value[0] != 'REF':
            kept[key] = value
    return kept

# Step 1: Result of the Game
def assign_results(group):
    """Add result flags and opponent rolling-average columns to a two-row game.

    ``group`` must hold exactly the two team rows of one game. It is modified
    in place and returned.

    Added columns:

    * ``spread_result`` - own final score plus own ``CLOSING_SPREAD`` is
      greater than the other row's final score.
    * ``ml_result`` - own final score is greater than the other row's.
    * ``total_result`` - combined final score is at least the FIRST row's
      ``CLOSING_TOTAL`` (same value on both rows).
    * ``Opp_<window>_<stat>`` - the other row's value of
      ``<window>_<stat>`` for every window in ``Avg_3_game``, ``Avg_5_game``,
      ``Season_Avg`` and every stat in PTS, POSS, PACE, DEFF, OEFF, OR, 3P,
      3PA, TO, FT.

    :param group: Two-row DataFrame with ``F``, ``CLOSING_SPREAD``,
        ``CLOSING_TOTAL`` and the rolling-average columns listed above.
    :return: The same ``group`` object with the new columns.
    """
    first, second = group.iloc[0], group.iloc[1]

    group['spread_result'] = [
        first['F'] + first['CLOSING_SPREAD'] > second['F'],
        second['F'] + second['CLOSING_SPREAD'] > first['F'],
    ]
    group['ml_result'] = [first['F'] > second['F'], second['F'] > first['F']]

    went_over = first['F'] + second['F'] >= first['CLOSING_TOTAL']
    group['total_result'] = [went_over, went_over]

    # Re-read the rows now that the flag columns exist, so the values copied
    # below come from the same row view the per-column lookups would give.
    first, second = group.iloc[0], group.iloc[1]

    # Column order matters to downstream code, so iterate stat-major exactly
    # as the columns were historically created.
    stats = ('PTS', 'POSS', 'PACE', 'DEFF', 'OEFF', 'OR', '3P', '3PA', 'TO', 'FT')
    windows = ('Avg_3_game', 'Avg_5_game', 'Season_Avg')
    for stat in stats:
        for window in windows:
            column = f'{window}_{stat}'
            # Each row receives the other row's value.
            group[f'Opp_{column}'] = [second[column], first[column]]

    return group


def assign_opp_elo(group):
    """Attach each row's opponent Elo rating and momentum to a two-row game.

    ``group`` must contain exactly the two team rows of one game. It is
    modified in place and returned.

    :param group: Two-row DataFrame with ``Elo_Rating`` and ``Momentum``.
    :return: ``group`` with ``Opp_Elo`` and ``Opp_Momentum`` added, each row
        holding the other row's value.
    """
    column_pairs = (('Opp_Elo', 'Elo_Rating'), ('Opp_Momentum', 'Momentum'))
    for target, source in column_pairs:
        # Swap the two rows' values: row 0 gets row 1's value and vice versa.
        group[target] = [group.iloc[1][source], group.iloc[0][source]]
    return group

# Step 3: Win/Loss Streak
def calculate_streak(group):
    """Add a running win/loss ``Streak`` column based on ``Prev_Result``.

    Walking the rows in order, a value equal to 1 extends a positive streak
    (or restarts it at 1); any other value, including NaN, extends a negative
    streak (or restarts it at -1).

    :param group: DataFrame with a ``Prev_Result`` column; modified in place.
    :return: ``group`` with the ``Streak`` column added.
    """
    running = 0
    history = []
    for outcome in group['Prev_Result']:
        if outcome == 1:
            # Continue a winning run, or start one at +1.
            running = max(running, 0) + 1
        else:
            # Continue a losing run, or start one at -1.
            running = min(running, 0) - 1
        history.append(running)
    group['Streak'] = history
    return group

In [3]:
# Date strings in the %m-%d-%Y format used by the feed's filenames.
yesterday = (datetime.now() - timedelta(1)).strftime("%m-%d-%Y")
two_days_ago = (datetime.now() - timedelta(2)).strftime("%m-%d-%Y")
# Fetch yesterday's feed (skipped when the file is already on disk) and keep
# the filename it is saved under.
TODAY_FILE = download_current_data(date=yesterday)

./01-02-2025-nba-season-team-feed.xlsx


In [18]:
# Load the two team-level workbooks from the working directory.
# NOTE(review): both filenames say CBB, whereas the file downloaded by
# download_current_data() is an NBA feed; confirm these are the intended inputs.
df = pd.read_excel('2023-2024_CBB_Box_Score_Team-Stats.xlsx')
df2 = pd.read_excel('12-15-2024-cbb-season-team-feed.xlsx')

# concatenate the two dataframes
# (each frame keeps its own row index, so index values can repeat in raw_df)
raw_df = pd.concat([df, df2])
#raw_df.columns = cleaned_cols

""" Add additional columns to games df """
#raw_df['Season'] = raw_df['BIGDATABALL_DATASET'].map(SEASON_MAP)
raw_df['DATE'] = raw_df['DATE'].astype('datetime64[ns]')

# Normalize column names: spaces become underscores, newlines are removed,
# and '#' characters are removed.
# NOTE(review): the displayed column list shows 'LINE_MOVEMENT_2' and
# 'LINE_MOVEMENT_3' twice after this cleanup, i.e. duplicate column labels.
raw_df.columns = raw_df.columns.str.replace(' ', '_').str.replace('\n', '')
raw_df.columns = raw_df.columns.str.replace('#', '')

# Independent copy read (as the global `infer_df`) by the rating update functions.
infer_df = raw_df.copy(deep=True)
infer_df.columns


Index(['BIGDATABALLDATASET', 'GAME-ID', 'DATE', 'TEAM',
       'ASSOCIATION_&DIVISION', 'CONFERENCE', 'ARENA&_STATE', 'VENUE', '1H',
       '2H', 'OTTOTAL', 'F', 'MIN', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA',
       'OR', 'DR', 'TOT', 'A', 'PF', 'ST', 'TO', 'BL', 'PTS', 'POSS', 'PACE',
       'OEFF', 'DEFF', 'STARTING_LINEUPS', 'Unnamed:_33', 'Unnamed:_34',
       'Unnamed:_35', 'Unnamed:_36', 'OPENING_ODDS', 'OPENING_SPREAD',
       'OPENING_TOTAL', 'LINE_MOVEMENT_1', 'LINE_MOVEMENT_2',
       'LINE_MOVEMENT_3', 'CLOSINGODDS', 'CLOSING_SPREAD', 'CLOSING_TOTAL',
       'CLOSINGMONEYLINE', 'LINE_MOVEMENT1', 'LINE_MOVEMENT_2',
       'LINE_MOVEMENT_3'],
      dtype='object')

In [23]:
train_df = raw_df.copy(deep=True)

# Initialize Elo ratings and momentum scores: every team starts at 1500 / 0.
elo_ratings = {team: 1500 for team in train_df['TEAM'].unique()}
momentum_scores = {team: 0 for team in train_df['TEAM'].unique()}

# Per-team form features for each box-score stat:
#   Avg_3_game_<stat> / Avg_5_game_<stat>: mean of the previous 3 / 5 rows of
#       that team (shift(1) excludes the current game; the first rows of each
#       team are NaN until the window is full).
#   Season_Avg_<stat>: mean over ALL of the team's rows in train_df.
# NOTE(review): Season_Avg_* therefore includes the current and later games,
# and spans every season present in raw_df -- confirm this is intended.
# NOTE(review): the rolling windows follow the current row order; the sort by
# DATE only happens at the end of this cell, so the input is assumed to be
# chronological already.
FORM_STATS = ['PTS', 'POSS', 'PACE', 'OEFF', 'DEFF', 'OR', '3P', '3PA', 'TO', 'FT']
for stat_col in FORM_STATS:
    for window in (3, 5):
        train_df[f'Avg_{window}_game_{stat_col}'] = train_df.groupby('TEAM')[stat_col].transform(
            lambda x, w=window: x.shift(1).rolling(w).mean()
        )
    train_df[f'Season_Avg_{stat_col}'] = train_df.groupby('TEAM')[stat_col].transform('mean')

# Apply the function to each game group
train_df = train_df.groupby('GAME-ID').apply(assign_results)

# Reset index
train_df.reset_index(drop=True, inplace=True)


# Shift the Result column for streak calculation
#train_df['Prev_Result'] = train_df.groupby(['TEAM'])['Result'].shift()

# Calculate Streaks
#train_df = train_df.groupby(['TEAM']).apply(calculate_streak)
#display(train_df)
#train_df = train_df.reset_index(drop=True)

train_df['MONEYLINE'] = train_df['CLOSINGMONEYLINE']
# Last 3 Games Moneylines
for games_back in (1, 2, 3):
    train_df[f'Last_ML_{games_back}'] = train_df.groupby('TEAM')['MONEYLINE'].shift(games_back)

# Current Number of Wins - Losses
# Wins is the running sum of ml_result per team. Losses must be measured
# against the games played SO FAR (running count of non-missing results), not
# against the team's total game count over the whole frame -- the latter
# overstates losses on every row except the team's last one.
# NOTE(review): both running totals include the current row's result.
train_df['Wins'] = train_df.groupby('TEAM')['ml_result'].cumsum()
games_played = train_df['ml_result'].notna().astype(int).groupby(train_df['TEAM']).cumsum()
train_df['Losses'] = games_played - train_df['Wins']
train_df['Win_Loss_Diff'] = train_df['Wins'] - train_df['Losses']
#display(train_df)
# Current Opponent: the next row's TEAM within the same game, falling back to
# the previous row's TEAM for the last row of the game.
train_df['Opponent'] = train_df.groupby('GAME-ID')['TEAM'].shift(-1).fillna(train_df.groupby('GAME-ID')['TEAM'].shift())
#display(train_df)

# Merge Ref Data
#train_df = train_df.merge(refs.groupby('REFEREE').first(), how='left', left_on='MAIN REF', right_on='REFEREE')

# sort by date
train_df = train_df.sort_values('DATE')

# Clean up
#train_df.drop('Prev_Result', axis=1, inplace=True)

In [24]:
# Renumber the rows, then order the frame chronologically.
train_df = train_df.reset_index(drop=True).sort_values(by='DATE')

# Within each game, hand every row the F values in reverse row order; for a
# two-row game that is the other team's F.
train_df['actual_points_allowed'] = (
    train_df.groupby('GAME-ID')['F'].transform(lambda scores: scores.values[::-1])
)

In [25]:

# Apply the Elo / Momentum function row by row, in the current row order.
# update_bayesian_elo_momentum is defined elsewhere; result_type='expand'
# spreads its list-like return value across the six target columns.
# NOTE(review): presumably the helper keeps running per-team state between
# rows, which would make the row order significant -- confirm.
train_df[['Elo_Rating', 'Elo_Var', 'Momentum', 'Opp_Elo', 'Opp_Elo_Var', 'Opp_Momentum']] = train_df.apply(update_bayesian_elo_momentum, axis=1, result_type='expand')
train_df.reset_index(drop=True, inplace=True)
# Overwrite Opp_Elo / Opp_Momentum with the other row's Elo_Rating / Momentum
# from the same GAME-ID (assign_opp_elo expects two rows per game).
train_df = train_df.groupby('GAME-ID').apply(assign_opp_elo)
# Start the second pass from empty dictionaries.
# NOTE(review): presumably these are the module-level stores that
# update_bayesian_off_def reads and updates -- verify against its definition.
team_ratings = {}
team_momentum = {}
train_df.reset_index(drop=True, inplace=True)
# Apply the offense / defense rating function to every row and expand each
# returned sequence into positional columns 0..5.
# NOTE(review): the indexing below assumes positions 0, 1, 3 and 4 hold
# mappings with 'mu' and 'sigma2' keys and positions 2 and 5 hold scalars.
expanded_cols = train_df.apply(update_bayesian_off_def, axis=1).apply(pd.Series)
# Assign specific keys (including offense, defense, and their variances) to the appropriate columns
train_df['Offensive_Rating'] = expanded_cols.apply(lambda x: x[0]['mu'], axis=1)
train_df['Offensive_Var'] = expanded_cols.apply(lambda x: x[0]['sigma2'], axis=1)
train_df['Defensive_Rating'] = expanded_cols.apply(lambda x: x[1]['mu'], axis=1)
train_df['Defensive_Var'] = expanded_cols.apply(lambda x: x[1]['sigma2'], axis=1)

# This replaces the Momentum column written by the Elo pass above.
train_df['Momentum'] = expanded_cols.apply(lambda x: x[2], axis=1)

train_df['Opp_Offensive_Rating'] = expanded_cols.apply(lambda x: x[3]['mu'], axis=1)
train_df['Opp_Offensive_Var'] = expanded_cols.apply(lambda x: x[3]['sigma2'], axis=1)
train_df['Opp_Defensive_Rating'] = expanded_cols.apply(lambda x: x[4]['mu'], axis=1)
train_df['Opp_Defensive_Var'] = expanded_cols.apply(lambda x: x[4]['sigma2'], axis=1)
# This replaces the Opp_Momentum column written by assign_opp_elo above.
train_df['Opp_Momentum'] = expanded_cols.apply(lambda x: x[5], axis=1)


In [None]:
# Spot-check the rating columns on the last 20 rows of the frame.
train_df[
    [
        'TEAM', 'Opponent', 'Elo_Rating',
        'Offensive_Rating', 'Defensive_Rating', 'Momentum',
        'Opp_Offensive_Rating', 'Opp_Defensive_Rating', 'Opp_Momentum',
    ]
].tail(20)

Unnamed: 0,TEAM,Opponent,Elo_Rating,Offensive_Rating,Defensive_Rating,Momentum,Opp_Offensive_Rating,Opp_Defensive_Rating,Opp_Momentum
16747,Washington Huskies,Seattle Pacific Falcons,1515.986244,11.581177,-12.93536,1811.067122,689.338203,692.599409,1659.155019
16748,Providence Friars,Oklahoma Sooners,1508.978667,92.10189,75.693389,129.015878,74.900436,40.689936,189.357021
16749,Oklahoma Sooners,Providence Friars,1513.927846,74.900436,40.689936,177.895498,92.10189,75.693389,123.588469
16750,UTSA Roadrunners,North Dakota Fighting Hawks,1502.263158,97.994693,97.32861,573.444093,39.609851,49.766703,1726.668488
16751,North Dakota Fighting Hawks,UTSA Roadrunners,1495.05178,39.609851,49.766703,1561.132934,97.994693,97.32861,523.230978


In [38]:
# Work on an independent (deep) copy so train_df itself is left untouched.
X = train_df.copy(deep=True)

def offset_cols(group):
    """Add the previous row's Elo rating and momentum to one team's rows.

    Creates ``prev_elo`` and ``prev_mom`` as ``Elo_Rating`` and ``Momentum``
    shifted down by one row, so the first row of the group is NaN. The source
    columns are left as they are. The frame is modified in place and also
    returned.
    """
    lagged_sources = (('prev_elo', 'Elo_Rating'), ('prev_mom', 'Momentum'))
    for lagged_name, source_name in lagged_sources:
        group[lagged_name] = group[source_name].shift()

    return group

def apply_prev_elo(group):
    """Replace one team's Elo rating and momentum with the previous row's values.

    ``Elo_Rating`` and ``Momentum`` are both shifted down by one row, so the
    first row of the group becomes NaN. The frame is modified in place and
    also returned.
    """
    rating_cols = ['Elo_Rating', 'Momentum']
    lagged = group[rating_cols].shift(periods=1)
    for name in rating_cols:
        group[name] = lagged[name]

    return group

def assign_opp_elo_mom(group):
    """Give each row of a two-row game the other row's Elo rating and momentum.

    Writes ``Opp_Elo`` from ``Elo_Rating`` and ``Opp_Momentum`` from
    ``Momentum``, swapping the values between the group's first and second
    rows. The frame is modified in place and also returned.
    """
    swaps = (('Opp_Elo', 'Elo_Rating'), ('Opp_Momentum', 'Momentum'))
    for opp_name, own_name in swaps:
        group[opp_name] = [group.iloc[1][own_name], group.iloc[0][own_name]]
    return group



# Lag the ratings per team so each row carries the values from the team's
# previous row, then re-derive the opponent's (lagged) values per game.
X = X.groupby(['TEAM']).apply(offset_cols)
X = X.reset_index(drop=True)

X = X.groupby(['TEAM']).apply(apply_prev_elo)
X = X.reset_index(drop=True)

X = X.groupby(['GAME-ID']).apply(assign_opp_elo_mom)
X = X.reset_index(drop=True)

y = X['spread_result']
# .copy() makes X an independent frame; assigning columns onto a bare column
# selection triggers pandas' chained-assignment warnings.
X = X[TRAIN_COLS + ['spread_result', 'ml_result', 'total_result', 'DATE', 'PTS', 'POSS', 'OEFF', 'DEFF', 'PACE', 'GAME-ID', 'Elo_Var','Opp_Elo_Var']].copy()

ML_COLS = ['MONEYLINE', 'Last_ML_1', 'Last_ML_2', 'Last_ML_3']


def _moneyline_to_numeric(series):
    """Convert a moneyline column to numbers, keeping the sign.

    Every value is rendered as text first, so numeric cells in an object
    column are parsed instead of being lost by the ``.str`` accessor. All
    characters except digits, '.' and '-' are then removed with an explicit
    regex (``regex=True``), so the result does not depend on the pandas
    version's default and '-150' stays negative. Missing inputs become 0;
    text that still cannot be parsed becomes NaN.
    """
    as_text = series.astype(str).str.replace(r'[^0-9.\-]', '', regex=True)
    parsed = pd.to_numeric(as_text, errors='coerce')
    return parsed.mask(series.isna(), 0)


# 'even' lines are encoded as -100 before the numeric conversion.
X[ML_COLS] = X[ML_COLS].replace('even', '-100', regex=True)

# Replace non-numeric characters before converting to numbers.
for ml_col in ML_COLS:
    X[ml_col] = _moneyline_to_numeric(X[ml_col])
#X['MAIN REF'] = X['MAIN REF'].astype('category')
#X['CREW'] = X['CREW'].astype('category')
#X['TEAM_REST_DAYS'] = X['TEAM_REST_DAYS'].astype('category')
X['TEAM'] = X['TEAM'].astype('category')
X['Opponent'] = X['Opponent'].astype('category')
# 1 when VENUE == 'H', otherwise 0.
X['VENUE'] = (X['VENUE'] == 'H')*1

# get current date
today = datetime.now().strftime('%Y-%m-%d')

X.to_csv(f'2024_2025_ncaa_team_full_{today}.csv', index=False)