In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import xgboost as xgb
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils import resample
import matplotlib.pyplot as plt
from IPython.display import display, HTML
from ccb_model import BootstrapCalibratedClassifier

# =============================================
# Imports and Initial Setup
# =============================================
# =============================================
# Imports and Initial Setup
# =============================================

import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from itertools import combinations, product

# For RL agent and environment
import gymnasium as gym
from gymnasium import spaces
from sb3_contrib import MaskablePPO
from sb3_contrib.common.maskable.policies import MaskableMultiInputActorCriticPolicy
from sb3_contrib.common.wrappers import ActionMasker
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.utils import set_random_seed
import tensorboard
# List of columns where NaNs are allowed
allowed_nan_columns = [
    'TOTAL_LINE_MOVEMENT_3',
    'SPREAD_LINE_MOVEMENT_3',
    'SPREAD_LINE_MOVEMENT_1',
    'TOTAL_LINE_MOVEMENT_2',
    'SPREAD_LINE_MOVEMENT_2',
    'TOTAL_LINE_MOVEMENT_1',
    'HOME TEAM WIN%',
    'CREW',
    'FOUL DIFFERENTIAL (Against Road Team) - (Against Home Team)',
    'HOME TEAM POINTS DIFFERENTIAL',
    'MAIN REF',
    'FOUL% AGAINST HOME TEAMS',
    'TOTAL POINTS PER GAME',
    'CALLED FOULS PER GAME',
    'FOUL% AGAINST ROAD TEAMS'
]


SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x20563c8ee90>

### implementation

#### helpers

In [3]:
def calculate_profit(odds, size):
    if odds > 0:
        profit = (odds / 100) * size
    else:
        profit = (100 / -(odds + 0.0000001)) * size
    return profit

def kelly_criterion(bankroll, probability, odds, temper=1):
    """
    Calculate the optimal bet size using the Kelly Criterion.

    :param bankroll: Total amount of money you have to bet with.
    :param probability: The probability of the bet winning (from 0 to 1).
    :param odds: The odds being offered on the bet (in decimal format).
    :return: The recommended bet size according to the Kelly Criterion.
    """
    # Convert American odds to decimal if necessary
    if odds > 0:
        odds = (odds / 100) + 1
    elif odds < 0:
        odds = (100 / -odds) + 1

    # Calculate the Kelly bet fraction
    b = odds - 1  # Decimal odds minus 1
    q = 1 - probability  # Probability of losing
    kelly_fraction = (b * probability - q) / b

    # Calculate the recommended bet
    recommended_bet = (temper * kelly_fraction) * bankroll

    return recommended_bet

### load data

In [4]:
# get today's date
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')
yesterday = (datetime.today() - pd.DateOffset(1)).strftime('%Y-%m-%d')
two_days_ago = (datetime.today() - pd.DateOffset(2)).strftime('%Y-%m-%d')
three_days_ago = (datetime.today() - pd.DateOffset(3)).strftime('%Y-%m-%d')
df = pd.read_csv(f'2024_2025_nba_team_full_{three_days_ago}.csv')

# Dropping rows with NaN values, except in specified columns
#df = df.dropna(subset=[col for col in df.columns if col not in allowed_nan_columns])

# flip true and false in ml_result column
df['ml_result'] = df['ml_result'].apply(lambda x: True if x == False else False)

# convert categorical columns
df['MAIN REF'] = df['MAIN REF'].astype('category')
df['CREW'] = df['CREW'].astype('category')
df['TEAM_REST_DAYS'] = df['TEAM_REST_DAYS'].astype('category')
df['TEAM'] = df['TEAM'].astype('category')
df['Opponent'] = df['Opponent'].astype('category')

# convert venue to binary
df['VENUE'] = (df['VENUE'] == 'H')*1

# convert date to datetime
df['DATE'] = pd.to_datetime(df['DATE'])

### train / test / val split

In [5]:
X = df.drop(columns=['spread_result', 'ml_result', 'total_result', 'q3_result', 'DATE', 'POSS', 'OEFF', 'DEFF', 'PACE', 'PTS'])
X_rl = df.drop(columns=['q3_result', 'POSS', 'OEFF', 'DEFF', 'PACE', 'PTS'])
y_ml = df['ml_result']
y_spread = df['spread_result']
y_q3 = df['q3_result']
y_total = df['total_result']

X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(X, y_ml, test_size=0.2, random_state=42)
X_train_ml, X_val_ml, y_train_ml, y_val_ml = train_test_split(X_train_ml, y_train_ml, test_size=0.1, random_state=41)

X_train_spread, X_test_spread, y_train_spread, y_test_spread = train_test_split(X, y_spread, test_size=0.2, random_state=42)
X_train_spread, X_val_spread, y_train_spread, y_val_spread = train_test_split(X_train_spread, y_train_spread, test_size=0.1, random_state=41)

X_train_q3, X_test_q3, y_train_q3, y_test_q3 = train_test_split(X, y_q3, test_size=0.2, random_state=42)
X_train_q3, X_val_q3, y_train_q3, y_val_q3 = train_test_split(X_train_q3, y_train_q3, test_size=0.1, random_state=41)

X_train_total, X_test_total, y_train_total, y_test_total = train_test_split(X, y_total, test_size=0.2, random_state=42)
X_train_total, X_val_total, y_train_total, y_val_total = train_test_split(X_train_total, y_train_total, test_size=0.1, random_state=41)

### train

In [6]:
spread_model = BootstrapCalibratedClassifier(n_bootstrap_samples=5)
spread_model.load_model('spread_model')

ml_model = BootstrapCalibratedClassifier(n_bootstrap_samples=5)
ml_model.load_model('ml_model')

total_model = BootstrapCalibratedClassifier(n_bootstrap_samples=5)
total_model.load_model('total_model')


Models loaded from spread_model
Models loaded from ml_model
Models loaded from total_model


### backtesting

In [7]:
drop_cols = ['spread_result', 'ml_result', 'total_result', 'GAME-ID', 'DATE']

In [8]:
def odds_to_implied_prob(odds):
    # Convert American odds to implied probability
    if odds > 0:
        return 100 / (odds + 100)
    else:
        return abs(odds) / (abs(odds) + 100)

In [9]:
# create a column in X_train_ml that is the predicted probability of the model
df_rl = X_rl = df.drop(columns=['q3_result', 'POSS', 'OEFF', 'DEFF', 'PACE', 'PTS']).copy(deep=True)
ml_preds = ml_model.predict_proba_raw(df_rl.drop(drop_cols, axis=1))[:,:,1].T
spread_preds = spread_model.predict_proba_raw(df_rl.drop(drop_cols, axis=1))[:,:,1].T
total_preds = total_model.predict_proba_raw(df_rl.drop(drop_cols, axis=1))[:,:,1].T
# X_rl['spread_prob'] = spread_model.predict_proba(df_rl.drop(drop_cols, axis=1))[:, 1]
# X_rl['total_prob'] = total_model.predict_proba(df_rl.drop(drop_cols, axis=1))[:, 1]
X_rl[['ml_prob1', 'ml_prob2','ml_prob3','ml_prob4','ml_prob5',]] = ml_preds
X_rl[['spread_prob1', 'spread_prob2','spread_prob3','spread_prob4','spread_prob5',]] = spread_preds
X_rl[['total_prob1', 'total_prob2','total_prob3','total_prob4','total_prob5',]] = total_preds

# add a mean and variance column for each model
X_rl['ml_mean'] = X_rl[['ml_prob1', 'ml_prob2','ml_prob3','ml_prob4','ml_prob5',]].mean(axis=1)
X_rl['ml_var'] = X_rl[['ml_prob1', 'ml_prob2','ml_prob3','ml_prob4','ml_prob5',]].var(axis=1)
X_rl['spread_mean'] = X_rl[['spread_prob1', 'spread_prob2','spread_prob3','spread_prob4','spread_prob5',]].mean(axis=1)
X_rl['spread_var'] = X_rl[['spread_prob1', 'spread_prob2','spread_prob3','spread_prob4','spread_prob5',]].var(axis=1)
X_rl['total_mean'] = X_rl[['total_prob1', 'total_prob2','total_prob3','total_prob4','total_prob5',]].mean(axis=1)
X_rl['total_var'] = X_rl[['total_prob1', 'total_prob2','total_prob3','total_prob4','total_prob5',]].var(axis=1)

# create implied odds columns
X_rl['SPREAD_LINE'] = -110
X_rl['TOTAL_LINE'] = -110
X_rl['implied_ml_odds'] = X_rl['MONEYLINE'].apply(odds_to_implied_prob)
X_rl['implied_spread_odds'] = X_rl['SPREAD_LINE'].apply(odds_to_implied_prob)
X_rl['implied_total_odds'] = X_rl['TOTAL_LINE'].apply(odds_to_implied_prob)

# create a value column as the difference between the implied odds and the model odds mean
X_rl['ml_value'] = X_rl['ml_mean'] - X_rl['implied_ml_odds']
X_rl['spread_value'] = X_rl['spread_mean'] - X_rl['implied_spread_odds']
X_rl['total_value'] = X_rl['total_mean'] - X_rl['implied_total_odds']

# flip true and false in ml_result column
X_rl['ml_result'] = X_rl['ml_result'].apply(lambda x: True if x == False else False)


In [10]:
train = X_rl[['GAME-ID', 'DATE', 'TEAM', 'Opponent',
               'ml_mean', 'ml_var', 'MONEYLINE', 'ml_result',
                'spread_mean','spread_var', 'SPREAD_LINE','spread_result',
                'implied_ml_odds', 'implied_spread_odds','ml_value', 'spread_value', 
                'ml_prob1', 'ml_prob2', 'ml_prob3', 'ml_prob4', 'ml_prob5',
                'spread_prob1', 'spread_prob2', 'spread_prob3', 'spread_prob4', 'spread_prob5']].copy(deep=True)

### Single bet env

In [None]:
import pandas as pd
import numpy as np
import random
import logging
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt  # Import for plotting
import wandb  # Import wandb

# Initialize wandb
wandb.init(project='parlays_dqn_agent')

class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)  # Adjust layer sizes as needed
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, output_dim)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def odds_to_implied_prob(odds):
    # Convert American odds to implied probability
    if odds > 0:
        return 100 / (odds + 100)
    else:
        return abs(odds) / (abs(odds) + 100)

def get_state(games_data, bankroll, max_games=15):
    # Initialize state with zeros and pad for games < max_games
    state = np.zeros((max_games, 2, 20))  # [Games, Teams, Features]
    
    game_ids = games_data['GAME-ID'].unique()
    for i, game_id in enumerate(game_ids):
        if i >= max_games:
            break  # Only consider up to max_games
        
        game_df = games_data[games_data['GAME-ID'] == game_id]
        teams = game_df['TEAM'].unique()
        
        for j, team in enumerate(teams):
            team_data = game_df[game_df['TEAM'] == team].iloc[0]
            # Predicted probabilities
            ml_prob = team_data['ml_mean']
            spread_prob = team_data['spread_mean']
            # Offered odds
            ml_odds = team_data['MONEYLINE']
            spread_odds = team_data['SPREAD_LINE']
            # Other features
            prob1 = team_data['ml_prob1']
            prob2 = team_data['ml_prob2']
            prob3 = team_data['ml_prob3']
            prob4 = team_data['ml_prob4']
            prob5 = team_data['ml_prob5']

            spread_prob1 = team_data['spread_prob1']
            spread_prob2 = team_data['spread_prob2']
            spread_prob3 = team_data['spread_prob3']
            spread_prob4 = team_data['spread_prob4']
            spread_prob5 = team_data['spread_prob5']

            implied_ml_odds = team_data['implied_ml_odds']
            implied_spread_odds = team_data['implied_spread_odds']
            ml_value = team_data['ml_value']
            spread_value = team_data['spread_value']

            ml_var = team_data['ml_var']
            spread_var = team_data['spread_var']
            
            # Implied probabilities
            ml_imp_prob = odds_to_implied_prob(ml_odds)
            spread_imp_prob = odds_to_implied_prob(spread_odds)
            # Value indicators
            ml_value = ml_prob - ml_imp_prob
            spread_value = spread_prob - spread_imp_prob
            # Assign features
            state[i, j, :] = [ml_prob, ml_odds, ml_value, spread_prob, spread_odds, spread_value, prob1, prob2, prob3, prob4, prob5,
                              spread_prob1, spread_prob2, spread_prob3, spread_prob4, spread_prob5,
                              implied_ml_odds, implied_spread_odds, ml_var, spread_var]
    
    state = state.flatten()
    state = np.append(state, bankroll / 10000)  # Normalize bankroll
    return state

def generate_actions(games_data, stake_sizes=[0.01, 0.02, 0.03, 0.04, 0.05]):
    actions = []
    game_ids = games_data['GAME-ID'].unique()
    for i, game_id in enumerate(game_ids):
        game_df = games_data[games_data['GAME-ID'] == game_id]
        teams = game_df['TEAM'].unique()
        for team in teams:
            for market in ['ML', 'Spread']:
                for stake in stake_sizes:
                    action = {
                        'game_index': i,
                        'team': team,
                        'market': market,
                        'stake': stake
                    }
                    actions.append(action)
    return actions

class ParlaysDQNAgent:
    def __init__(self, state_size, action_size, batch_size=64, gamma=0.99,
                 epsilon_start=1.0, epsilon_end=0.01, epsilon_decay=0.995,
                 learning_rate=0.001, target_update=10, memory_size=10000,
                 logging_level=logging.INFO):
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon_start  # Exploration rate
        self.epsilon_min = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        self.target_update = target_update  # Episodes between target network updates
        self.memory = deque(maxlen=memory_size)
        self.bankroll = 10000  # Starting bankroll
        
        # Neural networks
        self.policy_net = DQN(state_size, action_size)
        self.target_net = DQN(state_size, action_size)
        self.update_target_net()
        
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
        self.criterion = nn.MSELoss()
        
        # Set up logging
        self.logger = logging.getLogger('ParlaysDQNAgent')
        self.logger.setLevel(logging_level)
        handler = logging.StreamHandler()
        handler.setLevel(logging_level)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        if not self.logger.handlers:
            self.logger.addHandler(handler)
        self.logger.propagate = False  # Prevent duplicate logs
        
    def update_target_net(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())
        
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
    def act(self, state, available_actions):
        if random.uniform(0, 1) < self.epsilon:
            # Exploration: Choose a random action from available actions
            action_idx = random.choice(available_actions)
        else:
            # Exploitation: Choose the best action based on policy_net predictions
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            with torch.no_grad():
                q_values = self.policy_net(state_tensor)
            q_values = q_values.detach().numpy()[0]
            # Mask unavailable actions
            masked_q_values = np.full(self.action_size, -np.inf)
            masked_q_values[available_actions] = q_values[available_actions]
            action_idx = np.argmax(masked_q_values)
        return action_idx
    
    def replay(self):
        if len(self.memory) < self.batch_size:
            return  # Not enough samples to train
        
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions).unsqueeze(1)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        dones = torch.BoolTensor(dones)
        
        # Current Q values
        q_values = self.policy_net(states).gather(1, actions).squeeze()
        
        # Target Q values
        with torch.no_grad():
            next_q_values = self.target_net(next_states).max(1)[0]
        target_q_values = rewards + self.gamma * next_q_values * (~dones)
        
        # Loss
        loss = self.criterion(q_values, target_q_values)
        
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        # Log training loss to wandb
        wandb.log({'training_loss': loss.item()})
        
    def decay_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        
    def save_model(self, filepath):
        torch.save(self.policy_net.state_dict(), filepath)
        
    def load_model(self, filepath):
        self.policy_net.load_state_dict(torch.load(filepath))
        self.update_target_net()
        
    def american_to_decimal_odds(self, odds):
        # Convert American odds to decimal odds
        if odds > 0:
            return (odds / 100) + 1
        else:
            return (100 / abs(odds)) + 1

def calculate_reward(bet_result):
    # Reward is solely based on actual profit or loss
    reward = bet_result
    return reward

def train_agent(agent, df, episodes=1000, max_days_per_episode=None):
    grouped = df.groupby('DATE')
    dates = list(grouped.groups.keys())
    max_games = 15
    
    bankroll_history = []  # List to store bankroll at the end of each episode
    
    # Log hyperparameters to wandb
    wandb.config.update({
        'episodes': episodes,
        'max_days_per_episode': max_days_per_episode,
        'batch_size': agent.batch_size,
        'gamma': agent.gamma,
        'epsilon_start': agent.epsilon,
        'epsilon_end': agent.epsilon_min,
        'epsilon_decay': agent.epsilon_decay,
        'learning_rate': agent.learning_rate,
        'target_update': agent.target_update,
        'memory_size': len(agent.memory),
    })
    
    for episode in range(episodes):
        agent.logger.info(f"Starting episode {episode + 1}/{episodes} with epsilon: {agent.epsilon:.4f}")
        if max_days_per_episode:
            selected_dates = random.sample(dates, min(max_days_per_episode, len(dates)))
        else:
            selected_dates = dates  # Use all dates
        
        # Reset bankroll at the start of each episode
        agent.bankroll = 10000
        
        total_episode_reward = 0  # Track total reward for the episode
        
        for date in selected_dates:
            games_data = grouped.get_group(date)
            unique_game_ids = games_data['GAME-ID'].unique()
            if len(unique_game_ids) < 1:
                continue  # Skip if no games
            
            state = get_state(games_data, agent.bankroll, max_games)
            actions = generate_actions(games_data)
            action_size = len(actions)
            available_actions = list(range(action_size))
            
            # Take action
            action_idx = agent.act(state, available_actions)
            action = actions[action_idx]
            stake_amount = action['stake'] * agent.bankroll
            team = action['team']
            market = action['market']
            game_index = action['game_index']
            
            # Get game data
            game_ids = games_data['GAME-ID'].unique()
            game_id = game_ids[game_index]
            game_df = games_data[games_data['GAME-ID'] == game_id]
            team_data = game_df[game_df['TEAM'] == team].iloc[0]
            
            # Calculate expected value
            if market == 'ML':
                predicted_prob = team_data['ml_mean']
                odds = team_data['MONEYLINE']
                result = team_data['ml_result']
            else:
                predicted_prob = team_data['spread_mean']
                odds = team_data['SPREAD_LINE']
                result = team_data['spread_result']
            
            implied_prob = odds_to_implied_prob(odds)
            value = predicted_prob - implied_prob
            expected_value = value  # Expected value per unit stake
            
            # Calculate bet result
            if result:
                decimal_odds = agent.american_to_decimal_odds(odds)
                bet_result = stake_amount * (decimal_odds - 1)
            else:
                bet_result = -stake_amount
            
            # Update bankroll
            agent.bankroll += bet_result
            
            # Next state
            next_state = get_state(games_data, agent.bankroll, max_games)
            done = False  # In this simplified example, episodes don't have a terminal state
            
            # Calculate reward
            reward = calculate_reward(bet_result)
            total_episode_reward += reward  # Accumulate reward
            
            # Remember and train
            agent.remember(state, action_idx, reward, next_state, done)
            agent.replay()
            
            state = next_state  # Update state for next step
            
            # Log per-action metrics to wandb
            wandb.log({
                'date': date,
                'episode': episode + 1,
                'epsilon': agent.epsilon,
                'bankroll': agent.bankroll,
                'bet_result': bet_result,
                'stake_amount': stake_amount,
                'action_idx': action_idx,
                'reward': reward,
            })
            
        # Decay epsilon
        agent.decay_epsilon()
        
        # Update target network
        if episode % agent.target_update == 0:
            agent.update_target_net()
        
        agent.logger.info(f"Ending episode {episode + 1}/{episodes} with bankroll: {agent.bankroll}\n")
        
        bankroll_history.append(agent.bankroll)  # Record bankroll at the end of the episode
        
        # Log per-episode metrics to wandb
        wandb.log({
            'episode': episode + 1,
            'epsilon': agent.epsilon,
            'episode_reward': total_episode_reward,
            'bankroll': agent.bankroll,
        })
        
    # Save the trained model
    agent.save_model('parlays_dqn_model.pth')
    
    return bankroll_history  # Return the bankroll history for analysis


# Prepare agent
state_size = (15 * 2 * 20) + 1  # 15 games, 2 teams, 4 features per team, plus bankroll
max_games = 15

# Generate actions for all possible games to get consistent action size
actions = []
for _ in range(max_games):
    for team in ['TeamA', 'TeamB']:
        for market in ['ML', 'Spread']:
            for stake in [0.01, 0.02, 0.03, 0.04, 0.05]:
                action = {
                    'game_index': _,
                    'team': team,
                    'market': market,
                    'stake': stake
                }
                actions.append(action)
action_size = len(actions)

agent = ParlaysDQNAgent(
    state_size=state_size,
    action_size=action_size,
    batch_size=64,
    gamma=0.99,
    epsilon_start=1,
    epsilon_end=0.01,
    epsilon_decay=0.995,
    learning_rate=0.001,
    target_update=10,
    memory_size=10000,
    logging_level=logging.INFO
)

# Train agent and get bankroll history
bankroll_history = train_agent(agent, train, episodes=500)

# Initial and final bankrolls
initial_bankroll = 10000  # Starting bankroll
final_bankroll = bankroll_history[-1]
total_profit = final_bankroll - initial_bankroll
roi = (total_profit / initial_bankroll) * 100

print(f"Total Profit: ${total_profit:.2f}")
print(f"ROI: {roi:.2f}%")

# Plot bankroll over episodes
episodes = range(1, len(bankroll_history) + 1)
plt.figure(figsize=(10, 6))
plt.plot(episodes, bankroll_history, marker='o')
plt.title('Agent Bankroll Over Episodes')
plt.xlabel('Episode')
plt.ylabel('Bankroll')
plt.grid(True)
plt.show()

# Finish the wandb run
wandb.finish()


### parlay env

In [29]:


# =============================================
# Data Preparation Functions
# =============================================

def odds_to_implied_prob(odds):
    if odds > 0:
        return 100 / (odds + 100)
    else:
        return abs(odds) / (abs(odds) + 100)

def get_state(games_data, agent_context, max_games=15):
    state_sequence = []
    game_ids = games_data['GAME-ID'].unique()
    for idx, game_id in enumerate(game_ids):
        if idx >= max_games:
            break
        game_df = games_data[games_data['GAME-ID'] == game_id]
        teams = game_df['TEAM'].unique()
        if len(teams) < 2:
            continue
        team_features = []
        for team in teams:
            team_data = game_df[game_df['TEAM'] == team].iloc[0]
            features = [
                team_data['spread_mean'],
                team_data['SPREAD_LINE'],
                team_data['spread_value'],
                team_data['spread_prob1'],
                team_data['spread_prob2'],
                team_data['spread_prob3'],
                team_data['spread_prob4'],
                team_data['spread_prob5'],
                team_data['implied_spread_odds'],
                team_data['spread_var']
            ]
            team_features.append(features)
        if len(team_features) == 2:
            state_sequence.append(team_features)
    agent_features = np.array([
        agent_context['bankroll'] / agent_context['initial_bankroll'],
        #agent_context['parlays_placed'] / agent_context['max_parlays']
    ])
    return state_sequence, agent_features

def generate_valid_parlays(games_data, parlay_size):
    game_picks = []
    game_ids = games_data['GAME-ID'].unique()
    for idx, game_id in enumerate(game_ids):
        game_df = games_data[games_data['GAME-ID'] == game_id]
        teams = game_df['TEAM'].unique()
        if len(teams) < 2:
            continue
        team_a = teams[0]
        team_b = teams[1]
        picks = [
            {'game_index': idx, 'game_id': game_id, 'team': team_a},
            {'game_index': idx, 'game_id': game_id, 'team': team_b}
        ]
        game_picks.append(picks)
    game_combinations = combinations(game_picks, parlay_size)
    all_parlays = []
    for game_combo in game_combinations:
        parlay_combos = list(product(*game_combo))
        all_parlays.extend(parlay_combos)
    return all_parlays


# =============================================
# Action Masking Function
# =============================================

def get_action_mask(env):
    return env.action_mask

#### single bet env

In [37]:


# Set a seed for reproducibility
SEED = 42

def get_state(games_data, agent_context, max_games=15):
    state_sequence = []
    game_ids = games_data['GAME-ID'].unique()
    for idx, game_id in enumerate(game_ids):
        if idx >= max_games:
            break
        game_df = games_data[games_data['GAME-ID'] == game_id]
        teams = game_df['TEAM'].unique()
        if len(teams) < 2:
            continue
        team_features = []
        for team in teams:
            team_data = game_df[game_df['TEAM'] == team].iloc[0]
            features = [
                team_data['spread_mean'],
                team_data['SPREAD_LINE'],
                team_data['spread_value'],
                team_data['spread_prob1'],
                team_data['spread_prob2'],
                team_data['spread_prob3'],
                team_data['spread_prob4'],
                team_data['spread_prob5'],
                team_data['implied_spread_odds'],
                team_data['spread_var']
            ]
            team_features.append(features)
        if len(team_features) == 2:
            state_sequence.append(team_features)
    agent_features = np.array([
        agent_context['bankroll'] / agent_context['initial_bankroll'],
    ])
    return state_sequence, agent_features

class BettingEnv(gym.Env):
    metadata = {'render_modes': ['human']}
    
    def __init__(self, data, max_games=6, stake_per_bet=100, initial_bankroll=10000):
        super(BettingEnv, self).__init__()
        self.data = data
        self.dates = self.data['DATE'].unique()
        self.max_games = max_games
        self.stake_per_bet = stake_per_bet
        self.initial_bankroll = initial_bankroll
        
        # Number of features per team (from get_state function)
        self.num_features_per_team = 5  # As per the get_state function
        self.agent_feature_dim = 1  # Only bankroll ratio as agent feature
        
        # Define action space: Decide whether to bet on each game (binary decision for each game)
        self.action_space = spaces.MultiBinary(self.max_games)
        
        # Define observation space
        self.observation_space = spaces.Dict({
            'state_sequence': spaces.Box(
                low=-np.inf,
                high=np.inf,
                shape=(self.max_games, 2, self.num_features_per_team),
                dtype=np.float32
            ),
            'agent_features': spaces.Box(
                low=-np.inf,
                high=np.inf,
                shape=(self.agent_feature_dim,),  # Adjust if agent_features has more dimensions
                dtype=np.float32
            ),
            'action_mask': spaces.MultiBinary(self.max_games)
        })
        
        # Initialize state variables
        self.current_date_idx = -1
        self.current_date = None
        self.games_data = None
        self.agent_context = None
        self.bets_placed = None
        self.total_stake = 0
        self.done = False
        
        self.seed(SEED)
        self.reset()
    
    def reset(self, seed=None, options=None):
        if seed is not None:
            self.seed(seed)
        
        # Move to the next date
        self.current_date_idx = (self.current_date_idx + 1) % len(self.dates)
        self.current_date = self.dates[self.current_date_idx]
        
        # Get games for the current date
        self.games_data = self.data[self.data['DATE'] == self.current_date].reset_index(drop=True)
        
        # Convert 'GAME-ID's to strings
        self.games_data['GAME-ID'] = self.games_data['GAME-ID'].astype(str)
        
        # Initialize agent context
        self.agent_context = {
            'bankroll': self.initial_bankroll,
            'initial_bankroll': self.initial_bankroll,
        }
        
        self.bets_placed = []
        self.total_stake = 0
        self.done = False
        
        # Prepare state_sequence and agent_features using get_state
        self.state_sequence, self.agent_features = get_state(self.games_data, self.agent_context, max_games=self.max_games)
        
        # Pad state_sequence if less than max_games
        num_games = len(self.state_sequence)
        if num_games < self.max_games:
            padding = [[[0]*self.num_features_per_team for _ in range(2)] for _ in range(self.max_games - num_games)]
            self.state_sequence.extend(padding)
        
        # Prepare action_mask
        self.action_mask = np.ones(self.max_games, dtype=np.int8)
        for idx in range(self.max_games):
            if idx >= num_games:
                self.action_mask[idx] = 0  # No game data for this index
            else:
                # Check bankroll
                if self.agent_context['bankroll'] < self.stake_per_bet:
                    self.action_mask[idx] = 0  # Not enough bankroll to bet
        
        # Prepare observation
        observation = self._get_observation()
        return observation, {}
    
    def step(self, action):
        if self.done:
            raise ValueError("Episode has ended. Please reset the environment.")
        
        # Ensure action is a binary array of length max_games
        if not isinstance(action, np.ndarray):
            action = np.array(action)
        if action.shape != (self.max_games,):
            raise ValueError(f"Action must be of shape ({self.max_games},), got {action.shape}")
        
        # Initialize total profit for this step
        total_profit = 0
        
        # Process each game based on the action
        for idx, bet_decision in enumerate(action):
            if bet_decision == 1 and self.action_mask[idx] == 1:
                if idx >= len(self.games_data):
                    continue  # No game data
                game = self.games_data.iloc[idx]
                # Place the bet
                self.bets_placed.append(game)
                self.agent_context['bankroll'] -= self.stake_per_bet
                self.total_stake += self.stake_per_bet
                
                # Calculate the result of the bet
                profit = self.calculate_bet_reward(game, self.stake_per_bet)
                total_profit += profit
        
        # Update bankroll with profit/loss
        self.agent_context['bankroll'] += total_profit
        
        # Prepare reward
        reward = total_profit
        
        # Prepare next observation
        observation = self._get_observation()
        
        # Set done to True as we simulate one day per episode
        self.done = True
        
        info = {}
        
        return observation, reward, self.done, False, info
    
    def _get_observation(self):
        observation = {
            'state_sequence': np.array(self.state_sequence, dtype=np.float32),
            'agent_features': np.array(self.agent_features, dtype=np.float32),
            'action_mask': self.action_mask
        }
        return observation
    
    def calculate_bet_reward(self, game, stake):
        # Determine the outcome of the bet
        if game['spread_result']:
            odds = game['ODDS']  # Ensure 'ODDS' column exists in your data
            decimal_odds = self.american_to_decimal_odds(odds)
            profit = stake * (decimal_odds - 1)
        else:
            profit = -stake
        return profit
    
    def american_to_decimal_odds(self, odds):
        # Convert American odds to decimal odds
        if odds > 0:
            return (odds / 100) + 1
        else:
            return (100 / abs(odds)) + 1
    
    def render(self, mode='human'):
        if mode == 'human':
            print(f"Date: {self.current_date}")
            print(f"Bankroll: {self.agent_context['bankroll']}")
            print(f"Bets Placed: {len(self.bets_placed)}")
            for bet in self.bets_placed:
                print(f" - Game ID: {bet['GAME-ID']}, Team: {bet['TEAM']}, Spread Result: {bet['spread_result']}")
        else:
            super().render(mode=mode)
    
    def close(self):
        pass
    
    def seed(self, seed=None):
        random.seed(seed)
        np.random.seed(seed)
        # Uncomment the following line if you're using PyTorch
        torch.manual_seed(seed)


In [38]:

# Set a seed for reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Load your dataset
# data = pd.read_csv('your_data.csv')  # Replace with your actual data loading method
# Ensure that 'data' is a pandas DataFrame with necessary columns

# Filter data to have dates with between 4 and 6 games
data = data.groupby('DATE').filter(lambda x: 4 <= len(x) <= 6).reset_index(drop=True)

# Create the environment
env = BettingEnv(data=data, max_games=6, stake_per_bet=100, initial_bankroll=10000)

# Define the action mask function
def get_action_mask(env):
    return env._get_observation()['action_mask']

# Wrap the environment with ActionMasker
env = ActionMasker(env, get_action_mask)

# Define policy_kwargs
policy_kwargs = dict(
    features_extractor_class=CustomPolicyNetwork,
    features_extractor_kwargs=dict(
        embedding_dim=128,
        num_heads=4,
        num_layers=2,
    ),
)

# Determine the device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Training on device: {device}")

# Instantiate the Maskable PPO agent
model = MaskablePPO(
    policy=MaskableMultiInputActorCriticPolicy,
    env=env,
    learning_rate=0.0003,
    policy_kwargs=policy_kwargs,
    verbose=1,
    tensorboard_log="./ppo_betting_tensorboard/",
    seed=SEED,
    device=device
)

# Train the agent
model.learn(total_timesteps=250000)

# Save the trained model
model.save('ppo_betting_agent')


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (6, 2) + inhomogeneous part.

In [26]:
# Test the environment
obs, _ = env.reset()
action_masks = obs['action_mask']

# Generate a random valid action
valid_actions = np.where(action_masks == 1)[0]
random_action = np.zeros(env.max_games, dtype=int)
if len(valid_actions) > 0:
    random_action[valid_actions] = np.random.randint(0, 2, size=len(valid_actions))

# Step through the environment
new_obs, reward, done, _, info = env.step(random_action)

# Test the model's forward pass
model.policy.to(device)
obs_tensor = model.policy.obs_to_tensor(obs)[0]
with torch.no_grad():
    action, _ = model.policy.predict(obs, deterministic=False)


AttributeError: 'ActionMasker' object has no attribute 'max_games'

#### parlays

In [30]:
# =============================================
# Custom Environment Definition
# =============================================

class BettingEnv(gym.Env):
    metadata = {'render_modes': ['human']}

    def __init__(self, data, max_parlays=1, parlay_size=2, max_games=6):
        super(BettingEnv, self).__init__()
        self.data = data
        self.dates = self.data['DATE'].unique()
        self.max_parlays = max_parlays
        self.parlay_size = parlay_size
        self.max_games = max_games

        # Calculate the maximum possible number of actions
        self.max_actions = self.calculate_max_possible_actions(self.max_games, self.parlay_size)
        self.action_space = spaces.Discrete(self.max_actions)
        self.observation_space = spaces.Dict({
            'state_sequence': spaces.Box(
                low=-np.inf,
                high=np.inf,
                shape=(self.max_games, 2, 10),
                dtype=np.float32
            ),
            'agent_features': spaces.Box(
                low=-np.inf,
                high=np.inf,
                shape=(2,),
                dtype=np.float32
            )
            # Removed 'action_mask' from observation_space
        })
        self.current_date_idx = -1
        self.current_date = None
        self.games_data = None
        self.agent_context = None
        self.state_sequence = None
        self.agent_features = None
        self.parlays = None
        self.action_space_size = None
        self.action_mask = None
        self.parlays_placed = []
        self.total_stake = 0
        self.step_count = 0
        self.done = False
        self.max_steps = self.max_parlays
        self.seed(SEED)
        self.reset()

    def calculate_max_possible_actions(self, max_games, parlay_size):
        from math import comb
        num_combinations = comb(max_games, parlay_size)
        num_actions = num_combinations * (2 ** parlay_size)
        return num_actions

    def reset(self, seed=None, options=None):
        if seed is not None:
            self.seed(seed)
        self.current_date_idx = (self.current_date_idx + 1) % len(self.dates)
        self.current_date = self.dates[self.current_date_idx]
        self.games_data = self.data[self.data['DATE'] == self.current_date].reset_index(drop=True)
        self.agent_context = {
            'bankroll': 10000,
            'initial_bankroll': 10000,
            'parlays_placed': 0,
            'max_parlays': self.max_parlays
        }
        self.parlays_placed = []
        self.total_stake = 0
        self.step_count = 0
        self.done = False
        self.state_sequence, self.agent_features = get_state(self.games_data, self.agent_context, self.max_games)
        self.parlays = generate_valid_parlays(self.games_data, self.parlay_size)
        if len(self.parlays) > self.max_actions:
            self.parlays = random.sample(self.parlays, self.max_actions)
        self.action_space_size = len(self.parlays)
        self.action_mask = np.zeros(self.max_actions, dtype=bool)
        self.action_mask[:self.action_space_size] = True
        # Ensure there is at least one valid action
        if not self.action_mask.any():
            # Handle case when there are no valid actions
            raise ValueError("No valid actions available at reset.")
        observation = self._get_observation()
        return observation, {}

    def step(self, action):
        self.step_count += 1
        if 0 <= action < self.action_space_size and self.action_mask[action]:
            selected_parlay = self.parlays[action]
            self.parlays_placed.append(selected_parlay)
            self.agent_context['parlays_placed'] += 1
            stake_per_parlay = 100
            self.total_stake += stake_per_parlay
            self.agent_context['bankroll'] -= stake_per_parlay
            self.agent_features = np.array([
                self.agent_context['bankroll'] / self.agent_context['initial_bankroll'],
                self.agent_context['parlays_placed'] / self.agent_context['max_parlays']
            ])
            self.action_mask[action] = False
            if not self.action_mask.any() and not self.done:
                # Handle the case where no valid actions are left
                self.done = True
                reward = self.compute_reward()
                observation = self._get_observation()
                terminated = self.done
                truncated = False
                return observation, reward, terminated, truncated, {}
            if self.agent_context['parlays_placed'] >= self.max_parlays:
                self.done = True
                reward = self.compute_reward()
            else:
                reward = 0
            observation = self._get_observation()
            terminated = self.done
            truncated = False
            return observation, reward, terminated, truncated, {}
        else:
            reward = -10
            self.done = True
            observation = self._get_observation()
            terminated = self.done
            truncated = False
            return observation, reward, terminated, truncated, {}

    def _get_padded_state_sequence(self):
        state_sequence = self.state_sequence.copy()
        num_games = len(state_sequence)
        if num_games < self.max_games:
            padding = [[[0]*10, [0]*10] for _ in range(self.max_games - num_games)]
            state_sequence.extend(padding)
        return np.array(state_sequence, dtype=np.float32)

    def _get_observation(self):
        observation = {
            'state_sequence': self._get_padded_state_sequence(),
            'agent_features': self.agent_features.astype(np.float32)
            # Removed 'action_mask' from the observation
        }
        return observation

    def render(self):
        pass

    def close(self):
        pass

    def seed(self, seed=None):
        set_random_seed(seed)

    def compute_reward(self):
        parlay_results = []
        for parlay in self.parlays_placed:
            parlay_profit = self.calculate_parlay_reward(parlay, self.games_data, stake=100)
            parlay_results.append(parlay_profit)
        total_profit = sum(parlay_results)
        reward = total_profit
        return reward

    def calculate_parlay_reward(self, parlay, games_data, stake):
        parlay_win = True
        parlay_odds = 1.0
        for pick in parlay:
            game_index = pick['game_index']
            team = pick['team']
            game_id = pick['game_id']
            game_df = games_data[games_data['GAME-ID'] == game_id]
            team_data = game_df[game_df['TEAM'] == team].iloc[0]
            result = team_data['spread_result']
            if not result:
                parlay_win = False
                break
            odds = team_data['SPREAD_LINE']
            decimal_odds = self.american_to_decimal_odds(odds)
            parlay_odds *= decimal_odds
        if parlay_win:
            profit = stake * (parlay_odds - 1)
        else:
            profit = -stake
        return profit

    def american_to_decimal_odds(self, odds):
        if odds > 0:
            return (odds / 100) + 1
        else:
            return (100 / abs(odds)) + 1


In [31]:
# =============================================
# Custom Policy Network Definition
# =============================================

class CustomPolicyNetwork(BaseFeaturesExtractor):
    def __init__(self, observation_space, embedding_dim=128, num_heads=4, num_layers=2):
        self.max_games = observation_space['state_sequence'].shape[0]
        self.output_dim = (self.max_games * embedding_dim) + embedding_dim
        super(CustomPolicyNetwork, self).__init__(observation_space, features_dim=self.output_dim)
        self.team_feature_dim = observation_space['state_sequence'].shape[2]
        self.agent_feature_dim = observation_space['agent_features'].shape[0]
        self.team_embedding = nn.Linear(self.team_feature_dim, embedding_dim)
        self.cross_attention = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.agent_embedding = nn.Linear(self.agent_feature_dim, embedding_dim)

    def forward(self, observations):
        state_sequence = observations['state_sequence']
        agent_features = observations['agent_features']
        if isinstance(state_sequence, np.ndarray):
            state_sequence = torch.tensor(state_sequence, dtype=torch.float32)
        if isinstance(agent_features, np.ndarray):
            agent_features = torch.tensor(agent_features, dtype=torch.float32)
        # Ensure batch dimension
        if state_sequence.dim() == 3:
            state_sequence = state_sequence.unsqueeze(0)
            agent_features = agent_features.unsqueeze(0)
        batch_size = state_sequence.shape[0]
        max_games = state_sequence.shape[1]
        embedding_dim = self.team_embedding.out_features
        game_embeddings = []
        for i in range(max_games):
            team_a_features = state_sequence[:, i, 0, :]
            team_b_features = state_sequence[:, i, 1, :]
            team_a_embedding = self.team_embedding(team_a_features)
            team_b_embedding = self.team_embedding(team_b_features)
            team_a_embedding = team_a_embedding.unsqueeze(0)
            team_b_embedding = team_b_embedding.unsqueeze(0)
            attn_output_a, _ = self.cross_attention(team_a_embedding, team_b_embedding, team_b_embedding)
            attn_output_b, _ = self.cross_attention(team_b_embedding, team_a_embedding, team_a_embedding)
            combined_embedding = (attn_output_a + attn_output_b) / 2
            game_embeddings.append(combined_embedding)
        if game_embeddings:
            game_embeddings = torch.cat(game_embeddings, dim=0)
        else:
            game_embeddings = torch.zeros(1, batch_size, embedding_dim).to(agent_features.device)
        transformer_output = self.transformer_encoder(game_embeddings)
        transformer_output = transformer_output.permute(1, 0, 2).reshape(batch_size, -1)
        agent_embedding = self.agent_embedding(agent_features)
        combined_input = torch.cat([transformer_output, agent_embedding], dim=1)
        return combined_input

In [32]:
# Load your dataset
data = train.groupby('DATE').filter(lambda x: (len(x) >= 4) and (len(x) <= 6))  # Replace with your data file

# Create the environment
env = BettingEnv(data=data, max_parlays=2, parlay_size=2, max_games=6)

# Wrap the environment with ActionMasker
env = ActionMasker(env, get_action_mask)
# Determine the device: 'cuda' if available, else 'cpu'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Training on device: {device}")
# Define the policy_kwargs with your custom policy network
policy_kwargs = dict(
    features_extractor_class=CustomPolicyNetwork,
    features_extractor_kwargs=dict(
        embedding_dim=128,
        num_heads=4,
        num_layers=2,
    ),
)

# Instantiate the Maskable PPO agent
model = MaskablePPO(
    policy=MaskableMultiInputActorCriticPolicy,
    env=env,
    learning_rate=0.0003,
    policy_kwargs=policy_kwargs,
    verbose=1,
    tensorboard_log="./ppo_betting_tensorboard/",
    seed=SEED,
    device=device  # Specify the device here
)

# Train the agent
model.learn(total_timesteps=250000)

# Save the trained model
model.save('ppo_betting_agent')



<class '__main__.BettingEnv'>
Training on device: cuda
Using cuda device
Wrapping the env with a `Monitor` wrapper
<class 'sb3_contrib.common.wrappers.action_masker.ActionMasker'>
Wrapping the env in a DummyVecEnv.




Logging to ./ppo_betting_tensorboard/PPO_33
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 2        |
|    ep_rew_mean     | -28.7    |
| time/              |          |
|    fps             | 27       |
|    iterations      | 1        |
|    time_elapsed    | 74       |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 2          |
|    ep_rew_mean          | -72.4      |
| time/                   |            |
|    fps                  | 22         |
|    iterations           | 2          |
|    time_elapsed         | 179        |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.01130853 |
|    clip_fraction        | 0.0983     |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.68      |
|    explained_variance   | 5.19e-06   |
|

KeyboardInterrupt: 

In [39]:
# =============================================
# Data Preparation Functions
# =============================================

def odds_to_implied_prob(odds):
    if odds > 0:
        return 100 / (odds + 100)
    else:
        return abs(odds) / (abs(odds) + 100)

def get_state(games_data, agent_context, max_games=15):
    state_sequence = []
    teams_list = []
    game_ids = games_data['GAME-ID'].unique()
    for idx, game_id in enumerate(game_ids):
        if idx >= max_games:
            break
        game_df = games_data[games_data['GAME-ID'] == game_id]
        teams = game_df['TEAM'].unique()
        if len(teams) < 2:
            continue
        team_features = []
        for team in teams:
            team_data = game_df[game_df['TEAM'] == team].iloc[0]
            features = [
                team_data['spread_mean'],
                team_data['SPREAD_LINE'],
                team_data['spread_value'],
                # team_data['spread_prob1'],
                # team_data['spread_prob2'],
                # team_data['spread_prob3'],
                # team_data['spread_prob4'],
                # team_data['spread_prob5'],
                team_data['implied_spread_odds'],
                team_data['spread_var']
            ]
            team_features.append(features)
            teams_list.append(team)
        if len(team_features) == 2:
            state_sequence.append(team_features)
    agent_features = np.array([
        agent_context['bankroll'] / agent_context['initial_bankroll'],
    ])
    return state_sequence, agent_features, teams_list

# =============================================
# Custom Environment Definition
# =============================================

class BettingEnv(gym.Env):
    metadata = {'render_modes': ['human']}

    def __init__(self, data, max_games=15):
        super(BettingEnv, self).__init__()
        self.data = data
        self.dates = self.data['DATE'].unique()
        self.max_games = max_games
        self.max_teams = self.max_games * 2

        # Define the action space
        self.action_space = spaces.MultiBinary(self.max_teams)
        self.observation_space = spaces.Dict({
            'state_sequence': spaces.Box(
                low=-np.inf,
                high=np.inf,
                shape=(self.max_games, 2, 5),
                dtype=np.float32
            ),
            'agent_features': spaces.Box(
                low=-np.inf,
                high=np.inf,
                shape=(1,),
                dtype=np.float32
            )
        })
        self.current_date_idx = -1
        self.current_date = None
        self.games_data = None
        self.agent_context = None
        self.state_sequence = None
        self.agent_features = None
        self.teams = None
        self.num_teams = None
        self.total_stake = 0
        self.step_count = 0
        self.done = False
        self.seed(SEED)
        self.reset()

    def reset(self, seed=None, options=None):
        if seed is not None:
            self.seed(seed)
        self.current_date_idx = (self.current_date_idx + 1) % len(self.dates)
        self.current_date = self.dates[self.current_date_idx]
        self.games_data = self.data[self.data['DATE'] == self.current_date].reset_index(drop=True)
        self.agent_context = {
            'bankroll': 10000,
            'initial_bankroll': 10000
        }
        self.total_stake = 0
        self.step_count = 0
        self.done = False
        self.state_sequence, self.agent_features, self.teams = get_state(self.games_data, self.agent_context, self.max_games)
        self.num_teams = len(self.teams)
        # Pad state_sequence to self.max_games
        num_games = len(self.state_sequence)
        if num_games < self.max_games:
            padding = [[[0]*5, [0]*5] for _ in range(self.max_games - num_games)]
            self.state_sequence.extend(padding)
        self.state_sequence = np.array(self.state_sequence, dtype=np.float32)
        # Pad self.teams to self.max_teams
        if self.num_teams < self.max_teams:
            padding = [None] * (self.max_teams - self.num_teams)
            self.teams.extend(padding)
        observation = self._get_observation()
        return observation, {}

    def step(self, action):
        self.step_count += 1
        reward = 0
        stake_per_bet = 100
        for idx in range(self.num_teams):
            team = self.teams[idx]
            if team is None:
                continue  # Skip padding teams
            if action[idx] == 1:
                team_data = self.games_data[self.games_data['TEAM'] == team].iloc[0]
                odds = team_data['SPREAD_LINE']
                decimal_odds = self.american_to_decimal_odds(odds)
                result = team_data['spread_result']
                self.agent_context['bankroll'] -= stake_per_bet  # Deduct the stake
                if result:
                    profit = stake_per_bet * (decimal_odds - 1)
                else:
                    profit = -stake_per_bet
                reward += profit
                self.agent_context['bankroll'] += stake_per_bet + profit  # Update bankroll
        self.agent_features = np.array([
            self.agent_context['bankroll'] / self.agent_context['initial_bankroll'],
        ])
        self.done = True  # Episode ends after one step (one day's slate)
        observation = self._get_observation()
        terminated = self.done
        truncated = False
        return observation, reward, terminated, truncated, {}

    def _get_observation(self):
        observation = {
            'state_sequence': self.state_sequence,
            'agent_features': self.agent_features.astype(np.float32)
        }
        return observation

    def render(self):
        pass

    def close(self):
        pass

    def seed(self, seed=None):
        set_random_seed(seed)

    def american_to_decimal_odds(self, odds):
        if odds > 0:
            return (odds / 100) + 1
        else:
            return (100 / abs(odds)) + 1

# =============================================
# Custom Policy Network Definition
# =============================================

class CustomPolicyNetwork(BaseFeaturesExtractor):
    def __init__(self, observation_space, embedding_dim=128, num_heads=4, num_layers=2):
        self.max_games = observation_space['state_sequence'].shape[0]
        self.max_teams = self.max_games * 2
        self.output_dim = (self.max_teams * embedding_dim) + embedding_dim
        super(CustomPolicyNetwork, self).__init__(observation_space, features_dim=self.output_dim)
        self.team_feature_dim = observation_space['state_sequence'].shape[2]
        self.agent_feature_dim = observation_space['agent_features'].shape[0]
        self.team_embedding = nn.Linear(self.team_feature_dim, embedding_dim)
        self.cross_attention = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.agent_embedding = nn.Linear(self.agent_feature_dim, embedding_dim)

    def forward(self, observations):
        state_sequence = observations['state_sequence']  # Shape: (batch_size, max_games, 2, num_features)
        agent_features = observations['agent_features']  # Shape: (batch_size, agent_feature_dim)
        if isinstance(state_sequence, np.ndarray):
            state_sequence = torch.tensor(state_sequence, dtype=torch.float32)
        if isinstance(agent_features, np.ndarray):
            agent_features = torch.tensor(agent_features, dtype=torch.float32)
        # Ensure batch dimension
        if state_sequence.dim() == 3:
            state_sequence = state_sequence.unsqueeze(0)
            agent_features = agent_features.unsqueeze(0)
        batch_size = state_sequence.shape[0]
        per_team_embeddings = []
        num_games = state_sequence.shape[1]
        for i in range(num_games):
            team_a_features = state_sequence[:, i, 0, :]  # Shape: (batch_size, num_features)
            team_b_features = state_sequence[:, i, 1, :]  # Shape: (batch_size, num_features)
            if torch.all(team_a_features == 0) and torch.all(team_b_features == 0):
                continue  # Skip padding games
            team_a_embedding = self.team_embedding(team_a_features)  # Shape: (batch_size, embedding_dim)
            team_b_embedding = self.team_embedding(team_b_features)
            # Reshape for multihead attention
            team_a_embedding = team_a_embedding.unsqueeze(0)  # Shape: (1, batch_size, embedding_dim)
            team_b_embedding = team_b_embedding.unsqueeze(0)
            # Cross attention
            attn_output_a, _ = self.cross_attention(team_a_embedding, team_b_embedding, team_b_embedding)
            attn_output_b, _ = self.cross_attention(team_b_embedding, team_a_embedding, team_a_embedding)
            per_team_embeddings.append(attn_output_a.squeeze(0))  # Shape: (batch_size, embedding_dim)
            per_team_embeddings.append(attn_output_b.squeeze(0))
        # Stack per_team_embeddings
        if per_team_embeddings:
            per_team_embeddings = torch.stack(per_team_embeddings, dim=1)  # Shape: (batch_size, num_teams, embedding_dim)
        else:
            # Handle case when no games
            per_team_embeddings = torch.zeros(batch_size, 0, self.team_embedding.out_features).to(agent_features.device)
        # Pad per_team_embeddings to self.max_teams
        num_teams = per_team_embeddings.shape[1]
        if num_teams < self.max_teams:
            padding = torch.zeros(batch_size, self.max_teams - num_teams, self.team_embedding.out_features).to(agent_features.device)
            per_team_embeddings = torch.cat([per_team_embeddings, padding], dim=1)
        # Transformer expects input of shape (seq_len, batch_size, embedding_dim)
        transformer_input = per_team_embeddings.permute(1, 0, 2)  # Shape: (self.max_teams, batch_size, embedding_dim)
        transformer_output = self.transformer_encoder(transformer_input)
        transformer_output = transformer_output.permute(1, 0, 2).reshape(batch_size, -1)  # Shape: (batch_size, self.max_teams * embedding_dim)
        agent_embedding = self.agent_embedding(agent_features)
        combined_input = torch.cat([transformer_output, agent_embedding], dim=1)
        return combined_input

# Load your dataset
data = train.groupby('DATE').filter(lambda x: (len(x) >= 4) and (len(x) <= 15))  # Adjusted to 15 games

# Create the environment
env = BettingEnv(data=data, max_games=15)

# Determine the device: 'cuda' if available, else 'cpu'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Training on device: {device}")

# Define the policy_kwargs with your custom policy network
policy_kwargs = dict(
    features_extractor_class=CustomPolicyNetwork,
    features_extractor_kwargs=dict(
        embedding_dim=128,
        num_heads=4,
        num_layers=2,
    ),
)

from stable_baselines3.common.policies import MultiInputActorCriticPolicy

# Instantiate the PPO agent
model = PPO(
    policy=MultiInputActorCriticPolicy,
    env=env,
    learning_rate=0.0003,
    policy_kwargs=policy_kwargs,
    verbose=1,
    tensorboard_log="./ppo_betting_tensorboard/",
    seed=SEED,
    device=device  # Specify the device here
)

# Train the agent
model.learn(total_timesteps=250000)

# Save the trained model
model.save('ppo_betting_agent')


Training on device: cuda
Using cuda device
Wrapping the env with a `Monitor` wrapper
<class '__main__.BettingEnv'>
Wrapping the env in a DummyVecEnv.




Logging to ./ppo_betting_tensorboard/PPO_35
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -47.5    |
| time/              |          |
|    fps             | 15       |
|    iterations      | 1        |
|    time_elapsed    | 133      |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | 14.8        |
| time/                   |             |
|    fps                  | 13          |
|    iterations           | 2           |
|    time_elapsed         | 299         |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.017979976 |
|    clip_fraction        | 0.173       |
|    clip_range           | 0.2         |
|    entropy_loss         | -20.8       |
|    explained_variance   | 

KeyboardInterrupt: 

In [None]:
# =============================================
# Evaluating the Trained Agent
# =============================================

obs, _ = env.reset()
done = False
total_reward = 0
while not done:
    action_masks = get_action_mask(env)
    action, _states = model.predict(obs, action_masks=action_masks)
    obs, reward, terminated, truncated, _ = env.step(action)
    total_reward += reward
    done = terminated or truncated
print("Total reward from the episode:", total_reward)

In [40]:
# =============================================
# Data Preparation Functions
# =============================================

def odds_to_implied_prob(odds):
    if odds > 0:
        return 100 / (odds + 100)
    else:
        return abs(odds) / (abs(odds) + 100)

def get_state(games_data, agent_context, max_games=15):
    state_sequence = []
    teams_list = []
    game_ids = games_data['GAME-ID'].unique()
    for idx, game_id in enumerate(game_ids):
        if idx >= max_games:
            break
        game_df = games_data[games_data['GAME-ID'] == game_id]
        teams = game_df['TEAM'].unique()
        if len(teams) < 2:
            continue
        team_features = []
        for team in teams:
            team_data = game_df[game_df['TEAM'] == team].iloc[0]
            features = [
                team_data['spread_mean'],
                team_data['SPREAD_LINE'],
                team_data['spread_value'],
                team_data['spread_prob1'],
                team_data['spread_prob2'],
                team_data['spread_prob3'],
                team_data['spread_prob4'],
                team_data['spread_prob5'],
                team_data['implied_spread_odds'],
                team_data['spread_var']
            ]
            team_features.append(features)
            teams_list.append(team)
        if len(team_features) == 2:
            state_sequence.append(team_features)
    agent_features = np.array([
        agent_context['bankroll'] / agent_context['initial_bankroll'],
    ])
    return state_sequence, agent_features, teams_list

# =============================================
# Custom Environment Definition
# =============================================

class BettingEnv(gym.Env):
    metadata = {'render_modes': ['human']}

    def __init__(self, data, max_games=15, K=0.05):
        super(BettingEnv, self).__init__()
        self.data = data
        self.dates = self.data['DATE'].unique()
        self.max_games = max_games
        self.max_teams = self.max_games * 2
        self.K = K  # Threshold for minimum bet size

        # Define the action space: continuous values between 0 and 1
        self.action_space = spaces.Box(low=0.0, high=1.0, shape=(self.max_teams,), dtype=np.float32)
        self.observation_space = spaces.Dict({
            'state_sequence': spaces.Box(
                low=-np.inf,
                high=np.inf,
                shape=(self.max_games, 2, 10),
                dtype=np.float32
            ),
            'agent_features': spaces.Box(
                low=-np.inf,
                high=np.inf,
                shape=(1,),
                dtype=np.float32
            )
        })
        self.current_date_idx = -1
        self.current_date = None
        self.games_data = None
        self.agent_context = None
        self.state_sequence = None
        self.agent_features = None
        self.teams = None
        self.num_teams = None
        self.total_stake = 0
        self.step_count = 0
        self.done = False
        self.seed(SEED)
        self.reset()

    def reset(self, seed=None, options=None):
        if seed is not None:
            self.seed(seed)
        self.current_date_idx = (self.current_date_idx + 1) % len(self.dates)
        self.current_date = self.dates[self.current_date_idx]
        self.games_data = self.data[self.data['DATE'] == self.current_date].reset_index(drop=True)
        self.agent_context = {
            'bankroll': 10000,
            'initial_bankroll': 10000
        }
        self.total_stake = 0
        self.step_count = 0
        self.done = False
        self.state_sequence, self.agent_features, self.teams = get_state(self.games_data, self.agent_context, self.max_games)
        self.num_teams = len(self.teams)
        # Pad state_sequence to self.max_games
        num_games = len(self.state_sequence)
        if num_games < self.max_games:
            padding = [[[0]*10, [0]*10] for _ in range(self.max_games - num_games)]
            self.state_sequence.extend(padding)
        self.state_sequence = np.array(self.state_sequence, dtype=np.float32)
        # Pad self.teams to self.max_teams
        if self.num_teams < self.max_teams:
            padding = [None] * (self.max_teams - self.num_teams)
            self.teams.extend(padding)
        observation = self._get_observation()
        return observation, {}

    def step(self, action):
        self.step_count += 1
        reward = 0
        unit_stake = 100  # Define a unit stake
        for idx in range(self.num_teams):
            team = self.teams[idx]
            if team is None:
                continue  # Skip padding teams
            action_value = action[idx]
            if action_value < self.K:
                continue  # Bet size is zero, skip
            bet_size = action_value * unit_stake  # Calculate bet size
            team_data = self.games_data[self.games_data['TEAM'] == team].iloc[0]
            odds = team_data['SPREAD_LINE']
            decimal_odds = self.american_to_decimal_odds(odds)
            result = team_data['spread_result']
            self.agent_context['bankroll'] -= bet_size  # Deduct the stake
            if result:
                profit = bet_size * (decimal_odds - 1)
            else:
                profit = -bet_size
            reward += profit
            self.agent_context['bankroll'] += bet_size + profit  # Update bankroll
        self.agent_features = np.array([
            self.agent_context['bankroll'] / self.agent_context['initial_bankroll'],
        ])
        self.done = True  # Episode ends after one step (one day's slate)
        observation = self._get_observation()
        terminated = self.done
        truncated = False
        return observation, reward, terminated, truncated, {}

    def _get_observation(self):
        observation = {
            'state_sequence': self.state_sequence,
            'agent_features': self.agent_features.astype(np.float32)
        }
        return observation

    def render(self):
        pass

    def close(self):
        pass

    def seed(self, seed=None):
        set_random_seed(seed)

    def american_to_decimal_odds(self, odds):
        if odds > 0:
            return (odds / 100) + 1
        else:
            return (100 / abs(odds)) + 1

# =============================================
# Custom Policy Network Definition
# =============================================

class CustomPolicyNetwork(BaseFeaturesExtractor):
    def __init__(self, observation_space, embedding_dim=128, num_heads=4, num_layers=2):
        self.max_games = observation_space['state_sequence'].shape[0]
        self.max_teams = self.max_games * 2
        super(CustomPolicyNetwork, self).__init__(observation_space, features_dim=1)  # features_dim will be updated later
        self.team_feature_dim = observation_space['state_sequence'].shape[2]
        self.agent_feature_dim = observation_space['agent_features'].shape[0]
        self.team_embedding = nn.Linear(self.team_feature_dim, embedding_dim)
        self.cross_attention = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.agent_embedding = nn.Linear(self.agent_feature_dim, embedding_dim)
        # Output layer for action probabilities
        self.output_layer = nn.Linear(self.max_teams * embedding_dim + embedding_dim, self.max_teams)
        # Update features_dim
        self._features_dim = self.max_teams

    def forward(self, observations):
        state_sequence = observations['state_sequence']  # Shape: (batch_size, max_games, 2, num_features)
        agent_features = observations['agent_features']  # Shape: (batch_size, agent_feature_dim)
        if isinstance(state_sequence, np.ndarray):
            state_sequence = torch.tensor(state_sequence, dtype=torch.float32)
        if isinstance(agent_features, np.ndarray):
            agent_features = torch.tensor(agent_features, dtype=torch.float32)
        # Ensure batch dimension
        if state_sequence.dim() == 3:
            state_sequence = state_sequence.unsqueeze(0)
            agent_features = agent_features.unsqueeze(0)
        batch_size = state_sequence.shape[0]
        per_team_embeddings = []
        num_games = state_sequence.shape[1]
        for i in range(num_games):
            team_a_features = state_sequence[:, i, 0, :]  # Shape: (batch_size, num_features)
            team_b_features = state_sequence[:, i, 1, :]  # Shape: (batch_size, num_features)
            if torch.all(team_a_features == 0) and torch.all(team_b_features == 0):
                continue  # Skip padding games
            team_a_embedding = self.team_embedding(team_a_features)  # Shape: (batch_size, embedding_dim)
            team_b_embedding = self.team_embedding(team_b_features)
            # Reshape for multihead attention
            team_a_embedding = team_a_embedding.unsqueeze(0)  # Shape: (1, batch_size, embedding_dim)
            team_b_embedding = team_b_embedding.unsqueeze(0)
            # Cross attention
            attn_output_a, _ = self.cross_attention(team_a_embedding, team_b_embedding, team_b_embedding)
            attn_output_b, _ = self.cross_attention(team_b_embedding, team_a_embedding, team_a_embedding)
            per_team_embeddings.append(attn_output_a.squeeze(0))  # Shape: (batch_size, embedding_dim)
            per_team_embeddings.append(attn_output_b.squeeze(0))
        # Stack per_team_embeddings
        if per_team_embeddings:
            per_team_embeddings = torch.stack(per_team_embeddings, dim=1)  # Shape: (batch_size, num_teams, embedding_dim)
        else:
            # Handle case when no games
            per_team_embeddings = torch.zeros(batch_size, 0, self.team_embedding.out_features).to(agent_features.device)
        # Pad per_team_embeddings to self.max_teams
        num_teams = per_team_embeddings.shape[1]
        if num_teams < self.max_teams:
            padding = torch.zeros(batch_size, self.max_teams - num_teams, self.team_embedding.out_features).to(agent_features.device)
            per_team_embeddings = torch.cat([per_team_embeddings, padding], dim=1)
        # Transformer expects input of shape (seq_len, batch_size, embedding_dim)
        transformer_input = per_team_embeddings.permute(1, 0, 2)  # Shape: (self.max_teams, batch_size, embedding_dim)
        transformer_output = self.transformer_encoder(transformer_input)
        transformer_output = transformer_output.permute(1, 0, 2).reshape(batch_size, -1)  # Shape: (batch_size, self.max_teams * embedding_dim)
        agent_embedding = self.agent_embedding(agent_features)
        combined_input = torch.cat([transformer_output, agent_embedding], dim=1)  # Shape: (batch_size, total_embedding_dim)
        # Output layer to get action values
        action_values = self.output_layer(combined_input)  # Shape: (batch_size, self.max_teams)
        # Apply sigmoid to get values between 0 and 1
        action_probs = torch.sigmoid(action_values)
        return action_probs  # Shape: (batch_size, self.max_teams)

# Load your dataset
data = train.groupby('DATE').filter(lambda x: (len(x) >= 4) and (len(x) <= 15))  # Adjusted to 15 games

# Create the environment
env = BettingEnv(data=data, max_games=15, K=0.05)  # Set threshold K as desired

# Determine the device: 'cuda' if available, else 'cpu'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Training on device: {device}")

# Define the policy_kwargs with your custom policy network
policy_kwargs = dict(
    features_extractor_class=CustomPolicyNetwork,
    features_extractor_kwargs=dict(
        embedding_dim=128,
        num_heads=4,
        num_layers=2,
    ),
    # Set net_arch to an empty list because the output is handled in the features extractor
    net_arch=[],
)

# Import PPO
from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy, MultiInputActorCriticPolicy

# Custom ActorCriticPolicy to use our CustomPolicyNetwork
class CustomActorCriticPolicy(MultiInputActorCriticPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomActorCriticPolicy, self).__init__(*args, **kwargs)

    def _build_mlp_extractor(self):
        # The policy network output is already the action probabilities
        pass  # No need to build additional networks

# Instantiate the PPO agent
model = PPO(
    policy=CustomActorCriticPolicy,
    env=env,
    learning_rate=0.0003,
    policy_kwargs=policy_kwargs,
    verbose=1,
    tensorboard_log="./ppo_betting_tensorboard/",
    seed=SEED,
    device=device  # Specify the device here
)

# Train the agent
model.learn(total_timesteps=250000)

# Save the trained model
model.save('ppo_betting_agent')


Training on device: cuda
Using cuda device
Wrapping the env with a `Monitor` wrapper
<class '__main__.BettingEnv'>
Wrapping the env in a DummyVecEnv.


AttributeError: 'CustomActorCriticPolicy' object has no attribute 'mlp_extractor'