In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("oddsData.csv")
df

Unnamed: 0,date,season,team,home/visitor,opponent,score,opponentScore,moneyLine,opponentMoneyLine,total,spread,secondHalfTotal
0,2007-10-30,2008,Utah,@,Golden State,117,96,100.0,-120.0,212.0,1.0,105.5
1,2007-10-30,2008,LA Lakers,vs,Houston,93,95,190.0,-230.0,199.0,5.0,99.0
2,2007-10-30,2008,Houston,@,LA Lakers,95,93,-230.0,190.0,199.0,-5.0,99.0
3,2007-10-30,2008,San Antonio,vs,Portland,106,97,-1400.0,900.0,189.5,-13.0,95.0
4,2007-10-30,2008,Portland,@,San Antonio,97,106,900.0,-1400.0,189.5,13.0,95.0
...,...,...,...,...,...,...,...,...,...,...,...,...
37099,2023-01-16,2023,Toronto,@,New York,123,121,100.0,-120.0,218.5,1.5,106.5
37100,2023-01-16,2023,Memphis,vs,Phoenix,136,106,-800.0,550.0,231.0,-12.5,118.0
37101,2023-01-16,2023,New York,vs,Toronto,121,123,-120.0,100.0,218.5,-1.5,106.5
37102,2023-01-16,2023,Minnesota,vs,Utah,125,126,-175.0,155.0,232.0,-4.0,119.0


In [3]:
df['win'] = df['score'] < df['opponentScore']

In [4]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = []
def label_encode(column, targets=None):
    if targets == None:
        targets = [column]

    encoder = LabelEncoder()
    encoder.fit(df[column])

    for target in targets:
        df[target] = encoder.transform(df[target])

    categorical_cols.extend(targets)

label_encode('team', ['team', 'opponent'])
label_encode('home/visitor')
label_encode('win')

df

Unnamed: 0,date,season,team,home/visitor,opponent,score,opponentScore,moneyLine,opponentMoneyLine,total,spread,secondHalfTotal,win
0,2007-10-30,2008,30,0,9,117,96,100.0,-120.0,212.0,1.0,105.5,0
1,2007-10-30,2008,13,1,10,93,95,190.0,-230.0,199.0,5.0,99.0,1
2,2007-10-30,2008,10,0,13,95,93,-230.0,190.0,199.0,-5.0,99.0,0
3,2007-10-30,2008,27,1,25,106,97,-1400.0,900.0,189.5,-13.0,95.0,0
4,2007-10-30,2008,25,0,27,97,106,900.0,-1400.0,189.5,13.0,95.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37099,2023-01-16,2023,29,0,20,123,121,100.0,-120.0,218.5,1.5,106.5,0
37100,2023-01-16,2023,14,1,24,136,106,-800.0,550.0,231.0,-12.5,118.0,0
37101,2023-01-16,2023,20,1,29,121,123,-120.0,100.0,218.5,-1.5,106.5,1
37102,2023-01-16,2023,17,1,30,125,126,-175.0,155.0,232.0,-4.0,119.0,1


In [5]:
# change date column to numpy.dattetime64 object
df['date'] = pd.to_datetime(df['date'])

# split datetime object into year, month, day, day of week columns
# Monday = 0, Sunday = 6
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['day_of_week'] = df['date'].dt.dayofweek
df

Unnamed: 0,date,season,team,home/visitor,opponent,score,opponentScore,moneyLine,opponentMoneyLine,total,spread,secondHalfTotal,win,year,month,day,day_of_week
0,2007-10-30,2008,30,0,9,117,96,100.0,-120.0,212.0,1.0,105.5,0,2007,10,30,1
1,2007-10-30,2008,13,1,10,93,95,190.0,-230.0,199.0,5.0,99.0,1,2007,10,30,1
2,2007-10-30,2008,10,0,13,95,93,-230.0,190.0,199.0,-5.0,99.0,0,2007,10,30,1
3,2007-10-30,2008,27,1,25,106,97,-1400.0,900.0,189.5,-13.0,95.0,0,2007,10,30,1
4,2007-10-30,2008,25,0,27,97,106,900.0,-1400.0,189.5,13.0,95.0,1,2007,10,30,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37099,2023-01-16,2023,29,0,20,123,121,100.0,-120.0,218.5,1.5,106.5,0,2023,1,16,0
37100,2023-01-16,2023,14,1,24,136,106,-800.0,550.0,231.0,-12.5,118.0,0,2023,1,16,0
37101,2023-01-16,2023,20,1,29,121,123,-120.0,100.0,218.5,-1.5,106.5,1,2023,1,16,0
37102,2023-01-16,2023,17,1,30,125,126,-175.0,155.0,232.0,-4.0,119.0,1,2023,1,16,0


In [6]:
# calculate win rate of the team considering all games before the current game in this season
df['games_played'] = df.groupby(['team', 'season']).cumcount()
df['cumulative_wins'] = df.groupby(['team', 'season'])['win'].cumsum() - df['win']
df['cumulative_win_rate'] = df['cumulative_wins'] / df['games_played']
df['cumulative_win_rate'] = df['cumulative_win_rate'].fillna(0)  # Handle first game of the season
df

Unnamed: 0,date,season,team,home/visitor,opponent,score,opponentScore,moneyLine,opponentMoneyLine,total,spread,secondHalfTotal,win,year,month,day,day_of_week,games_played,cumulative_wins,cumulative_win_rate
0,2007-10-30,2008,30,0,9,117,96,100.0,-120.0,212.0,1.0,105.5,0,2007,10,30,1,0,0,0.000000
1,2007-10-30,2008,13,1,10,93,95,190.0,-230.0,199.0,5.0,99.0,1,2007,10,30,1,0,0,0.000000
2,2007-10-30,2008,10,0,13,95,93,-230.0,190.0,199.0,-5.0,99.0,0,2007,10,30,1,0,0,0.000000
3,2007-10-30,2008,27,1,25,106,97,-1400.0,900.0,189.5,-13.0,95.0,0,2007,10,30,1,0,0,0.000000
4,2007-10-30,2008,25,0,27,97,106,900.0,-1400.0,189.5,13.0,95.0,1,2007,10,30,1,0,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37099,2023-01-16,2023,29,0,20,123,121,100.0,-120.0,218.5,1.5,106.5,0,2023,1,16,0,43,24,0.558140
37100,2023-01-16,2023,14,1,24,136,106,-800.0,550.0,231.0,-12.5,118.0,0,2023,1,16,0,42,13,0.309524
37101,2023-01-16,2023,20,1,29,121,123,-120.0,100.0,218.5,-1.5,106.5,1,2023,1,16,0,44,19,0.431818
37102,2023-01-16,2023,17,1,30,125,126,-175.0,155.0,232.0,-4.0,119.0,1,2023,1,16,0,44,22,0.500000


In [7]:
# calculate average score of the team considering all games before the current game in this season
df['cumulative_score'] = df.groupby(['team', 'season'])['score'].cumsum() - df['score']
df['average_score'] = df['cumulative_score'] / df['games_played']
df['average_score'] = df['average_score'].fillna(0)  # Handle first game of the season
df

Unnamed: 0,date,season,team,home/visitor,opponent,score,opponentScore,moneyLine,opponentMoneyLine,total,...,win,year,month,day,day_of_week,games_played,cumulative_wins,cumulative_win_rate,cumulative_score,average_score
0,2007-10-30,2008,30,0,9,117,96,100.0,-120.0,212.0,...,0,2007,10,30,1,0,0,0.000000,0,0.000000
1,2007-10-30,2008,13,1,10,93,95,190.0,-230.0,199.0,...,1,2007,10,30,1,0,0,0.000000,0,0.000000
2,2007-10-30,2008,10,0,13,95,93,-230.0,190.0,199.0,...,0,2007,10,30,1,0,0,0.000000,0,0.000000
3,2007-10-30,2008,27,1,25,106,97,-1400.0,900.0,189.5,...,0,2007,10,30,1,0,0,0.000000,0,0.000000
4,2007-10-30,2008,25,0,27,97,106,900.0,-1400.0,189.5,...,1,2007,10,30,1,0,0,0.000000,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37099,2023-01-16,2023,29,0,20,123,121,100.0,-120.0,218.5,...,0,2023,1,16,0,43,24,0.558140,4806,111.767442
37100,2023-01-16,2023,14,1,24,136,106,-800.0,550.0,231.0,...,0,2023,1,16,0,42,13,0.309524,4919,117.119048
37101,2023-01-16,2023,20,1,29,121,123,-120.0,100.0,218.5,...,1,2023,1,16,0,44,19,0.431818,5021,114.113636
37102,2023-01-16,2023,17,1,30,125,126,-175.0,155.0,232.0,...,1,2023,1,16,0,44,22,0.500000,5051,114.795455


In [8]:
# calculate win rate of the team versus the opponent considering all games versus this opponent before the current game in this season
# is this necessary?
df['opponent_games_played'] = df.groupby(['team', 'season', 'opponent']).cumcount()
df['opponent_cumulative_wins'] = df.groupby(['team', 'season', 'opponent'])['win'].cumsum() - df['win']
df['opponent_win_rate'] = df['opponent_cumulative_wins'] / df['opponent_games_played']
df['opponent_win_rate'] = df['opponent_win_rate'].fillna(0)
df

Unnamed: 0,date,season,team,home/visitor,opponent,score,opponentScore,moneyLine,opponentMoneyLine,total,...,day,day_of_week,games_played,cumulative_wins,cumulative_win_rate,cumulative_score,average_score,opponent_games_played,opponent_cumulative_wins,opponent_win_rate
0,2007-10-30,2008,30,0,9,117,96,100.0,-120.0,212.0,...,30,1,0,0,0.000000,0,0.000000,0,0,0.0
1,2007-10-30,2008,13,1,10,93,95,190.0,-230.0,199.0,...,30,1,0,0,0.000000,0,0.000000,0,0,0.0
2,2007-10-30,2008,10,0,13,95,93,-230.0,190.0,199.0,...,30,1,0,0,0.000000,0,0.000000,0,0,0.0
3,2007-10-30,2008,27,1,25,106,97,-1400.0,900.0,189.5,...,30,1,0,0,0.000000,0,0.000000,0,0,0.0
4,2007-10-30,2008,25,0,27,97,106,900.0,-1400.0,189.5,...,30,1,0,0,0.000000,0,0.000000,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37099,2023-01-16,2023,29,0,20,123,121,100.0,-120.0,218.5,...,16,0,43,24,0.558140,4806,111.767442,2,1,0.5
37100,2023-01-16,2023,14,1,24,136,106,-800.0,550.0,231.0,...,16,0,42,13,0.309524,4919,117.119048,2,1,0.5
37101,2023-01-16,2023,20,1,29,121,123,-120.0,100.0,218.5,...,16,0,44,19,0.431818,5021,114.113636,2,1,0.5
37102,2023-01-16,2023,17,1,30,125,126,-175.0,155.0,232.0,...,16,0,44,22,0.500000,5051,114.795455,2,1,0.5


In [9]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def standardize(column):
    df[column] -= df[column].mean()
    df[column] /= df[column].std()

def normalize(column):
    df[column] -= df[column].min()
    df[column] /= df[column].max()

In [10]:
from scipy.stats import shapiro
numeric_cols = df.select_dtypes(include=[np.number]).columns

for col in numeric_cols:
    result = shapiro(df[col].sample(1000, random_state=42))
    stat, p_val = result.statistic, result.pvalue
    print(f'Variable {col}, Statistic: {stat}, p-value: {p_val}')
    if (p_val < 0.05):
        normalize(col)
        print(f'{col} is not normally distributed')
    else:
        standardize(col)
        print(f'{col} is normally distributed')

    print()

df

Variable season, Statistic: 0.9489112496376038, p-value: 3.997433855404374e-18
season is not normally distributed

Variable team, Statistic: 0.9510927200317383, p-value: 1.0190652561235875e-17
team is not normally distributed

Variable home/visitor, Statistic: 0.6365317702293396, p-value: 1.3682278205667514e-41
home/visitor is not normally distributed

Variable opponent, Statistic: 0.959510087966919, p-value: 5.123296780917765e-16
opponent is not normally distributed

Variable score, Statistic: 0.9973709583282471, p-value: 0.10523498803377151
score is normally distributed

Variable opponentScore, Statistic: 0.9980179667472839, p-value: 0.29022809863090515
opponentScore is normally distributed

Variable moneyLine, Statistic: 0.6321099996566772, p-value: 9.11965040582591e-42
moneyLine is not normally distributed

Variable opponentMoneyLine, Statistic: 0.7641377449035645, p-value: 1.5213425245644e-35
opponentMoneyLine is not normally distributed

Variable total, Statistic: 0.9867267608642

Unnamed: 0,date,season,team,home/visitor,opponent,score,opponentScore,moneyLine,opponentMoneyLine,total,...,day,day_of_week,games_played,cumulative_wins,cumulative_win_rate,cumulative_score,average_score,opponent_games_played,opponent_cumulative_wins,opponent_win_rate
0,2007-10-30,0.0,0.967742,0.0,0.290323,0.975165,-0.603974,0.671795,0.660513,0.515528,...,0.966667,0.166667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,2007-10-30,0.0,0.419355,1.0,0.322581,-0.829566,-0.679171,0.676410,0.654872,0.354037,...,0.966667,0.166667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2,2007-10-30,0.0,0.322581,0.0,0.419355,-0.679171,-0.829566,0.654872,0.676410,0.354037,...,0.966667,0.166667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
3,2007-10-30,0.0,0.870968,1.0,0.806452,0.147997,-0.528777,0.594872,0.712821,0.236025,...,0.966667,0.166667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,2007-10-30,0.0,0.806452,0.0,0.870968,-0.528777,0.147997,0.712821,0.594872,0.236025,...,0.966667,0.166667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37099,2023-01-16,1.0,0.935484,0.0,0.645161,1.426347,1.275953,0.671795,0.660513,0.596273,...,0.500000,0.000000,0.530864,0.338028,0.558140,0.502194,0.798339,0.666667,0.333333,0.5
37100,2023-01-16,1.0,0.451613,1.0,0.774194,2.403910,0.147997,0.625641,0.694872,0.751553,...,0.500000,0.000000,0.518519,0.183099,0.309524,0.514002,0.836565,0.666667,0.333333,0.5
37101,2023-01-16,1.0,0.645161,1.0,0.935484,1.275953,1.426347,0.660513,0.671795,0.596273,...,0.500000,0.000000,0.543210,0.267606,0.431818,0.524660,0.815097,0.666667,0.333333,0.5
37102,2023-01-16,1.0,0.548387,1.0,0.967742,1.576742,1.651939,0.657692,0.674615,0.763975,...,0.500000,0.000000,0.543210,0.309859,0.500000,0.527795,0.819968,0.666667,0.333333,0.5


In [11]:
input_cols = ['date', 'season', 'team', 'home/visitor', 'opponent', 'day', 'day_of_week', 'month',
              'games_played', 'cumulative_wins', 'cumulative_win_rate', 'cumulative_score', 'average_score', 
              'opponent_games_played', 'opponent_cumulative_wins', 'opponent_win_rate']

output_cols = ['moneyLine', 'total', 'spread', 'secondHalfTotal', 'score', 'win']

In [12]:
periodic_cols = ['day', 'day_of_week', 'month']

for col in periodic_cols:
    df['sin_' + col] = np.sin(np.pi * df[col])

In [13]:
from sklearn.preprocessing import PolynomialFeatures

expand_cols = []

poly = PolynomialFeatures(3)
for col in expand_cols:
    poly.fit_transform(col)

In [14]:
df[input_cols].to_csv("inputs.csv", index=False)
df[output_cols].to_csv("outputs.csv", index=False)