In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.learning_curve import validation_curve
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, mean_squared_error, mean_squared_log_error
from sklearn.cluster import KMeans
import statsmodels.api as sm
import math
from scipy import stats
from scipy.stats import boxcox
from scipy.special import boxcox1p




In [2]:
df = pd.read_csv('nfl_betting_df.csv', index_col = 0)

df['exp_win_difference'] = df.home_exp_win_pct- df.away_exp_win_pct
df['schedule_date']=df['schedule_date'].apply(pd.to_datetime)
df = df[df.schedule_season > 1978]

df['total_ppg'] = (df.h_ppg + df.a_ppg)
df['total_points_against'] = (df.h_papg + df.a_papg)
df['last_four_difference'] = (df.home_win_pct_last_4 - df.away_win_pct_last_4)

df[['weather_temperature', 'weather_wind_mph', 'weather_humidity']]=df[['weather_temperature', 'weather_wind_mph', 'weather_humidity']].fillna(value=np.round(df.mean(),3))
df[['over_under_line']]=df[['over_under_line']].fillna(value=(np.round(df.mean()*2)/2))

df['over_under_result'] = np.where(df['over_under_result']=='under', 0, df.over_under_result)
df['over_under_result'] = np.where(df['over_under_result']=='over', 1, df.over_under_result)
df['over_under_result'] = np.where(df['over_under_result']=='push', 2, df.over_under_result)

df['dome'] = np.where(df['weather_detail']=='DOME', 1, 0)

df = df.fillna(value = 0)

df.index = range(0, len(df))

reg_df = df[(df.schedule_week > 1) & (df.schedule_week < 19) & (df.schedule_season>2001)]
week5_df = df[(df.schedule_week > 4) & (df.schedule_week < 17) & (df.schedule_season>2001)]

# Cluster Predictions

In [15]:
cluster_df = pd.read_csv('cluster_df.csv',index_col=0)

good_o_good_d = cluster_df[cluster_df.cluster==2]
bad_o_bad_d = cluster_df[cluster_df.cluster==0]
bad_o_good_d = cluster_df[cluster_df.cluster==1]
good_o_bad_d = cluster_df[cluster_df.cluster==3]
cluster_df

Unnamed: 0,over_under_result,spread_favorite,home_favorite,last_four_difference,exp_win_difference,score_home,home_losses,spread_favorite.1,away_exp_win_pct,away_pt_diff_pg,...,total_ppg,total_points_against,weather_humidity,point_total,weather_temperature,schedule_season,a_papg,h_papg,per_game_difference,cluster
5431,0,-2.5,1,-0.50,-0.106576,6,2,-2.5,0.988403,14.750,...,52.17,29.75,66.552,26,72.0,2002,11.75,18.00,-0.83,2
5432,1,-3.0,0,-0.25,-0.469109,31,2,-3.0,0.981043,18.000,...,70.67,52.42,69.000,80,55.0,2002,19.67,32.75,-4.67,3
5433,0,-5.0,1,0.25,0.282943,13,1,-5.0,0.652895,2.250,...,39.75,29.75,72.000,29,71.0,2002,18.50,11.25,-1.75,1
5434,1,-6.0,1,0.25,0.533319,21,2,-6.0,0.139544,-5.667,...,39.42,42.08,68.000,47,57.0,2002,19.33,22.75,12.08,1
5435,1,-1.5,0,0.00,-0.217037,17,2,-1.5,0.316216,-2.250,...,28.00,37.50,55.000,38,78.0,2002,16.00,21.50,0.50,1
5436,0,-6.5,1,-0.25,-0.330077,26,1,-6.5,0.994859,16.000,...,50.00,31.25,43.000,35,53.0,2002,9.50,21.75,-1.00,2
5437,1,-13.5,1,0.50,0.810391,28,1,-13.5,0.000151,-24.000,...,27.08,46.08,66.552,49,72.0,2002,29.75,16.33,15.58,0
5438,1,-3.0,0,-0.25,-0.054563,28,1,-3.0,0.990207,19.000,...,60.33,31.67,79.000,53,81.0,2002,16.00,15.67,-9.67,2
5439,0,-3.0,1,0.00,-0.017698,26,1,-3.0,0.944540,12.250,...,64.75,41.25,78.000,39,82.0,2002,20.00,21.25,0.25,2
5440,1,-3.0,1,0.50,0.704727,32,1,-3.0,0.080456,-8.667,...,43.42,46.58,66.552,61,72.0,2002,24.33,22.25,12.08,3


In [16]:
atl_phi = {'home_id':'PHI','away_id':'ATL', 'over_under':47.5,
          'TPAPG':20.33+19.27,'TPPG':22.07+30.47,'temp':83,'wind':7,'humidity':66.552,'dome':0,'date':'9/6/18','time':'night'}
pit_cle = {'home_id':'CLE','away_id':'PIT', 'over_under':48,
          'TPAPG':18.93+25.47,'TPPG':25.2+14,'temp':67,'wind':7,'humidity':66.552,'dome':0,'date':'9/9/18','time':'1pm'}
sf_min = {'home_id':'MIN','away_id':'SF', 'over_under':47.5,
          'TPAPG':24.67+16.16,'TPPG':19.8+23.93,'temp':72,'wind':0,'humidity':66.552,'dome':1,'date':'9/9/18','time':'1pm'}
cin_ind = {'home_id':'IND', 'away_id':'CIN', 'over_under':47.5,
          'TPAPG':26.07+21.47,'TPPG':16+17.27,'temp':72,'wind':0,'humidity':66.552,'dome':1,'date':'9/9/18','time':'1pm'}
buf_bal = {'home_id':'BAL', 'away_id':'BUF', 'over_under':42,
          'TPAPG':18.13+22.87,'TPPG':18.67+24.53,'temp':73.5,'wind':3.5,'humidity':66.552,'dome':0,'date':'9/9/18','time':'1pm'}
jax_nyg = {'home_id':'NYG', 'away_id':'JAX', 'over_under':44.5,
          'TPAPG':25.2+16.87,'TPPG':15.2+27.13,'temp':72,'wind':7,'humidity':66.552,'dome':0,'date':'9/9/18','time':'1pm'}
tb_no = {'home_id':'NO', 'away_id':'TB', 'over_under':52.5,
          'TPAPG':23.87+19.67,'TPPG':20.27+28.27,'temp':72,'wind':0,'humidity':66.552,'dome':1,'date':'9/9/18','time':'1pm'}
hou_ne = {'home_id':'NE', 'away_id':'HOU', 'over_under':51,
          'TPAPG':19.33+27.6,'TPPG':28.8+21.67,'temp':71.67,'wind':6.67,'humidity':66.552,'dome':0,'date':'9/9/18','time':'1pm'}
ten_mia = {'home_id':'MIA', 'away_id':'TEN', 'over_under':47,
          'TPAPG':24.73+23.07,'TPPG':17.67+21.27,'temp':85,'wind':9,'humidity':66.552,'dome':0,'date':'9/9/18','time':'1pm'}
kc_lac = {'home_id':'LAC', 'away_id':'KC', 'over_under':48.5,
          'TPAPG':17.47+21,'TPPG':25.87+21.67,'temp':79,'wind':6.33,'humidity':66.552,'dome':0,'date':'9/9/18','time':'4pm'}
sea_den = {'home_id':'DEN', 'away_id':'SEA', 'over_under':42,
          'TPAPG':20.33+23.67,'TPPG':17.67+22.8,'temp':74.67,'wind':5.67,'humidity':66.552,'dome':0,'date':'9/9/18','time':'4pm'}
dal_car = {'home_id':'CAR', 'away_id':'DAL', 'over_under':42,
          'TPAPG':22.13+20.33,'TPPG':23.2+23.53,'temp':82,'wind':5,'humidity':66.552,'dome':0,'date':'9/9/18','time':'4pm'}
was_ari = {'home_id':'ARI', 'away_id':'WAS', 'over_under':45,
          'TPAPG':24.67+22.47,'TPPG':22.13+17.93,'temp':72,'wind':0,'humidity':66.552,'dome':1,'date':'9/9/18','time':'4pm'}
chi_gb = {'home_id':'GB', 'away_id':'CHI', 'over_under':48.5,
          'TPAPG':23.27+19.8,'TPPG':20.6+16.93,'temp':71,'wind':8.33,'humidity':66.552,'dome':0,'date':'9/9/18','time':'night'}
nyj_det = {'home_id':'DET', 'away_id':'NYJ', 'over_under':48.5,
          'TPAPG':24.33+23.73,'TPPG':25+19.47,'temp':72,'wind':0,'humidity':66.552,'dome':1,'date':'9/10/18','time':'night'}
lar_oak = {'home_id':'OAK', 'away_id':'LAR', 'over_under':49.5,
          'TPAPG':19.67+22.87,'TPPG':31+19.4,'temp':69,'wind':6,'humidity':66.552,'dome':0,'date':'9/10/18','time':'night'}

week1_games = [atl_phi,pit_cle,sf_min,cin_ind,buf_bal,jax_nyg,tb_no,hou_ne,ten_mia,kc_lac,sea_den,dal_car,was_ari,chi_gb,nyj_det,lar_oak]
w1 = pd.DataFrame(week1_games)
w1 = w1.drop(columns=['humidity'])

In [17]:
w1['good_o_good_d']= np.where((w1.TPPG>=48) & (w1.TPAPG<=48),1,0)
w1['good_o_bad_d']= np.where((w1.TPPG>=48) & (w1.TPAPG>=48),1,0)
w1['bad_o_good_d']= np.where((w1.TPPG<=48) & (w1.TPAPG<=48),1,0)
w1['bad_o_bad_d']= np.where((w1.TPPG<=48) & (w1.TPAPG>=48),1,0)

In [18]:
def week_one_cluster_predictions(df):
    y_preds = []
    for index, row in df.iterrows():
        if row.bad_o_good_d == 1:
            cluster_name = bad_o_good_d

            yt,max_lambda =boxcox(cluster_name.over_under_line)
            xt1,x1_lam=boxcox(cluster_name.total_points_against)
            xt2,x2_lam=boxcox(cluster_name.total_ppg)
            xt3,x3_lam=boxcox(cluster_name.weather_temperature+1)
            xt4,x4_lam=boxcox(cluster_name.weather_wind_mph+1)
            
            papg = boxcox1p(row.TPAPG,x1_lam)
            ppg = boxcox1p(row.TPPG,x2_lam)
            temp = boxcox1p(row.temp,x3_lam)
            wind = boxcox1p(row.wind,x4_lam)
            
            y = 1.409371e+01 + 4.455859e-07*papg +2.866647e-06*ppg + 2.335357e-03*temp + -9.971634e-02*wind
            y_pred = np.round(untransform(y,max_lambda)*2)/2
            y_preds.append(y_pred)
            
        if row.bad_o_bad_d == 1:
            cluster_name = bad_o_bad_d

            yt,max_lambda =boxcox(cluster_name.over_under_line)
            xt1,x1_lam=boxcox(cluster_name.total_points_against)
            xt2,x2_lam=boxcox(cluster_name.total_ppg)
            xt3,x3_lam=boxcox(cluster_name.weather_temperature+1)
            xt4,x4_lam=boxcox(cluster_name.weather_wind_mph+1)
            
            papg = boxcox1p(row.TPAPG,x1_lam)
            ppg = boxcox1p(row.TPPG,x2_lam)
            temp = boxcox1p(row.temp,x3_lam)
            wind = boxcox1p(row.wind,x4_lam)
            
            y = -4.895784e+07 + 2.018292e+08*papg +7.046641e-05*ppg + 1.220436e-03*temp + -4.167669e-01*wind
            y_pred = np.round(untransform(y,max_lambda)*2)/2
            y_preds.append(y_pred)
        
        if row.good_o_bad_d == 1:
            cluster_name = good_o_bad_d

            yt,max_lambda =boxcox(cluster_name.over_under_line)
            xt1,x1_lam=boxcox(cluster_name.total_points_against)
            xt2,x2_lam=boxcox(cluster_name.total_ppg)
            xt3,x3_lam=boxcox(cluster_name.weather_temperature+1)
            xt4,x4_lam=boxcox(cluster_name.weather_wind_mph+1)
            
            papg = boxcox1p(row.TPAPG,x1_lam)
            ppg = boxcox1p(row.TPPG,x2_lam)
            temp = boxcox1p(row.temp,x3_lam)
            wind = boxcox1p(row.wind,x4_lam)
            
            y = -1.633614e+07 + 4.488241e+06*papg + 7.210020e+07*ppg + 3.264604e-05*temp + -2.055851e-02*wind
            y_pred = np.round(untransform(y,max_lambda)*2)/2
            y_preds.append(y_pred)
            
        if row.good_o_good_d == 1:
            cluster_name = good_o_good_d

            yt,max_lambda =boxcox(cluster_name.over_under_line)
            xt1,x1_lam=boxcox(cluster_name.total_points_against)
            xt2,x2_lam=boxcox(cluster_name.total_ppg)
            xt3,x3_lam=boxcox(cluster_name.weather_temperature+1)
            xt4,x4_lam=boxcox(cluster_name.weather_wind_mph+1)
            
            papg = boxcox1p(row.TPAPG,x1_lam)
            ppg = boxcox1p(row.TPPG,x2_lam)
            temp = boxcox1p(row.temp,x3_lam)
            wind = boxcox1p(row.wind,x4_lam)
            
            y = -6.763740e+07 + 4.999671e-06*papg + 2.929533e+08*ppg + 1.018648e-03*temp + -2.310345e-01*wind
            y_pred = np.round(untransform(y,max_lambda)*2)/2
            y_preds.append(y_pred)
            
    return y_preds

In [19]:
def untransform(arr, lambda_):
    result = np.exp(np.log(lambda_ * arr + 1) / lambda_)
    return result

In [20]:
w1_preds = week_one_cluster_predictions(w1)
w1['prediction'] = w1_preds
w1['pred-actual'] = w1.prediction- w1.over_under
w1

Unnamed: 0,TPAPG,TPPG,away_id,date,dome,home_id,over_under,temp,time,wind,good_o_good_d,good_o_bad_d,bad_o_good_d,bad_o_bad_d,prediction,pred-actual
0,39.6,52.54,ATL,9/6/18,0,PHI,47.5,83.0,night,7.0,1,0,0,0,67.0,19.5
1,44.4,39.2,PIT,9/9/18,0,CLE,48.0,67.0,1pm,7.0,0,0,1,0,43.0,-5.0
2,40.83,43.73,SF,9/9/18,1,MIN,47.5,72.0,1pm,0.0,0,0,1,0,45.0,-2.5
3,47.54,33.27,CIN,9/9/18,1,IND,47.5,72.0,1pm,0.0,0,0,1,0,44.0,-3.5
4,41.0,43.2,BUF,9/9/18,0,BAL,42.0,73.5,1pm,3.5,0,0,1,0,44.0,2.0
5,42.07,42.33,JAX,9/9/18,0,NYG,44.5,72.0,1pm,7.0,0,0,1,0,43.5,-1.0
6,43.54,48.54,TB,9/9/18,1,NO,52.5,72.0,1pm,0.0,1,0,0,0,68.5,16.0
7,46.93,50.47,HOU,9/9/18,0,NE,51.0,71.67,1pm,6.67,1,0,0,0,70.0,19.0
8,47.8,38.94,TEN,9/9/18,0,MIA,47.0,85.0,1pm,9.0,0,0,1,0,46.0,-1.0
9,38.47,47.54,KC,9/9/18,0,LAC,48.5,79.0,4pm,6.33,0,0,1,0,46.5,-2.0


# Non-Cluster Predictions

In [21]:
def week_one_predictions(df):
    y_preds = []
    for index, row in df.iterrows():

        yt,max_lambda =boxcox(week5_df.over_under_line)
        xt1,x1_lam=boxcox(week5_df.total_points_against)
        xt2,x2_lam=boxcox(week5_df.total_ppg)
        xt3,x3_lam=boxcox(week5_df.weather_temperature+1)
        xt4,x4_lam=boxcox(week5_df.weather_wind_mph+1)
        xt7,xlam7=boxcox(week5_df.schedule_season)

        papg = boxcox1p(row.TPAPG,x1_lam)
        ppg = boxcox1p(row.TPPG,x2_lam)
        temp = boxcox1p(row.temp,x3_lam)
        wind = boxcox1p(row.wind,x4_lam)
        dome = row.dome
        season = boxcox1p(2018,xlam7)
        
        y = -2.575851e+06 + 6.509747e-02*papg +3.615986e-01*ppg + 1.505438e-04*temp + -2.959061e-02*wind + 1.402209e-01*dome +4.085226e+06*season 
        y_pred = np.round(untransform(y,max_lambda)*2)/2
        y_preds.append(y_pred)
            
    return y_preds

In [22]:
reg_w1 = w1
reg_w1_preds = week_one_predictions(reg_w1)
reg_w1['prediction'] = reg_w1_preds
reg_w1['pred-actual'] = reg_w1.prediction- reg_w1.over_under
reg_w1

Unnamed: 0,TPAPG,TPPG,away_id,date,dome,home_id,over_under,temp,time,wind,good_o_good_d,good_o_bad_d,bad_o_good_d,bad_o_bad_d,prediction,pred-actual
0,39.6,52.54,ATL,9/6/18,0,PHI,47.5,83.0,night,7.0,1,0,0,0,47.0,-0.5
1,44.4,39.2,PIT,9/9/18,0,CLE,48.0,67.0,1pm,7.0,0,0,1,0,42.0,-6.0
2,40.83,43.73,SF,9/9/18,1,MIN,47.5,72.0,1pm,0.0,0,0,1,0,45.0,-2.5
3,47.54,33.27,CIN,9/9/18,1,IND,47.5,72.0,1pm,0.0,0,0,1,0,42.0,-5.5
4,41.0,43.2,BUF,9/9/18,0,BAL,42.0,73.5,1pm,3.5,0,0,1,0,43.5,1.5
5,42.07,42.33,JAX,9/9/18,0,NYG,44.5,72.0,1pm,7.0,0,0,1,0,43.0,-1.5
6,43.54,48.54,TB,9/9/18,1,NO,52.5,72.0,1pm,0.0,1,0,0,0,48.0,-4.5
7,46.93,50.47,HOU,9/9/18,0,NE,51.0,71.67,1pm,6.67,1,0,0,0,48.0,-3.0
8,47.8,38.94,TEN,9/9/18,0,MIA,47.0,85.0,1pm,9.0,0,0,1,0,43.5,-3.5
9,38.47,47.54,KC,9/9/18,0,LAC,48.5,79.0,4pm,6.33,0,0,1,0,44.5,-4.0


In [289]:
def bayes(line, our_prediction):
    over = [prob[1] for prob in prob_over if prob[0] == line][0]
    under = [prob[1] for prob in prob_under if prob[0] == line][0]
    push = [prob[1] for prob in prob_push if prob[0] == line][0]
    
    pred_o_given_o = len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line) & (preds_df.over_under_pred>preds_df.over_under_line)])/len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line)])
    pred_p_given_o = len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line) & (preds_df.over_under_pred==preds_df.over_under_line)])/len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line)])
    pred_u_given_o = len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line) & (preds_df.over_under_pred<preds_df.over_under_line)])/len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line)])

    pred_o_given_p = len(preds_df[(preds_df.over_under_result==2) & (preds_df.over_under_line==line) & (preds_df.over_under_pred>preds_df.over_under_line)])/(len(preds_df[(preds_df.over_under_result==2) & (preds_df.over_under_line==line)])+1)
    pred_p_given_p = len(preds_df[(preds_df.over_under_result==2) & (preds_df.over_under_line==line) & (preds_df.over_under_pred==preds_df.over_under_line)])/(len(preds_df[(preds_df.over_under_result==2) & (preds_df.over_under_line==line)])+1)
    pred_u_given_p = len(preds_df[(preds_df.over_under_result==2) & (preds_df.over_under_line==line) & (preds_df.over_under_pred<preds_df.over_under_line)])/(len(preds_df[(preds_df.over_under_result==2) & (preds_df.over_under_line==line)])+1)

    pred_o_given_u = len(preds_df[(preds_df.over_under_result==0) & (preds_df.over_under_line==line) & (preds_df.over_under_pred>preds_df.over_under_line)])/len(preds_df[(preds_df.over_under_result==0) & (preds_df.over_under_line==line)])
    pred_p_given_u = len(preds_df[(preds_df.over_under_result==0) & (preds_df.over_under_line==line) & (preds_df.over_under_pred==preds_df.over_under_line)])/len(preds_df[(preds_df.over_under_result==0) & (preds_df.over_under_line==line)])
    pred_u_given_u = len(preds_df[(preds_df.over_under_result==0) & (preds_df.over_under_line==line) & (preds_df.over_under_pred<preds_df.over_under_line)])/len(preds_df[(preds_df.over_under_result==0) & (preds_df.over_under_line==line)])

    if our_prediction < line:
        joint1 = pred_o_given_u * under
        joint2 = pred_p_given_u * under
        joint3 = pred_u_given_u * under
        normalizer = joint1 + joint2 + joint3
        return (joint3/normalizer)
    elif our_prediction > line:
        joint1 = pred_o_given_o * over
        joint2 = pred_p_given_o * over
        joint3 = pred_u_given_o * over
        normalizer = joint1 + joint2 + joint3
        return (joint1/normalizer)
    elif our_prediction == line:
        joint1 = pred_o_given_p * push
        joint2 = pred_p_given_p * push
        joint3 = pred_u_given_p * push
        normalizer = joint1 + joint2 + joint3
        return (joint2/normalizer)

In [312]:
bayes_probs = []
for index, row in reg_w1.iterrows():
    bayes_probs.append(np.round(bayes(row.over_under, row.prediction),3))
    
reg_w1['prob_correct_pred']=bayes_probs



In [313]:
reg_w1

Unnamed: 0,TPAPG,TPPG,away_id,date,dome,home_id,over_under,temp,time,wind,good_o_good_d,good_o_bad_d,bad_o_good_d,bad_o_bad_d,prediction,pred-actual,prob_correct_pred
0,39.6,52.54,ATL,9/6/18,0,PHI,47.5,83.0,night,7.0,1,0,0,0,47.0,-0.5,0.744
1,44.4,39.2,PIT,9/9/18,0,CLE,48.0,67.0,1pm,7.0,0,0,1,0,42.0,-6.0,0.711
2,40.83,43.73,SF,9/9/18,1,MIN,47.5,72.0,1pm,0.0,0,0,1,0,45.0,-2.5,0.744
3,47.54,33.27,CIN,9/9/18,1,IND,47.5,72.0,1pm,0.0,0,0,1,0,42.0,-5.5,0.744
4,41.0,43.2,BUF,9/9/18,0,BAL,42.0,73.5,1pm,3.5,0,0,1,0,43.5,1.5,0.5
5,42.07,42.33,JAX,9/9/18,0,NYG,44.5,72.0,1pm,7.0,0,0,1,0,43.0,-1.5,0.51
6,43.54,48.54,TB,9/9/18,1,NO,52.5,72.0,1pm,0.0,1,0,0,0,48.0,-4.5,0.833
7,46.93,50.47,HOU,9/9/18,0,NE,51.0,71.67,1pm,6.67,1,0,0,0,48.0,-3.0,0.824
8,47.8,38.94,TEN,9/9/18,0,MIA,47.0,85.0,1pm,9.0,0,0,1,0,43.5,-3.5,0.603
9,38.47,47.54,KC,9/9/18,0,LAC,48.5,79.0,4pm,6.33,0,0,1,0,44.5,-4.0,0.778


In [252]:
over = [prob[1] for prob in prob_over if prob[0] == 47.5][0]
under = [prob[1] for prob in prob_under if prob[0] == 47.5][0]
push = [prob[1] for prob in prob_push if prob[0] == 47.5][0]
over,under,push

(0.4691358024691358, 0.5308641975308642, 0.0)

# Predictions for old games

In [23]:
def past_predictions(df):
    y_preds = []
    for index, row in df.iterrows():

        yt,max_lambda =boxcox(week5_df.over_under_line)
        xt1,x1_lam=boxcox(week5_df.total_points_against)
        xt2,x2_lam=boxcox(week5_df.total_ppg)
        xt3,x3_lam=boxcox(week5_df.weather_temperature+1)
        xt4,x4_lam=boxcox(week5_df.weather_wind_mph+1)
        xt7,xlam7=boxcox(week5_df.schedule_season)

        papg = boxcox1p(row.total_points_against,x1_lam)
        ppg = boxcox1p(row.total_ppg,x2_lam)
        temp = boxcox1p(row.weather_temperature,x3_lam)
        wind = boxcox1p(row.weather_wind_mph,x4_lam)
        dome = row.dome
        season = boxcox1p(row.schedule_season,xlam7)
        
        y = -2.575851e+06 + 6.509747e-02*papg +3.615986e-01*ppg + 1.505438e-04*temp + -2.959061e-02*wind + 1.402209e-01*dome +4.085226e+06*season 
        y_pred = np.round(untransform(y,max_lambda)*2)/2
        y_preds.append(y_pred)
            
    return y_preds

In [25]:
past_preds = past_predictions(week5_df)
past_preds

[43.5,
 56.5,
 37.0,
 40.0,
 34.0,
 41.0,
 37.5,
 46.0,
 51.0,
 45.0,
 51.5,
 35.5,
 45.5,
 48.0,
 38.5,
 34.5,
 46.0,
 49.0,
 40.5,
 54.0,
 49.5,
 36.5,
 48.0,
 50.5,
 42.5,
 46.0,
 45.5,
 39.5,
 37.0,
 37.0,
 40.0,
 39.0,
 50.0,
 47.0,
 51.5,
 54.5,
 48.0,
 47.5,
 48.0,
 42.5,
 42.5,
 42.0,
 39.5,
 52.0,
 34.5,
 41.0,
 38.5,
 39.5,
 56.0,
 49.0,
 45.5,
 47.0,
 42.0,
 39.0,
 42.0,
 40.0,
 40.5,
 41.5,
 49.0,
 43.5,
 43.0,
 42.0,
 39.0,
 46.0,
 35.0,
 47.5,
 41.5,
 43.0,
 41.0,
 47.0,
 39.0,
 38.0,
 42.5,
 43.0,
 45.5,
 41.0,
 42.0,
 44.0,
 41.5,
 42.0,
 50.5,
 43.0,
 42.0,
 46.5,
 52.5,
 41.5,
 45.0,
 40.0,
 38.5,
 53.0,
 41.5,
 50.0,
 36.5,
 47.5,
 40.0,
 43.5,
 41.5,
 35.0,
 46.5,
 45.0,
 43.5,
 42.0,
 38.5,
 43.0,
 35.5,
 41.5,
 37.5,
 42.5,
 46.0,
 50.0,
 45.5,
 43.0,
 45.5,
 42.5,
 41.0,
 43.0,
 35.5,
 46.5,
 45.0,
 39.0,
 37.5,
 43.5,
 40.5,
 44.5,
 45.5,
 47.0,
 45.5,
 38.0,
 42.0,
 43.0,
 42.5,
 46.0,
 41.0,
 44.5,
 37.5,
 38.5,
 46.0,
 41.0,
 45.5,
 47.0,
 43.0,
 40.5,
 46.0,

In [117]:
preds_df = week5_df.drop(columns=['weather_detail', 'weather_temperature', 'weather_wind_mph',
       'weather_humidity', 'score_difference', 'home_id', 'away_id',
       'home_favorite', 'favorite_covered', 'winning_team', 'losing_team',
       'home_wins', 'home_losses', 'home_ties', 'away_wins', 'away_losses',
       'away_ties', 'home_points_for', 'home_points_against',
       'away_points_for', 'away_points_against', 'h_games', 'a_games',
       'home_win_pct', 'away_win_pct', 'win_pct_diff', 'h_ppg', 'h_papg',
       'a_ppg', 'a_papg', 'home_pt_diff_pg', 'away_pt_diff_pg', 'pt_diff_pg',
       'home_exp_win_pct', 'away_exp_win_pct', 'home_win_pct_last_4',
       'away_win_pct_last_4', 'exp_win_difference', 'total_ppg',
       'total_points_against', 'last_four_difference', 'dome','team_favorite_id','spread_favorite'])
preds_df['over_under_pred'] = past_preds
preds_df['pred-actual'] = preds_df.over_under_pred- preds_df.over_under_line

preds_df['good_pred']= np.where((preds_df.over_under_pred>preds_df.over_under_line) & 
                                (preds_df.point_total>preds_df.over_under_line),1,0)
preds_df['good_pred']= np.where((preds_df.over_under_pred<preds_df.over_under_line) & 
                                (preds_df.point_total<preds_df.over_under_line),1,preds_df.good_pred)
preds_df['good_pred']= np.where((preds_df.over_under_pred==preds_df.over_under_line),1,preds_df.good_pred)
preds_df['good_pred']= np.where((preds_df.point_total==preds_df.over_under_line),1,preds_df.good_pred)

preds_df['new_ou_result']=np.where(preds_df.over_under_pred<preds_df.point_total,1,0)
preds_df['new_ou_result']=np.where(preds_df.over_under_pred==preds_df.point_total,2,preds_df.new_ou_result)
preds_df[preds_df.over_under_result==2]

Unnamed: 0,schedule_date,schedule_season,schedule_week,team_home,team_away,over_under_line,score_home,score_away,point_total,over_under_result,over_under_pred,pred-actual,good_pred,new_ou_result
5449,2002-10-13,2002,6,Indianapolis Colts,Baltimore Ravens,42.0,22,20,42,2,40.5,-1.5,1,1
5536,2002-11-24,2002,12,Denver Broncos,Indianapolis Colts,43.0,20,23,43,2,41.5,-1.5,1,1
5783,2003-11-16,2003,11,Carolina Panthers,Washington Redskins,37.0,20,17,37,2,40.0,3.0,1,0
5792,2003-11-16,2003,11,Oakland Raiders,Minnesota Vikings,46.0,28,18,46,2,44.5,-1.5,1,1
5793,2003-11-16,2003,11,Philadelphia Eagles,New York Giants,38.0,28,10,38,2,38.5,0.5,1,0
5970,2004-10-10,2004,5,New England Patriots,Miami Dolphins,34.0,24,10,34,2,35.5,1.5,1,0
6025,2004-11-07,2004,9,Denver Broncos,Houston Texans,44.0,31,13,44,2,40.5,-3.5,1,1
6273,2005-10-24,2005,7,Atlanta Falcons,New York Jets,41.0,27,14,41,2,40.5,-0.5,1,1
6310,2005-11-13,2005,10,New York Giants,Minnesota Vikings,45.0,21,24,45,2,43.0,-2.0,1,1
6612,2006-11-26,2006,12,Tennessee Titans,New York Giants,45.0,24,21,45,2,42.0,-3.0,1,1


In [202]:
print('The model gave good predictions for the over/under in {}% of the games'.format(np.round(np.sum(preds_df.good_pred)/(len(preds_df))*100,2)))
print('\nNote: A prediction is classified as good when our model guides the bettor to a win or push')


The model gave good predictions for the over/under in 54.77% of the games

Note: A prediction is classified as good when our model guides the bettor to a win or push


In [155]:
pred_o_and_o=len(preds_df[(preds_df.over_under_pred>preds_df.over_under_line) & (preds_df.over_under_result==1)])/len(preds_df.over_under_result==1)
pred_u_and_o=len(preds_df[(preds_df.over_under_pred<preds_df.over_under_line) & (preds_df.over_under_result==1)])/len(preds_df.over_under_result==1)
pred_p_and_o=len(preds_df[(preds_df.over_under_pred==preds_df.over_under_line) & (preds_df.over_under_result==1)])/len(preds_df.over_under_result==1)


In [156]:
pred_u_and_u=len(preds_df[(preds_df.over_under_pred<preds_df.over_under_line) & (preds_df.over_under_result==0)])/len(preds_df.over_under_result==0)
pred_o_and_u=len(preds_df[(preds_df.over_under_pred>preds_df.over_under_line) & (preds_df.over_under_result==0)])/len(preds_df.over_under_result==0)
pred_p_and_u=len(preds_df[(preds_df.over_under_pred==preds_df.over_under_line) & (preds_df.over_under_result==0)])/len(preds_df.over_under_result==0)

In [157]:
pred_o_and_p=len(preds_df[(preds_df.over_under_pred>preds_df.over_under_line) & (preds_df.over_under_result==2)])/len(preds_df.over_under_result==2)
pred_u_and_p=len(preds_df[(preds_df.over_under_pred<preds_df.over_under_line) & (preds_df.over_under_result==2)])/len(preds_df.over_under_result==2)
pred_p_and_p=len(preds_df[(preds_df.over_under_pred==preds_df.over_under_line) & (preds_df.over_under_result==2)])/len(preds_df.over_under_result==2)

In [197]:
pred_p_and_p

0.001751927119831815

# Predicting O/U for all week 5-16 games since 1979

In [122]:
all_df = df[(df.schedule_week > 4) & (df.schedule_week < 17)]

In [123]:
all_preds = past_predictions(all_df)
all_preds

[40.5,
 39.5,
 36.5,
 38.5,
 42.0,
 41.5,
 35.5,
 41.5,
 41.0,
 35.5,
 35.5,
 42.5,
 40.0,
 37.0,
 36.5,
 40.0,
 39.5,
 36.5,
 36.0,
 38.0,
 39.5,
 37.5,
 39.0,
 40.5,
 35.0,
 34.5,
 39.5,
 36.5,
 35.0,
 35.0,
 39.0,
 39.5,
 38.0,
 35.5,
 31.0,
 38.0,
 36.5,
 35.5,
 40.0,
 34.5,
 41.0,
 41.0,
 35.5,
 38.5,
 38.0,
 33.0,
 35.5,
 32.5,
 36.5,
 44.0,
 39.5,
 41.0,
 42.5,
 36.5,
 36.0,
 36.5,
 38.0,
 35.5,
 36.5,
 32.0,
 39.0,
 43.5,
 35.5,
 33.5,
 37.5,
 40.5,
 36.5,
 38.0,
 39.0,
 40.0,
 38.0,
 36.5,
 39.0,
 34.5,
 36.5,
 37.0,
 35.5,
 36.0,
 41.0,
 38.5,
 36.5,
 41.0,
 36.0,
 38.0,
 35.0,
 40.5,
 40.5,
 35.0,
 37.5,
 34.5,
 41.5,
 35.5,
 34.5,
 42.0,
 37.0,
 39.5,
 35.5,
 36.5,
 34.5,
 38.5,
 37.0,
 43.0,
 35.0,
 37.5,
 35.5,
 37.5,
 40.0,
 36.5,
 42.5,
 34.5,
 36.5,
 37.0,
 39.5,
 37.0,
 39.5,
 35.0,
 40.0,
 37.0,
 34.5,
 39.0,
 35.0,
 41.5,
 36.5,
 39.0,
 35.0,
 43.5,
 38.0,
 34.0,
 42.5,
 36.0,
 36.5,
 36.0,
 38.0,
 34.5,
 42.0,
 39.5,
 40.0,
 35.0,
 33.5,
 42.0,
 37.5,
 40.0,
 37.5,

In [164]:
all_games_df = all_df.drop(columns=['weather_detail', 'weather_temperature', 'weather_wind_mph',
       'weather_humidity', 'score_difference', 'home_id', 'away_id',
       'home_favorite', 'favorite_covered', 'winning_team', 'losing_team',
       'home_wins', 'home_losses', 'home_ties', 'away_wins', 'away_losses',
       'away_ties', 'home_points_for', 'home_points_against',
       'away_points_for', 'away_points_against', 'h_games', 'a_games',
       'home_win_pct', 'away_win_pct', 'win_pct_diff', 'h_ppg', 'h_papg',
       'a_ppg', 'a_papg', 'home_pt_diff_pg', 'away_pt_diff_pg', 'pt_diff_pg',
       'home_exp_win_pct', 'away_exp_win_pct', 'home_win_pct_last_4',
       'away_win_pct_last_4', 'exp_win_difference', 'total_ppg',
       'total_points_against', 'last_four_difference', 'dome','team_favorite_id','spread_favorite'])
all_games_df['over_under_pred'] = all_preds
all_games_df['pred-actual'] = all_games_df.over_under_pred- all_games_df.over_under_line

all_games_df['good_pred']= np.where((all_games_df.over_under_pred>all_games_df.over_under_line) & 
                                (all_games_df.point_total>all_games_df.over_under_line),1,0)
all_games_df['good_pred']= np.where((all_games_df.over_under_pred<all_games_df.over_under_line) & 
                                (all_games_df.point_total<all_games_df.over_under_line),1,all_games_df.good_pred)

all_games_df['good_pred']= np.where((all_games_df.over_under_pred==all_games_df.over_under_line),1,all_games_df.good_pred)
all_games_df['good_pred']= np.where((all_games_df.point_total==all_games_df.over_under_line),1,all_games_df.good_pred)

all_games_df['new_ou_result']=np.where(all_games_df.over_under_pred<all_games_df.point_total,1,0)
all_games_df['new_ou_result']=np.where(all_games_df.over_under_pred==all_games_df.point_total,2,all_games_df.new_ou_result)
all_games_df[all_games_df.over_under_result==2]

Unnamed: 0,schedule_date,schedule_season,schedule_week,team_home,team_away,over_under_line,score_home,score_away,point_total,over_under_result,over_under_pred,pred-actual,good_pred,new_ou_result
90,1979-10-14,1979,7,Kansas City Chiefs,Denver Broncos,34.0,10,24,34,2,31.0,-3.0,1,1
165,1979-11-18,1979,12,Tampa Bay Buccaneers,New York Giants,34.0,31,3,34,2,34.5,0.5,1,0
170,1979-11-25,1979,13,Atlanta Falcons,New Orleans Saints,43.0,6,37,43,2,39.5,-3.5,1,1
208,1979-12-09,1979,15,Washington Redskins,Cincinnati Bengals,42.0,28,14,42,2,40.0,-2.0,1,1
321,1980-10-19,1980,7,Denver Broncos,Kansas City Chiefs,40.0,17,23,40,2,37.0,-3.0,1,1
322,1980-10-19,1980,7,Houston Oilers,Tampa Bay Buccaneers,34.0,20,14,34,2,37.5,3.5,1,0
393,1980-11-23,1980,12,Minnesota Vikings,Green Bay Packers,38.0,13,25,38,2,35.5,-2.5,1,1
431,1980-12-14,1980,15,Atlanta Falcons,San Francisco 49ers,45.0,35,10,45,2,41.0,-4.0,1,1
454,1980-12-21,1980,16,Seattle Seahawks,Denver Broncos,42.0,17,25,42,2,40.5,-1.5,1,1
455,1980-12-21,1980,16,St. Louis Cardinals,Washington Redskins,38.0,7,31,38,2,35.5,-2.5,1,1


In [201]:
print('The model gave good predictions for the over/under in {}% of the games'.format(np.round(np.sum(all_games_df.good_pred)/(len(all_games_df))*100,2)))
print('\nNote: A prediction is classified as good when our model guides the bettor to a win or push')


The model gave good predictions for the over/under in 54.79% of the games

Note: A prediction is classified as good when our model guides the bettor to a win or push


In [295]:
line=47.5
pred_o_given_o = len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line) & (preds_df.over_under_pred>preds_df.over_under_line)])/len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line)])
pred_p_given_o = len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line) & (preds_df.over_under_pred==preds_df.over_under_line)])/len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line)])
pred_u_given_o = len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line) & (preds_df.over_under_pred<preds_df.over_under_line)])/len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line)])

pred_o_given_p = len(preds_df[(preds_df.over_under_result==2) & (preds_df.over_under_line==line) & (preds_df.over_under_pred>preds_df.over_under_line)])/(len(preds_df[(preds_df.over_under_result==2) & (preds_df.over_under_line==line)])+1)
pred_p_given_p = len(preds_df[(preds_df.over_under_result==2) & (preds_df.over_under_line==line) & (preds_df.over_under_pred==preds_df.over_under_line)])/(len(preds_df[(preds_df.over_under_result==2) & (preds_df.over_under_line==line)])+1)
pred_u_given_p = len(preds_df[(preds_df.over_under_result==2) & (preds_df.over_under_line==line) & (preds_df.over_under_pred<preds_df.over_under_line)])/(len(preds_df[(preds_df.over_under_result==2) & (preds_df.over_under_line==line)])+1)

pred_o_given_u = len(preds_df[(preds_df.over_under_result==0) & (preds_df.over_under_line==line) & (preds_df.over_under_pred>preds_df.over_under_line)])/len(preds_df[(preds_df.over_under_result==0) & (preds_df.over_under_line==line)])
pred_p_given_u = len(preds_df[(preds_df.over_under_result==0) & (preds_df.over_under_line==line) & (preds_df.over_under_pred==preds_df.over_under_line)])/len(preds_df[(preds_df.over_under_result==0) & (preds_df.over_under_line==line)])
pred_u_given_u = len(preds_df[(preds_df.over_under_result==0) & (preds_df.over_under_line==line) & (preds_df.over_under_pred<preds_df.over_under_line)])/len(preds_df[(preds_df.over_under_result==0) & (preds_df.over_under_line==line)])

cond_probs = [pred_o_given_o,pred_p_given_o,pred_u_given_o, pred_o_given_p,pred_p_given_p,pred_u_given_p,
              pred_o_given_u,pred_p_given_u,pred_u_given_u]

In [296]:
pred_o_given_o = len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line) & (preds_df.over_under_pred>preds_df.over_under_line)])/len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line)])
pred_p_given_o = len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line) & (preds_df.over_under_pred==preds_df.over_under_line)])/len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line)])
pred_u_given_o = len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line) & (preds_df.over_under_pred<preds_df.over_under_line)])/len(preds_df[(preds_df.over_under_result==1) & (preds_df.over_under_line==line)])


pred_o_given_o + pred_p_given_o +pred_u_given_o



1.0

# Pivot table % cover given line

In [170]:
o_u_pivot = week5_df.pivot_table(index='over_under_line', columns='over_under_result',
                    aggfunc={'over_under_result':len}, fill_value = 0)
o_u_pivot['row_total'] = 0
for index, row in o_u_pivot.iterrows():
    row.row_total = sum(row)
ou_covered = o_u_pivot.over_under_result[1]
ou_no_cover = o_u_pivot.over_under_result[0]
ou_push = o_u_pivot.over_under_result[2]
lines = sorted(set(week5_df.over_under_line))
x_lines = np.array([spread for spread in lines])
y_over = [ou_covered[value]/o_u_pivot['row_total'][value] for value in x_lines]
y_under = [ou_no_cover[value]/o_u_pivot['row_total'][value] for value in x_lines]
y_neither = [ou_push[value]/o_u_pivot['row_total'][value] for value in x_lines]
prob_over = list(zip(x_lines,y_over))
prob_under = list(zip(x_lines,y_under))
prob_push = list(zip(x_lines,y_neither))