In [None]:
import pandas as pd
import functions as func
import pickle
import xgboost as xgb
from patsy import dmatrices

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PoissonRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as mae



In [36]:
# Pull every scored game for the season from the GameSheet stats endpoint
# (the limit filter is set high so the whole season comes back in one call).
api_url = 'https://gamesheetstats.com/api/useScoredGames/getSeasonScores/6642?filter[limit]=10000'

# API field -> short snake_case column name. The key order is the column
# order of the resulting frame; 'date' keeps its original name.
season_column_aliases = {
    'date': 'date',
    'game.number': 'game_number',
    'game.type': 'game_type',
    'game.gameId': 'game_id',
    'game.homeTeam.id': 'hometeam_id',
    'game.homeTeam.name': 'hometeam_name',
    'game.homeTeam.division': 'hometeam_division',
    'game.visitorTeam.id': 'visitorteam_id',
    'game.visitorTeam.name': 'visitorteam_name',
    'game.visitorTeam.division': 'visitorteam_division',
    'game.finalScore.homeGoals': 'homegoals',
    'game.finalScore.visitorGoals': 'visitorgoals',
}

# Keep only the fields of interest, then apply the shorter names.
df_season1 = (
    func.parse_rest_api_to_pandas(api_url)[list(season_column_aliases)]
    .rename(columns=season_column_aliases)
)
df_season1

Unnamed: 0,date,game_number,game_type,game_id,hometeam_id,hometeam_name,hometeam_division,visitorteam_id,visitorteam_name,visitorteam_division,homegoals,visitorgoals
0,"Sat, Feb 15, 2025",,Regular Season,2248963,248315,Cyclones Academy 14U Tier 1,14U Tier 1,248128,Seacoast Performance Academy Spartans 14U,14U Tier 1,0,8
1,"Sat, Feb 15, 2025",6950204,Regular Season,2238756,251274,Northern Cyclones Squirt Minor Gold,10U Tier 4,248303,Keene 10U White,10U Tier 4,6,4
2,"Sat, Feb 15, 2025",6703687,Regular Season,1941510,248474,Manchester 12U Major II - Engvik,12U Tier 3,248475,Manchester 12U Minor I - Hawkes,12U Tier 3,9,3
3,"Sat, Feb 15, 2025",6749225,Regular Season,2244541,248654,Concord 11U Elite - Poirier,12U Tier 4,250808,Granite State Wild PW Minor 11U,12U Tier 4,1,9
4,"Sat, Feb 15, 2025",6694611,Regular Season,1941221,251301,Upper Valley 10u Blue,10U Tier 3,248656,Berlin 10U Paiva,10U Tier 3,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...
972,"Sat, Sep 7, 2024",6674448,Regular Season,1907924,248481,Manchester 18U - Baker,18U Tier 2,251195,NH Mountain Kings 18U AAA Full Season,18U Tier 2,2,2
973,"Sat, Sep 7, 2024",6676288,Regular Season,1907923,250620,Granite State Wild 18U Midget (Strum),18U Tier 4,250602,Back Bay Indians 18U,18U Tier 4,3,5
974,"Sat, Sep 7, 2024",6676268,Regular Season,1907921,250622,Granite State Wild Midget 18U (Behan),18U Tier 2,250819,NH Mountain Kings U18-Split,18U Tier 2,2,4
975,"Sat, Sep 7, 2024",6675036,Regular Season,1907856,248479,Manchester 16U - Bernard,16U Tier 2,250655,NH Avalanche 16U Tier 1,16U Tier 2,6,3


In [38]:
# Reverse home & visitor team metrics to get a full dataset.
# Each game is duplicated with the home/visitor roles (id, name, division and
# goals) swapped, so every game appears once from each team's perspective.
home_visitor_swap = {
    'hometeam_id': 'visitorteam_id',
    'hometeam_name': 'visitorteam_name',
    'hometeam_division': 'visitorteam_division',
    'visitorteam_id': 'hometeam_id',
    'visitorteam_name': 'hometeam_name',
    'visitorteam_division': 'hometeam_division',
    'homegoals': 'visitorgoals',
    'visitorgoals': 'homegoals',
}
df_season2 = df_season1.rename(columns=home_visitor_swap)
# concat aligns on column *names*, so the swapped labels land in the right columns.
df_season = pd.concat([df_season1, df_season2], ignore_index=True, axis=0, join="inner")

# Coerce the scores to numeric BEFORE differencing: if the API delivers the
# goals as strings, subtracting them directly would raise a TypeError.
goal_cols = ['homegoals', 'visitorgoals']
for col in goal_cols:
    df_season[col] = pd.to_numeric(df_season[col])
df_season['goal_diff'] = df_season['homegoals'] - df_season['visitorgoals']

df_season['date'] = pd.to_datetime(df_season['date'])

# Every remaining column (ids, names, divisions, game metadata) is treated as
# a categorical label rather than a quantity.
non_categorical = {'homegoals', 'visitorgoals', 'goal_diff', 'date'}
categorical_cols = [col for col in df_season.columns if col not in non_categorical]
df_season = df_season.astype({col: 'category' for col in categorical_cols})
df_season.dtypes

date                    datetime64[ns]
game_number                   category
game_type                     category
game_id                       category
hometeam_id                   category
hometeam_name                 category
hometeam_division             category
visitorteam_id                category
visitorteam_name              category
visitorteam_division          category
homegoals                        int64
visitorgoals                     int64
goal_diff                        int64
dtype: object

In [None]:
# Build GLM framework
# The response is the goal differential; the predictors are the home and
# visitor team ids, which patsy expands into indicator columns (plus an
# Intercept column) because they are categorical.
glm_forumula = ''' goal_diff ~ hometeam_id + visitorteam_id '''

# return_type='dataframe' yields pandas objects for both the response (y)
# and the design matrix (X).
y, X = dmatrices(
    formula_like=glm_forumula,
    data=df_season,
    return_type='dataframe',
)

X

Unnamed: 0,Intercept,hometeam_id[T.248126],hometeam_id[T.248127],hometeam_id[T.248128],hometeam_id[T.248129],hometeam_id[T.248142],hometeam_id[T.248143],hometeam_id[T.248144],hometeam_id[T.248145],hometeam_id[T.248146],...,visitorteam_id[T.251348],visitorteam_id[T.251349],visitorteam_id[T.251350],visitorteam_id[T.251351],visitorteam_id[T.251352],visitorteam_id[T.251361],visitorteam_id[T.251409],visitorteam_id[T.251410],visitorteam_id[T.251411],visitorteam_id[T.272534]
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1949,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1950,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1951,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1952,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# GLM v1
# Ordinary least squares on the team-indicator design matrix.
glm = LinearRegression()
glm.fit(X, y)

# Index the predictions by X.index rather than a fresh 0..n-1 range: patsy
# drops rows with missing values by default, so X may have fewer rows than
# df_season. Positional concatenation would then attach predictions to the
# wrong games and push NaNs into the metrics.
pred = pd.DataFrame(glm.predict(X), columns=['pred'], index=X.index)
# Inner join keeps only the games that actually made it into the model.
out = pd.concat([df_season, pred], axis=1, join='inner')

# NOTE: these are in-sample metrics - the model is scored on the same rows
# it was fitted on.
print('R2: ', round(r2_score(out['goal_diff'], out['pred']), 4))
print('MAE: ', round(mae(out['goal_diff'], out['pred']), 4))

R2:  0.6995
MAE:  2.1667
MAPE:  626476291563277.6
