In [107]:
import sys
sys.path.append('../')
import pandas as pd
import joblib
from helper_files.etv_model import ETVModel
import statsmodels.api as sm
from statsmodels.formula.api import mixedlm


In [2]:
models = joblib.load('../data/models/all_models.jblb')
data = joblib.load('../data/processed/data_1003.jblb')['df']
etv_model = models['etv_model']
data[['cp', 'fv', 'etv']] = pd.DataFrame(etv_model.predict(data, etv_only=False)).T.values

throw_counts = data.groupby(['thrower'])['etv'].count().reset_index(name='total_throws')
data = data[data['thrower'].isin(throw_counts[throw_counts['total_throws'] > 100]['thrower'])]

In [100]:
groups_assists = ['thrower', 'year']
agg_dict_assists = {
    'etv': 'sum',
    'goal': lambda x: (x == True).sum(),
    'hockey_assist': lambda x: (x == True).sum(),
    'turnover': 'count'
}
df_assists = data.groupby(groups_assists).agg(agg_dict_assists).rename({'goal': 'assists'}, axis=1)
df_assists.index.names = ['player', 'year']
groups_goals = ['receiver', 'year'] 
agg_dict_goals = {
    'goal': lambda x: (x == True).sum(),
}
df_goals = data.groupby(groups_goals).agg(agg_dict_goals).rename({'goal': 'goals'}, axis=1)
df_goals.index.names = ['player', 'year']
player_season_stats = pd.merge(df_assists.reset_index(), df_goals.reset_index(), how='inner', on=['player', 'year'])
player_season_stats


Unnamed: 0,player,year,etv,assists,hockey_assist,turnover,goals
0,aarnstein,2023,14.794841,0,3,23,3
1,aarnstein,2024,85.189461,3,5,131,8
2,aarsenaul1,2021,38.005924,5,5,67,8
3,aarsenaul1,2022,137.546555,20,22,213,32
4,aatkins,2022,137.337967,16,16,221,10
...,...,...,...,...,...,...,...
1743,zschakner,2024,23.054417,2,1,39,2
1744,zslayton,2021,81.223465,6,12,129,15
1745,zslayton,2022,25.338728,0,4,39,8
1746,zslayton,2023,38.912949,2,2,66,5


In [99]:
groups_assists = ['thrower', 'gameID']
agg_dict_assists = {
    'etv': 'sum',
    'goal': lambda x: (x == True).sum(),
    'hockey_assist': lambda x: (x == True).sum(),
    'turnover': 'count'
}
df_assists = data.groupby(groups_assists).agg(agg_dict_assists).rename({'goal': 'assists'}, axis=1)
df_assists.index.names = ['player', 'game']
groups_goals = ['receiver', 'gameID'] 
agg_dict_goals = {
    'goal': lambda x: (x == True).sum(),
}
df_goals = data.groupby(groups_goals).agg(agg_dict_goals).rename({'goal': 'goals'}, axis=1)
df_goals.index.names = ['player', 'game']
player_game_stats = pd.merge(df_assists.reset_index(), df_goals.reset_index(), how='inner', on=['player', 'game'])
player_game_stats


Unnamed: 0,player,game,etv,assists,hockey_assist,turnover,goals
0,aarnstein,2023-06-09-SLC-COL,0.525732,0,0,1,0
1,aarnstein,2023-06-24-MIN-COL,3.817878,0,0,6,1
2,aarnstein,2023-06-30-COL-POR,2.989465,0,1,5,0
3,aarnstein,2023-07-01-COL-SEA,1.326910,0,0,2,0
4,aarnstein,2023-07-14-NY-COL,2.111720,0,1,3,0
...,...,...,...,...,...,...,...
14910,zthoreson,2024-06-21-SD-COL,28.578606,3,4,43,2
14911,zthoreson,2024-06-29-COL-SEA,15.477620,0,2,24,1
14912,zthoreson,2024-06-30-COL-POR,23.791418,5,1,40,1
14913,zthoreson,2024-07-06-IND-COL,12.709991,1,0,18,0


In [106]:
player_season_stats['etv_lag1_year'] = player_season_stats.groupby('player')['etv'].shift(1)
player_season_stats = player_season_stats.dropna(subset=['etv_lag1_year'])

# Perform linear regression of ETV as response and ETV lag1 as predictor
X = player_season_stats['etv_lag1_year']
y = player_season_stats['etv']
X = sm.add_constant(X)  # Adds a constant term to the predictor
model = sm.OLS(y, X).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                    etv   R-squared:                       0.424
Model:                            OLS   Adj. R-squared:                  0.423
Method:                 Least Squares   F-statistic:                     405.4
Date:                Thu, 03 Oct 2024   Prob (F-statistic):           5.81e-68
Time:                        15:36:26   Log-Likelihood:                -3076.9
No. Observations:                 552   AIC:                             6158.
Df Residuals:                     550   BIC:                             6166.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            34.7958      4.164      8.356

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_season_stats['etv_lag1_year'] = player_season_stats.groupby('player')['etv'].shift(1)


In [108]:
# Mixed-effects model can handle unbalanced data
model = mixedlm("etv ~ year", player_season_stats, groups=player_season_stats['player'])
result = model.fit()

# Print the summary
print(result.summary())

            Mixed Linear Model Regression Results
Model:              MixedLM   Dependent Variable:   etv       
No. Observations:   552       Method:               REML      
No. Groups:         370       Scale:                2619.4483 
Min. group size:    1         Log-Likelihood:       -3173.0684
Max. group size:    2         Converged:            Yes       
Mean group size:    1.5                                       
--------------------------------------------------------------
            Coef.   Std.Err.   z    P>|z|   [0.025     0.975] 
--------------------------------------------------------------
Intercept -1486.753 9964.654 -0.149 0.881 -21017.116 18043.610
year          0.782    4.924  0.159 0.874     -8.870    10.433
Group Var  4288.287   14.477                                  



  dat = dat.applymap(lambda x: _formatter(x, float_format))
