In [30]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, ridge_regression, Lasso, LinearRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score, accuracy_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression, r_regression
from sympy import Point
import os, shutil
from scipy.stats import percentileofscore
import math
from cubist import Cubist
from sqlalchemy import create_engine
np.warnings = warnings
import import_ipynb


%store -r transformed_defence_data
%store -r evenStrengthD_names
%store -r transformed_ppD_data
%store -r powerPlayD_names
%store -r transformed_pkD_data
%store -r penaltyKillD_names
%store -r transformed_forward_data
%store -r evenStrengthF_names
%store -r transformed_ppF_data
%store -r powerPlayF_names
%store -r transformed_pkF_data
%store -r penaltyKillF_names

%store -r evenStrengthF
%store -r powerPlayF
%store -r penaltyKillF
%store -r evenStrengthD
%store -r penaltyKillD
%store -r powerPlayD

%store -r evenStrengthG
%store -r powerPlayG
%store -r penaltyKillG

offensive_stats = transformed_defence_data.filter(regex="I_F").columns
OnIce = transformed_defence_data.filter(regex="OnIce_F").columns
offensive_stats = offensive_stats.append(OnIce)
offensive_stats = offensive_stats.append(transformed_forward_data.filter(items=["finishing"]).columns)
offensive_stats = offensive_stats.drop(["I_F_takeaways", "I_F_hits", "I_F_dZoneGiveaways", 'I_F_penalityMinutes',
                                         'I_F_freeze', 'I_F_xPlayStopped', "I_F_playStopped", "I_F_xFreeze"])

defensive_other = transformed_defence_data.filter(items=["I_F_takeaways", "I_F_dZoneGiveaways", "shotsBlockedByPlayer"]).columns
defensive_stats = transformed_defence_data.filter(regex="OnIce_A").columns
defensive_stats = defensive_stats.append(defensive_other)

#involve both offensive and defensive stats
general_stats = transformed_defence_data.filter(items=["onIce_xGoalsPercentage", "onIce_corsiPercentage", 
                                                       "onIce_fenwickPercentage", "gameScore"]).columns
#both_stats = ["onIce_xGoalsPercentage", "onIce_corsiPercentage", "onIce_fenwickPercentage"]

penalty_stats_negative = ["penalties", "I_F_penalityMinutes"]
penalty_stats_positive =  ["penalityMinutesDrawn", "penaltiesDrawn"]
penalty_stats = ["penalties", "I_F_penalityMinutes", "penalityMinutesDrawn", "penaltiesDrawn"]

hits = ["I_F_hits"]

# 'I_F_oZoneShiftStarts', 'I_F_dZoneShiftStarts',
#        'I_F_neutralZoneShiftStarts', 'I_F_flyShiftStarts',
#        'I_F_oZoneShiftEnds', 'I_F_dZoneShiftEnds', 'I_F_neutralZoneShiftEnds',
#        'I_F_flyShiftEnds', 'I_F_shifts', 


In [31]:
def transformer(df):
    pt = PowerTransformer()
    pt.set_output(transform='pandas')
    new_df = pt.fit_transform(df)

    return new_df

In [32]:
def get_percentile_rank(df):
    return df.rank(pct=True).mul(100)

Stats we care about:
1. Individual Goals: I_F_goals (EV) + I_F_goals (PP) + I_F_goals (PK)
2. Individual Primary Assists: I_F_primaryAssists (EV) + I_F_primaryAssists (PP) + I_F_primaryAssists (PK)
3. Penalty Differential
4. Even Strength xGoals for %: onIce_xGoalsPercentage (EV)
5. Even strength chances given up: OnIce_A_xGoals (EV)
6. Powerplay Chances: (I_F_xGoals (PP) + OnIce_F_xGoals (PP))
7. Penalty Kill Chances given up: Onice_A_xGoals (PK)
8. Powerplay +/-: OnIce_F_goals (PP) - OnIce_A_goals (PP)
9. Penalty Kill +/-: OnIce_F_goals (PK) - OnIce_A_goals (PK)
10. EV +/-: OnIce_F_goals (EV) - OnIce_A_goals (EV)
11. Finishing: finishing (goals - xGoals)
12. Physicality: I_F_hits

In [33]:
def calculate_important_stats(evenStrength, powerPlay, penaltyKill):
    new_df = pd.DataFrame()

    evenStrength = evenStrength.loc[evenStrength.icetime > 5000]
    powerPlay = powerPlay.loc[powerPlay.icetime > 3000]
    penaltyKill = penaltyKill.loc[penaltyKill.icetime > 3000]

    EV_name, EV_data = evenStrength.iloc[:, :5], evenStrength.iloc[:,6:]
    PP_name, PP_data = powerPlay.iloc[:, :5], powerPlay.iloc[:,6:]
    PK_name, PK_data = penaltyKill.iloc[:, :5], penaltyKill.iloc[:,6:]
    EV_data = EV_data.add_prefix("EV_")

    team_rank = evenStrength["team_rank"]
    team_rank.reset_index(drop=True, inplace=True)

    cap_hit = evenStrength["cap_hit"]
    cap_hit.reset_index(drop=True, inplace=True)
    
    PP_data = PP_data.add_prefix("PP_")
    PK_data = PK_data.add_prefix("PK_")

    EV_data.iloc[:, 9:] = EV_data.iloc[:, 9:].div(EV_data.EV_icetime, axis=0)
    #EV_data.iloc[:, 1] = EV_data.iloc[:, 1].div(EV_data.EV_icetime, axis=0)
    PP_data.iloc[:, 9:] = PP_data.iloc[:, 9:].div(PP_data.PP_icetime, axis=0)
    #PP_data.iloc[:, 1] = PP_data.iloc[:, 1].div(PP_data.PP_icetime, axis=0)
    PK_data.iloc[:, 9:] = PK_data.iloc[:, 9:].div(PK_data.PK_icetime, axis=0)
    #PK_data.iloc[:, 1] = PK_data.iloc[:, 1].div(PK_data.PK_icetime, axis=0)

    EV_avg = EV_data.mean().drop('EV_icetime')
    PP_avg = PP_data.mean().drop("PP_icetime")
    PK_avg = PK_data.mean().drop("PK_icetime")


    EV_data.iloc[:,1:] = (EV_data.iloc[:,1:] - EV_avg).mul(EV_data.EV_icetime, axis=0)
    EV_data.iloc[:,1:9] = EV_data.iloc[:, 1:9].div(EV_data.EV_icetime, axis=0)
    PP_data.iloc[:,1:] = (PP_data.iloc[:,1:] - PP_avg).mul(PP_data.PP_icetime, axis=0)
    PP_data.iloc[:,1:9] = PP_data.iloc[:, 1:9].div(PP_data.PP_icetime, axis=0)
    PK_data.iloc[:,1:] = (PK_data.iloc[:,1:] - PK_avg).mul(PK_data.PK_icetime, axis=0)
    PK_data.iloc[:,1:9] = PK_data.iloc[:, 1:9].div(PK_data.PK_icetime, axis=0)


    evenStrength = pd.concat([EV_name, EV_data], axis=1)
    powerPlay = pd.concat([PP_name, PP_data], axis=1)
    penaltyKill = pd.concat([PK_name, PK_data], axis=1)

    full_df = pd.merge(evenStrength, powerPlay, how='left', on=['playerId', 'season', 'name', 'team', 'position'])
    full_df = pd.merge(full_df, penaltyKill, how='left', on=['playerId', 'season', 'name', 'team', 'position'])
    
    full_df.fillna(0, inplace=True)

    #print(full_df)

    new_df[['playerId', 'season', 'name', 'team', 'position', 'EV_icetime', 'PP_icetime', 'PK_icetime']] = full_df[['playerId', 'season', 'name', 'team', 'position', 'EV_icetime', 'PP_icetime', 'PK_icetime']]
    new_df["team_rank"] = team_rank
    new_df["cap_hit"] = cap_hit

    # new_df["EV_IG"] = full_df["EV_I_F_goals"]
    # new_df["PP_IG"] = full_df["PP_I_F_goals"]
    # new_df["PK_IG"] = full_df["PK_I_F_goals"]

    # 1. Calculate IG
    new_df["Individual_Goals"] = full_df["EV_I_F_goals"] + full_df["PP_I_F_goals"] + full_df["PK_I_F_goals"]

    # 2. Calculate Individual primary assists
    new_df["Individual_primaryAssists"] = full_df["EV_I_F_primaryAssists"] + full_df["PP_I_F_primaryAssists"] + full_df["PK_I_F_primaryAssists"]

    # 3. Secondary Assists above Average
    new_df["Individual_secondaryAssists"] = full_df["EV_I_F_secondaryAssists"] + full_df["PP_I_F_secondaryAssists"] + full_df["PK_I_F_secondaryAssists"]

    new_df["Production"] = new_df["Individual_Goals"] + new_df["Individual_primaryAssists"] + new_df["Individual_secondaryAssists"]

    # 3. Calculate Penalty Differential
    new_df["Penalty_Differential"] = ((full_df["EV_penaltiesDrawn"] + full_df["PP_penaltiesDrawn"] + full_df["PK_penaltiesDrawn"]) -
                                      (full_df["EV_penalties"] + full_df["PP_penalties"] + full_df["PK_penalties"])  )

    # 4. Even Strength xGoals for %: onIce_xGoalsPercentage (EV) scaled by 100
    new_df["EV_xGoalsPercentage"] = full_df["EV_onIce_xGoalsPercentage"]

    # 5. Even strength chances given up: OnIce_A_xGoals (EV)
    new_df["EV_xGoals_Against"] = -1* (full_df["EV_OnIce_A_xGoals"] + full_df["EV_OnIce_A_flurryAdjustedxGoals"] + full_df["EV_OnIce_A_scoreVenueAdjustedxGoals"]) / 3

    # 6. Powerplay Chances: (I_F_xGoals (PP) + OnIce_F_xGoals (PP)) - give a boost if you are getting you own chances
    new_df["PP_Chances"] = full_df["PP_I_F_xGoals"] + full_df["PP_OnIce_F_xGoals"]

    new_df["EV_Chances"] = full_df["EV_I_F_xGoals"] + full_df["EV_OnIce_F_xGoals"]

    # 7. Penalty Kill Chances given up: Onice_A_xGoals (PK)
    new_df["PK_xGoals_Against"] = -1* (full_df["PK_OnIce_A_xGoals"] + full_df["PK_OnIce_A_flurryAdjustedxGoals"] + full_df["PK_OnIce_A_scoreVenueAdjustedxGoals"]) / 3

    # 8. Powerplay expected +/-: OnIce_F_goals (PP) - OnIce_A_goals (PP)
    new_df["PP_differential"] = full_df["PP_OnIce_F_goals"] - full_df["PP_OnIce_A_goals"]

    # 9. Penalty Kill expected +/-: OnIce_F_goals (PK) - OnIce_A_goals (PK)
    new_df["PK_differential"] = full_df["PK_OnIce_F_goals"] - full_df["PK_OnIce_A_goals"]

    # 10. EV +/-: OnIce_F_goals (EV) - OnIce_A_goals (EV)
    new_df["EV_differential"] = full_df["EV_OnIce_F_goals"] - full_df["EV_OnIce_A_goals"]

    # 11. Finishing: finishing (goals - xGoals)
    new_df["Finishing"] = ((full_df["EV_I_F_goals"] + full_df["PP_I_F_goals"] + full_df["PK_I_F_goals"]) - 
                           (full_df["EV_I_F_xGoals"] + full_df["PP_I_F_xGoals"] + full_df["PK_I_F_xGoals"]))
    # 12. Physicality: I_F_hits
    new_df["Physicality"] = full_df["EV_I_F_hits"] + full_df["PP_I_F_hits"] + full_df["PK_I_F_hits"]

    new_df["Defensive_Actions"] = (full_df["EV_shotsBlockedByPlayer"] + full_df["PK_shotsBlockedByPlayer"] + full_df["PP_shotsBlockedByPlayer"]
                                    + full_df["EV_I_F_takeaways"] + full_df["PP_I_F_takeaways"] + full_df["PK_I_F_takeaways"])
   
    new_df["EV_Defensive_Impact"] = (full_df["EV_OnIce_A_xGoals"] - full_df["EV_OffIce_A_xGoals"])

    # 13. EV High Danger Chances Against
    new_df["EV_HighDangerAgainst"] = -1* (full_df["EV_OnIce_A_highDangerxGoals"] + full_df["EV_OnIce_A_highDangerShots"])
    # 14. PK High Danger Chances Against
    new_df["PK_HighDangerAgainst"] = -1* (full_df["PK_OnIce_A_highDangerxGoals"] + full_df["PK_OnIce_A_highDangerShots"])
    # 15. High Danger chances for
    new_df["High_Danger_Chances_For"] = (full_df["EV_I_F_highDangerxGoals"] + full_df["PP_I_F_highDangerxGoals"] + full_df["PK_I_F_highDangerxGoals"]
                                         + full_df["EV_I_F_highDangerShots"] + full_df["PP_I_F_highDangerShots"] + full_df["PK_I_F_highDangerShots"])

    new_df["GameScore"] = full_df["EV_gameScore"] + full_df["PP_gameScore"] + full_df["PK_gameScore"]


    return new_df


modelF = calculate_important_stats(evenStrengthF, powerPlayF, penaltyKillF)
modelD = calculate_important_stats(evenStrengthD, powerPlayD, penaltyKillD)

In [34]:
def calculate_goalie_stats(evenStrength, powerPlay, penaltyKill):
    new_df = pd.DataFrame()

    evenStrength = evenStrength.loc[evenStrength.icetime > 5000]
    powerPlay = powerPlay.loc[powerPlay.icetime > 3000]
    penaltyKill = penaltyKill.loc[penaltyKill.icetime > 3000]
    
    evenStrength["GSAE"] = evenStrength["xGoals"] - evenStrength["goals"]
    evenStrength["Low_Danger"] = evenStrength["lowDangerxGoals"] - evenStrength["lowDangerGoals"]
    evenStrength["Medium_Danger"] = evenStrength["mediumDangerxGoals"] - evenStrength["mediumDangerGoals"]
    evenStrength["High_Danger"] = evenStrength["highDangerxGoals"] - evenStrength["highDangerGoals"]

    powerPlay["GSAE"] = powerPlay["xGoals"] - powerPlay["goals"]
    powerPlay["Low_Danger"] = powerPlay["lowDangerxGoals"] - powerPlay["lowDangerGoals"]
    powerPlay["Medium_Danger"] = powerPlay["mediumDangerxGoals"] - powerPlay["mediumDangerGoals"]
    powerPlay["High_Danger"] = powerPlay["highDangerxGoals"] - powerPlay["highDangerGoals"]

    penaltyKill["GSAE"] = penaltyKill["xGoals"] - penaltyKill["goals"]
    penaltyKill["Low_Danger"] = penaltyKill["lowDangerxGoals"] - penaltyKill["lowDangerGoals"]
    penaltyKill["Medium_Danger"] = penaltyKill["mediumDangerxGoals"] - penaltyKill["mediumDangerGoals"]
    penaltyKill["High_Danger"] = penaltyKill["highDangerxGoals"] - penaltyKill["highDangerGoals"]

    EV_name, EV_data = evenStrength.iloc[:, :5], evenStrength.iloc[:,6:]
    PP_name, PP_data = powerPlay.iloc[:, :5], powerPlay.iloc[:,6:]
    PK_name, PK_data = penaltyKill.iloc[:, :5], penaltyKill.iloc[:,6:]
    EV_data = EV_data.add_prefix("EV_")

    team_rank = evenStrength["team_rank"]
    team_rank.reset_index(drop=True, inplace=True)

    cap_hit = evenStrength["cap_hit"]
    cap_hit.reset_index(drop=True, inplace=True)
    
    PP_data = PP_data.add_prefix("PP_")
    PK_data = PK_data.add_prefix("PK_")

    EV_data.iloc[:, 1:] = EV_data.iloc[:, 1:].div(EV_data.EV_icetime, axis=0)
    PP_data.iloc[:, 1:] = PP_data.iloc[:, 1:].div(PP_data.PP_icetime, axis=0)
    PK_data.iloc[:, 1:] = PK_data.iloc[:, 1:].div(PK_data.PK_icetime, axis=0)

    EV_avg = EV_data.mean().drop('EV_icetime')
    PP_avg = PP_data.mean().drop("PP_icetime")
    PK_avg = PK_data.mean().drop("PK_icetime")


    EV_data.iloc[:,1:] = (EV_data.iloc[:,1:] - EV_avg).mul(EV_data.EV_icetime, axis=0)
    PP_data.iloc[:,1:] = (PP_data.iloc[:,1:] - PP_avg).mul(PP_data.PP_icetime, axis=0)
    PK_data.iloc[:,1:] = (PK_data.iloc[:,1:] - PK_avg).mul(PK_data.PK_icetime, axis=0)

    evenStrength = pd.concat([EV_name, EV_data], axis=1)
    powerPlay = pd.concat([PP_name, PP_data], axis=1)
    penaltyKill = pd.concat([PK_name, PK_data], axis=1)

    full_df = pd.merge(evenStrength, powerPlay, how='left', on=['playerId', 'season', 'name', 'team', 'position'])
    full_df = pd.merge(full_df, penaltyKill, how='left', on=['playerId', 'season', 'name', 'team', 'position'])
    
    full_df.fillna(0, inplace=True)

    new_df[['playerId', 'season', 'name', 'team', 'position', 'EV_icetime', 'PP_icetime', 'PK_icetime']] = full_df[['playerId', 'season', 'name', 'team', 'position', 'EV_icetime', 'PP_icetime', 'PK_icetime']]
    new_df["team_rank"] = team_rank
    new_df["cap_hit"] = cap_hit

    new_df["GSAE"] = full_df["EV_GSAE"] + full_df["PP_GSAE"] + full_df["PK_GSAE"]
    new_df["Rebound_Control"] = full_df["EV_rebounds"] + full_df["PP_rebounds"] + full_df["PK_rebounds"]
    new_df["Low_Danger"] = full_df["EV_Low_Danger"] + full_df["PP_Low_Danger"] + full_df["PK_Low_Danger"]
    new_df["Medium_Danger"] = full_df["EV_Medium_Danger"] + full_df["PP_Medium_Danger"] + full_df["PK_Medium_Danger"]
    new_df["High_Danger"] = full_df["EV_High_Danger"] + full_df["PP_High_Danger"] + full_df["PK_High_Danger"]

    return new_df

modelG = calculate_goalie_stats(evenStrengthG, powerPlayG, penaltyKillG)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evenStrength["GSAE"] = evenStrength["xGoals"] - evenStrength["goals"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evenStrength["Low_Danger"] = evenStrength["lowDangerxGoals"] - evenStrength["lowDangerGoals"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evenStrength["Medium_Danger"] = evenStren

In [35]:
def lr_cross_validation(data, target):
    #X, y = data.drop(columns=[target]).to_numpy(), data[target].to_numpy()
    X = data
    y = target
    
    clf = Ridge(positive=True)
    estimates = cross_validate(clf, X, y, cv=25, return_estimator=True)
    
    max_accuracy_index = np.argmax(estimates["test_score"])
    print(estimates["test_score"][max_accuracy_index])
    return estimates['estimator'][max_accuracy_index].coef_


In [36]:
target = modelF["GameScore"]
target2 = modelF["cap_hit"]
modelF_train = modelF.drop(columns=["GameScore"])
modelF_coef = lr_cross_validation(modelF_train.iloc[:, 10:], target)
sorted_indices = np.argsort(modelF_coef)
X = modelF_train.iloc[:, 10:]
print(X.columns[sorted_indices][::-1])
print(modelF_coef[sorted_indices][::-1])
%store modelF_coef

#cap_hit_coef = lr_cross_validation(modelF_train.iloc[:, 10:], target2)
#temp = modelF_train.iloc[:, 10:].dot(cap_hit_coef)
modelF_train["gameScore"] = modelF_train.iloc[:,10:].dot(modelF_coef)
#modelF_train["model_cap_hit"] = temp
modelF_train.to_csv('./linearRegressionRankings/aboveAvgF.csv', index=False)



0.9411736035737274
Index(['EV_xGoals_Against', 'PP_Chances', 'Production', 'Finishing',
       'Individual_primaryAssists', 'Penalty_Differential',
       'EV_Defensive_Impact', 'EV_Chances', 'PK_differential',
       'EV_differential', 'PP_differential', 'Individual_Goals',
       'PK_HighDangerAgainst', 'Individual_secondaryAssists',
       'EV_xGoalsPercentage', 'Physicality', 'EV_HighDangerAgainst',
       'High_Danger_Chances_For', 'PK_xGoals_Against', 'Defensive_Actions'],
      dtype='object')
[1.98768495 1.49998456 1.14597543 1.12449941 1.0171045  0.85096062
 0.84046512 0.8290289  0.79787426 0.49733597 0.35599511 0.25808084
 0.21881229 0.09184585 0.03489905 0.02099791 0.         0.
 0.         0.        ]
Stored 'modelF_coef' (ndarray)


In [37]:
target = modelD["GameScore"]
modelD_train = modelD.drop(columns=["GameScore"])
modelD_coef = lr_cross_validation(modelD_train.iloc[:, 12:], target)
sorted_indices = np.argsort(modelD_coef)
X = modelD_train.iloc[:, 12:]
print(X.columns[sorted_indices][::-1])
print(modelD_coef[sorted_indices][::-1])
%store modelD_coef

modelD_train["gameScore"] = modelD_train.iloc[:,12:].dot(modelD_coef)

modelD_train.to_csv('./linearRegressionRankings/aboveAvgD.csv', index=False)

0.9594614286777909
Index(['Finishing', 'Production', 'EV_Chances', 'EV_xGoals_Against',
       'PP_Chances', 'Penalty_Differential', 'EV_Defensive_Impact',
       'PK_differential', 'EV_differential', 'PP_differential',
       'Defensive_Actions', 'Physicality', 'PK_HighDangerAgainst',
       'High_Danger_Chances_For', 'EV_HighDangerAgainst', 'PK_xGoals_Against',
       'EV_xGoalsPercentage', 'Individual_secondaryAssists'],
      dtype='object')
[2.42989431 2.2225526  1.75087085 1.7258584  1.24277873 0.72494237
 0.72402374 0.26825191 0.15519973 0.13569931 0.0289028  0.01849515
 0.         0.         0.         0.         0.         0.        ]
Stored 'modelD_coef' (ndarray)


In [38]:
def get_multiplication_factor_F(row, avg_cap_hit):
    
    surplus = (201000 * row["Individual_Goals"] + 125000 * row["Individual_primaryAssists"] + 81000 * row["Individual_secondaryAssists"]
                                 + 45000 * row["Penalty_Differential"] + 81000 * row["EV_xGoals_Against"]
                                 + 51000 * row["PP_Chances"] + 71000 + row["EV_Chances"] + 51000 * row["PK_xGoals_Against"]
                                 + 55000 * row["PP_differential"] + 55000 * row["PK_differential"] + 91000 * row["EV_differential"]
                                 +  31000 * row["Finishing"] + 61000 * row["EV_HighDangerAgainst"] 
                                 + 51000 * row["PK_HighDangerAgainst"] + 35000 * row["High_Danger_Chances_For"]
                                 + 21000 * row["Defensive_Actions"] + 1000 * row["Physicality"])
    return surplus

def get_multiplication_factor_D(row):
    surplus = (201000 * row["Individual_Goals"] + 125000 * row["Individual_primaryAssists"] + 71000 * row["Individual_secondaryAssists"]
                                 + 45000 * row["Penalty_Differential"] + 101000 * row["EV_xGoals_Against"]
                                 + 31000 * row["PP_Chances"] + 51000 + row["EV_Chances"] + 61000 * row["PK_xGoals_Against"]
                                 + 45000 * row["PP_differential"] + 55000 * row["PK_differential"] + 91000 * row["EV_differential"]
                                 +  11000 * row["Finishing"] + 81000 * row["EV_HighDangerAgainst"] 
                                 + 71000 * row["PK_HighDangerAgainst"] + 35000 * row["High_Danger_Chances_For"]
                                 + 41000 * row["Defensive_Actions"] + 1000 * row["Physicality"])
    return surplus

def get_multiplication_factor_G(row):
    surplus = (301000 * row["GSAE"] + 31000 * row["Rebound_Control"] + 75000*row["Low_Danger"] + 75000 * row["Medium_Danger"]
               + 85000 * row["High_Danger"])
    
    return surplus

In [39]:
def get_surplusValueOverAvg(aboveAvgDf, position):
    avg_cap_hit = aboveAvgDf["cap_hit"].mean()
    print(avg_cap_hit)

    # .map("${0:,.2f}".format)
        

    if(position == "F"):
        aboveAvgDf["valueAboveAvg"] = aboveAvgDf.apply(lambda x: get_multiplication_factor_F(x, avg_cap_hit), axis=1)
    elif(position == "D"):
        aboveAvgDf["valueAboveAvg"] = aboveAvgDf.apply(lambda x: get_multiplication_factor_D(x), axis=1)
    elif(position == "G"):
        aboveAvgDf["valueAboveAvg"] = aboveAvgDf.apply(lambda x: get_multiplication_factor_G(x), axis=1)

get_surplusValueOverAvg(modelF_train, "F")
get_surplusValueOverAvg(modelD_train, "D")
get_surplusValueOverAvg(modelG, "G")
modelF_train.to_csv('./linearRegressionRankings/aboveAvgF.csv', index=False)
modelD_train.to_csv("./linearRegressionRankings/aboveAvgD.csv", index=False)
modelG.to_csv("./linearRegressionRankings/aboveAvgG.csv", index=False)


3945679.687671233
3866738.4660194176
3259666.661764706


In [40]:
def get_team_strength_above_avg(modelF, modelD, modelG=pd.DataFrame()):
    full_df = pd.concat([modelF, modelD, modelG])
    full_df.drop(columns=["playerId", "season", "name", "position", "EV_icetime", "PP_icetime", "PK_icetime", "team_rank"], inplace=True)
    return full_df.groupby(["team"]).sum()

In [41]:
def team_lr_cross_validation(data, target):
    #X, y = data.drop(columns=[target]).to_numpy(), data[target].to_numpy()
    X = data
    y = target
    
    clf_Ridge = Ridge(positive=True, alpha=5)
    clf_Lasso = Lasso(positive=True)
    clf_LR = LinearRegression(positive=True)
    estimates_Ridge = cross_validate(clf_Ridge, X, y, cv=14, return_estimator=True)
    estimates_Lasso = cross_validate(clf_Lasso, X, y, cv=14, return_estimator=True)
    estimates_LR = cross_validate(clf_LR, X, y, cv=14, return_estimator=True)
    

    max_accuracy_index = np.argmax(estimates_Ridge["test_score"])
    print(estimates_Ridge["test_score"][max_accuracy_index])
    return estimates_Ridge['estimator'][max_accuracy_index].coef_

In [42]:

team_strength = get_team_strength_above_avg(modelF_train, modelD_train, modelG)

team_points = {"team": ["NYR", "DAL", "CAR", "WPG", "FLA", "VAN", "BOS", "COL", "EDM", "TOR", "NSH", "LAK", "TBL",
                        "VGK", "NYI", "STL", "WSH", "DET", "PIT", "MIN", "PHI", "BUF", 'NJD', "CGY", "SEA", "OTT",
                        "ARI", "MTL", "CBJ", "ANA", "CHI", "SJS"],
                "team_points_above_avg": [114, 113, 111, 110, 110, 109, 109, 107, 104, 102, 99, 99, 98, 98, 94, 92, 91, 91, 88, 87, 87, 84, 81, 81, 81, 78, 77, 76, 66, 59, 52, 47]
                }

avg_points = sum(team_points["team_points_above_avg"]) / len(team_points["team_points_above_avg"])


team_points = pd.DataFrame.from_dict(team_points)

#team_points["team_points_above_avg"] = team_points["team_points_above_avg"] - avg_points

team_strength = pd.merge(team_strength, team_points, on=["team"])
team_strength.to_csv('./linearRegressionRankings/teamStrength.csv', index=False)

transformed_training_set = team_strength.copy()

team_strength_train = transformed_training_set[["Individual_Goals", "Individual_primaryAssists", "EV_Chances", "PP_Chances", "Penalty_Differential", 
                                                "EV_xGoals_Against", "PK_xGoals_Against", 
                                                "Defensive_Actions", "GSAE",
                                                "Low_Danger", "High_Danger", "team_points_above_avg"]]



trans = transformer(team_strength_train)
target = trans["team_points_above_avg"]


team_strength_coef = team_lr_cross_validation(trans.drop(columns=["team_points_above_avg"]), target)
sorted_indices = np.argsort(team_strength_coef)
X = team_strength_train
print(X.columns[sorted_indices][::-1])
print(team_strength_coef[sorted_indices][::-1])
%store team_strength_coef 


transformed_training_set["PAA"] = team_strength_train.drop(columns=["team_points_above_avg"]).dot(team_strength_coef)
transformed_training_set.to_csv('./linearRegressionRankings/teamStrength.csv', index=False)

0.9672065511937762
Index(['Low_Danger', 'EV_xGoals_Against', 'Individual_Goals',
       'Individual_primaryAssists', 'GSAE', 'PP_Chances',
       'Penalty_Differential', 'High_Danger', 'PK_xGoals_Against',
       'Defensive_Actions', 'EV_Chances'],
      dtype='object')
[0.25980789 0.23117638 0.20448314 0.1943426  0.17111095 0.1334338
 0.1171459  0.09680517 0.08502167 0.00726813 0.0013436 ]
Stored 'team_strength_coef' (ndarray)


In [43]:
skater_coef, goalie_coef = team_strength_coef[:8], team_strength_coef[8:]
offence_coef, defence_coef = skater_coef[:4], skater_coef[5:]
skater_stats = ["Individual_Goals", "Individual_primaryAssists", "EV_Chances", "PP_Chances", "Penalty_Differential", 
                "EV_xGoals_Against", "PK_xGoals_Against", "Defensive_Actions"]
offence = ["Individual_Goals", "Individual_primaryAssists", "EV_Chances", "PP_Chances"]
defence = ["EV_xGoals_Against", "PK_xGoals_Against", "Defensive_Actions"]
goalie_stats = ["GSAE", "Low_Danger", "High_Danger"]

modelF_train["Offence"] = modelF_train[offence].dot(offence_coef)
modelF_train["Offence_Ranking"] = get_percentile_rank(modelF_train["Offence"])
modelF_train["Defence"] = modelF_train[defence].dot(defence_coef)
modelF_train["Defence_Ranking"] = get_percentile_rank(modelF_train["Defence"])
modelF_train["Penalty_Ranking"] = get_percentile_rank(modelF_train["Penalty_Differential"])
modelF_train["Physicality_Ranking"] = get_percentile_rank(modelF_train["Physicality"])
modelF_train["Production_Ranking"] = get_percentile_rank(modelF_train["Production"])
modelF_train["Finishing_Ranking"] = get_percentile_rank(modelF_train["Finishing"])
modelF_train["PWAA"] = modelF_train[skater_stats].dot(skater_coef)
modelF_train["PWAA_Ranking"] = get_percentile_rank(modelF_train["PWAA"])

modelD_train["Offence"] = modelD_train[offence].dot(offence_coef)
modelD_train["Offence_Ranking"] = get_percentile_rank(modelD_train["Offence"])
modelD_train["Defence"] = modelD_train[defence].dot(defence_coef)
modelD_train["Defence_Ranking"] = get_percentile_rank(modelD_train["Defence"])
modelD_train["Penalty_Ranking"] = get_percentile_rank(modelD_train["Penalty_Differential"])
modelD_train["Physicality_Ranking"] = get_percentile_rank(modelD_train["Physicality"])
modelD_train["Production_Ranking"] = get_percentile_rank(modelD_train["Production"])
modelD_train["Finishing_Ranking"] = get_percentile_rank(modelD_train["Finishing"])
modelD_train["PWAA"] = modelD_train[skater_stats].dot(skater_coef)
modelD_train["PWAA_Ranking"] = get_percentile_rank(modelD_train["PWAA"])

modelG["PWAA"] = modelG[goalie_stats].dot(goalie_coef)
modelG[["GSAE_Ranking", "Low_Danger_Ranking", "Medium_Danger_Ranking", "High_Danger_Ranking", "Rebound_Control_Ranking"]] = get_percentile_rank(modelG[["GSAE", "Low_Danger", "Medium_Danger", "High_Danger", "Rebound_Control"]])
modelG["PWAA_Ranking"] = get_percentile_rank(modelG["PWAA"])

modelF_train.to_csv("./linearRegressionRankings/aboveAvgF.csv", index=False)
modelD_train.to_csv("./linearRegressionRankings/aboveAvgD.csv", index=False)
modelG.to_csv("./linearRegressionRankings/aboveAvgG.csv", index=False)


engine = create_engine("postgresql://postgres:luszalytics@127.0.0.1:5432/hockey_data")
modelF_train.to_sql("PWAA_forwards", engine, if_exists="replace", index=False, schema="PWAA")
modelD_train.to_sql("PWAA_defence", engine, if_exists='replace', index=False, schema="PWAA")
modelG.to_sql("PWAA_goalies", engine, if_exists='replace', index=False, schema="PWAA")
transformed_training_set.to_sql("PWAA_teams", engine, if_exists='replace', index=False, schema="PWAA")

32