In [132]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, ridge_regression, Lasso, LinearRegression
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score, accuracy_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression, r_regression
from sympy import Point
import os, shutil
from scipy.stats import percentileofscore
import math
from sqlalchemy import create_engine
np.warnings = warnings

%store -r transformed_defence_data
%store -r evenStrengthD_names
%store -r transformed_ppD_data
%store -r powerPlayD_names
%store -r transformed_pkD_data
%store -r penaltyKillD_names
%store -r transformed_forward_data
%store -r evenStrengthF_names
%store -r transformed_ppF_data
%store -r powerPlayF_names
%store -r transformed_pkF_data
%store -r penaltyKillF_names

%store -r evenStrengthF
%store -r powerPlayF
%store -r penaltyKillF
%store -r evenStrengthD
%store -r penaltyKillD
%store -r powerPlayD

%store -r evenStrengthG
%store -r powerPlayG
%store -r penaltyKillG

In [133]:
# Use a transformer to scale the data before doing regression analysis
def transformer(df):
    """Fit a default ``PowerTransformer`` on ``df`` and return the transformed data.

    The transformer is configured with ``set_output(transform='pandas')`` so the
    result is a pandas DataFrame rather than a NumPy array. A fresh transformer is
    fitted on every call.
    """
    return PowerTransformer().set_output(transform='pandas').fit_transform(df)

In [134]:
# Return the percentile of each player's stats in a given dataframe
def get_percentile_rank(df):
    """Rank the values of a Series/DataFrame and express the ranks on a 0-100 scale.

    Uses ``rank(pct=True)``, so the largest value in a column maps to 100.
    """
    fractional_rank = df.rank(pct=True)
    return fractional_rank * 100

In [135]:
# Calculate the important "above-average" stats so that we can perform regression analysis
def calculateAboveAverageStats(evenStrength, powerPlay, penaltyKill,
                               min_ev_icetime=5000, min_pp_icetime=3000, min_pk_icetime=3000):
    """Build one row of "above-average" skater stats per even-strength player row.

    For each game situation the stats are compared with the mean of the players
    that pass the ice-time filter, then the three situations are left-merged onto
    the even-strength rows and combined into the model features.

    Positional layout the slicing below relies on (the same for all three frames):
    columns 0-4 are the identifiers playerId, season, name, team, position;
    column 5 is dropped; column 6 must be ``icetime``; the next eight columns end
    up as ``value - mean``; every later column ends up as
    ``(value / icetime - mean(value / icetime)) * icetime``.
    NOTE(review): presumably the eight columns after icetime are already
    rates/percentages and the rest are raw totals - confirm against the notebook
    that builds these frames.

    Parameters
    ----------
    evenStrength, powerPlay, penaltyKill : pandas.DataFrame
        Per-situation skater stats. The inputs are not modified.
    min_ev_icetime, min_pp_icetime, min_pk_icetime : number, optional
        A row is kept only if its ``icetime`` is strictly greater than the
        threshold for its situation (defaults: 5000, 3000, 3000).

    Returns
    -------
    pandas.DataFrame
        Identifier columns, the three ice times and the derived stats. Players
        with no qualifying power-play / penalty-kill row get 0 for those stats.
    """
    id_cols = ['playerId', 'season', 'name', 'team', 'position']

    # Filter each type of game situation so that each player plays at least x seconds to avoid outliers.
    # Work on copies (as the goalie version does) so nothing below can touch the caller's frames.
    evenStrength = evenStrength.loc[evenStrength.icetime > min_ev_icetime].copy()
    powerPlay = powerPlay.loc[powerPlay.icetime > min_pp_icetime].copy()
    penaltyKill = penaltyKill.loc[penaltyKill.icetime > min_pk_icetime].copy()

    # Split Names and data for each game situation
    EV_name, EV_data = evenStrength.iloc[:, :5], evenStrength.iloc[:, 6:]
    PP_name, PP_data = powerPlay.iloc[:, :5], powerPlay.iloc[:, 6:]
    PK_name, PK_data = penaltyKill.iloc[:, :5], penaltyKill.iloc[:, 6:]

    # Add a prefix per game situation. Casting to float up front keeps the in-place
    # ``iloc`` assignments below dtype-compatible even when a stat column is stored as integers.
    EV_data = EV_data.add_prefix("EV_").astype(float)
    PP_data = PP_data.add_prefix("PP_").astype(float)
    PK_data = PK_data.add_prefix("PK_").astype(float)

    # Divide each piece of information by seconds played, then subtract it from the average player in each stat
    EV_data.iloc[:, 9:] = EV_data.iloc[:, 9:].div(EV_data.EV_icetime, axis=0)
    PP_data.iloc[:, 9:] = PP_data.iloc[:, 9:].div(PP_data.PP_icetime, axis=0)
    PK_data.iloc[:, 9:] = PK_data.iloc[:, 9:].div(PK_data.PK_icetime, axis=0)

    EV_avg = EV_data.mean().drop('EV_icetime')
    PP_avg = PP_data.mean().drop("PP_icetime")
    PK_avg = PK_data.mean().drop("PK_icetime")

    # Scale the difference from the average by ice time, then undo that scaling for
    # columns 1-8, which were never divided by ice time in the first place.
    EV_data.iloc[:, 1:] = (EV_data.iloc[:, 1:] - EV_avg).mul(EV_data.EV_icetime, axis=0)
    EV_data.iloc[:, 1:9] = EV_data.iloc[:, 1:9].div(EV_data.EV_icetime, axis=0)
    PP_data.iloc[:, 1:] = (PP_data.iloc[:, 1:] - PP_avg).mul(PP_data.PP_icetime, axis=0)
    PP_data.iloc[:, 1:9] = PP_data.iloc[:, 1:9].div(PP_data.PP_icetime, axis=0)
    PK_data.iloc[:, 1:] = (PK_data.iloc[:, 1:] - PK_avg).mul(PK_data.PK_icetime, axis=0)
    PK_data.iloc[:, 1:9] = PK_data.iloc[:, 1:9].div(PK_data.PK_icetime, axis=0)

    # Merge all data into a single dataframe
    evenStrength = pd.concat([EV_name, EV_data], axis=1)
    powerPlay = pd.concat([PP_name, PP_data], axis=1)
    penaltyKill = pd.concat([PK_name, PK_data], axis=1)

    # Left merges: every even-strength row is kept; missing PP/PK stats become 0
    full_df = pd.merge(evenStrength, powerPlay, how='left', on=id_cols)
    full_df = pd.merge(full_df, penaltyKill, how='left', on=id_cols)
    full_df = full_df.fillna(0)

    new_df = full_df[id_cols + ['EV_icetime', 'PP_icetime', 'PK_icetime']].copy()

    # 1. Calculate IG
    new_df["Individual_Goals"] = full_df["EV_I_F_goals"] + full_df["PP_I_F_goals"] + full_df["PK_I_F_goals"]

    # 2. Calculate Individual primary assists
    new_df["Individual_primaryAssists"] = full_df["EV_I_F_primaryAssists"] + full_df["PP_I_F_primaryAssists"] + full_df["PK_I_F_primaryAssists"]

    # 3. Secondary Assists above Average
    new_df["Individual_secondaryAssists"] = full_df["EV_I_F_secondaryAssists"] + full_df["PP_I_F_secondaryAssists"] + full_df["PK_I_F_secondaryAssists"]

    # 4. Calculate total production
    new_df["Production"] = new_df["Individual_Goals"] + new_df["Individual_primaryAssists"] + new_df["Individual_secondaryAssists"]

    # 5. Calculate Penalty Differential
    new_df["Penalty_Differential"] = ((full_df["EV_penaltiesDrawn"] + full_df["PP_penaltiesDrawn"] + full_df["PK_penaltiesDrawn"]) -
                                      (full_df["EV_penalties"] + full_df["PP_penalties"] + full_df["PK_penalties"]))

    # 6. Even Strength xGoals for %: onIce_xGoalsPercentage (EV), copied as-is (no extra scaling here)
    new_df["EV_xGoalsPercentage"] = full_df["EV_onIce_xGoalsPercentage"]

    # 7. Even strength chances given up: mean of the three OnIce_A xGoals variants (EV), negated so higher is better
    new_df["EV_xGoals_Against"] = -1 * (full_df["EV_OnIce_A_xGoals"] + full_df["EV_OnIce_A_flurryAdjustedxGoals"] + full_df["EV_OnIce_A_scoreVenueAdjustedxGoals"]) / 3

    # 8. Powerplay Chances: (I_F_xGoals (PP) + OnIce_F_xGoals (PP)) - give a boost if you are getting your own chances
    new_df["PP_Chances"] = full_df["PP_I_F_xGoals"] + full_df["PP_OnIce_F_xGoals"]

    # 9. Even strength Chances: (I_F_xGoals (EV) + OnIce_F_xGoals (EV)) - give a boost if you are getting your own chances
    new_df["EV_Chances"] = full_df["EV_I_F_xGoals"] + full_df["EV_OnIce_F_xGoals"]

    # 10. Penalty Kill Chances given up: mean of the three OnIce_A xGoals variants (PK), negated so higher is better
    new_df["PK_xGoals_Against"] = -1 * (full_df["PK_OnIce_A_xGoals"] + full_df["PK_OnIce_A_flurryAdjustedxGoals"] + full_df["PK_OnIce_A_scoreVenueAdjustedxGoals"]) / 3

    # 11. Powerplay +/-: OnIce_F_goals (PP) - OnIce_A_goals (PP)
    new_df["PP_differential"] = full_df["PP_OnIce_F_goals"] - full_df["PP_OnIce_A_goals"]

    # 12. Penalty Kill +/-: OnIce_F_goals (PK) - OnIce_A_goals (PK)
    new_df["PK_differential"] = full_df["PK_OnIce_F_goals"] - full_df["PK_OnIce_A_goals"]

    # 13. EV +/-: OnIce_F_goals (EV) - OnIce_A_goals (EV)
    new_df["EV_differential"] = full_df["EV_OnIce_F_goals"] - full_df["EV_OnIce_A_goals"]

    # 14. Finishing: goals - xGoals
    new_df["Finishing"] = ((full_df["EV_I_F_goals"] + full_df["PP_I_F_goals"] + full_df["PK_I_F_goals"]) -
                           (full_df["EV_I_F_xGoals"] + full_df["PP_I_F_xGoals"] + full_df["PK_I_F_xGoals"]))

    # 15. Physicality: I_F_hits
    new_df["Physicality"] = full_df["EV_I_F_hits"] + full_df["PP_I_F_hits"] + full_df["PK_I_F_hits"]

    # 16. Calculate Shots against (negated so higher is better)
    new_df["EV_Shots_Against"] = -1 * (full_df["EV_OnIce_A_shotAttempts"])

    # 17. EV High Danger Chances Against
    new_df["EV_HighDangerAgainst"] = -1 * (full_df["EV_OnIce_A_highDangerxGoals"] + full_df["EV_OnIce_A_highDangerShots"])

    # 18. PK High Danger Chances Against
    new_df["PK_HighDangerAgainst"] = -1 * (full_df["PK_OnIce_A_highDangerxGoals"] + full_df["PK_OnIce_A_highDangerShots"])

    # 19. High Danger chances for
    new_df["High_Danger_Chances_For"] = (full_df["EV_I_F_highDangerxGoals"] + full_df["PP_I_F_highDangerxGoals"] + full_df["PK_I_F_highDangerxGoals"]
                                         + full_df["EV_I_F_highDangerShots"] + full_df["PP_I_F_highDangerShots"] + full_df["PK_I_F_highDangerShots"])

    # 20. Calculate gameScore
    new_df["gameScore"] = full_df["EV_gameScore"] + full_df["PP_gameScore"] + full_df["PK_gameScore"]

    # 21. Calculate even strength defensive impact: on-ice minus off-ice xGoals against
    new_df["EV_Defensive_Impact"] = (full_df["EV_OnIce_A_xGoals"] - full_df["EV_OffIce_A_xGoals"])

    return new_df

# Calculate the above average stats for forwards and defence.
# The six input frames are the ones restored with `%store -r` in the first cell.
modelF = calculateAboveAverageStats(evenStrengthF, powerPlayF, penaltyKillF)
modelD = calculateAboveAverageStats(evenStrengthD, powerPlayD, penaltyKillD)

  PK_data.iloc[:, 9:] = PK_data.iloc[:, 9:].div(PK_data.PK_icetime, axis=0)


In [136]:
# Calculate the important "above-average" stats for goaltenders
def calculateAboveAverageGoalieStats(evenStrength, powerPlay, penaltyKill,
                                     min_ev_icetime=5000, min_pp_icetime=3000, min_pk_icetime=3000):
    """Build one row of "above-average" goalie stats per even-strength goalie row.

    For each situation the function adds goals-saved-above-expected columns
    (``xGoals - goals`` overall and per danger level), converts every stat to a
    per-second rate, subtracts the mean rate of the goalies that pass the ice-time
    filter, and multiplies back by the goalie's own ice time. The three situations
    are then left-merged onto the even-strength rows and summed.

    Positional layout the slicing below relies on (the same for all three frames):
    columns 0-4 are the identifiers playerId, season, name, team, position;
    column 5 is dropped; column 6 must be ``icetime``; every later column is
    treated as a stat to be rate-adjusted.

    Parameters
    ----------
    evenStrength, powerPlay, penaltyKill : pandas.DataFrame
        Per-situation goalie stats. The inputs are not modified.
    min_ev_icetime, min_pp_icetime, min_pk_icetime : number, optional
        A row is kept only if its ``icetime`` is strictly greater than the
        threshold for its situation (defaults: 5000, 3000, 3000).

    Returns
    -------
    pandas.DataFrame
        Identifier columns, the three ice times, and GSAE, Rebound_Control,
        Low_Danger, Medium_Danger and High_Danger. Goalies with no qualifying
        power-play / penalty-kill row contribute 0 for that situation.
    """
    id_cols = ['playerId', 'season', 'name', 'team', 'position']

    # Filter each type of game situation so that each player plays at least x seconds to avoid outliers
    evenStrength = evenStrength.loc[evenStrength.icetime > min_ev_icetime].copy()
    powerPlay = powerPlay.loc[powerPlay.icetime > min_pp_icetime].copy()
    penaltyKill = penaltyKill.loc[penaltyKill.icetime > min_pk_icetime].copy()

    # Calculate GSAE, Low_Danger, Medium_Danger, and High_Danger stats for all situations.
    # Each is expected goals minus actual goals, so positive means fewer goals allowed than expected.
    for situation in (evenStrength, powerPlay, penaltyKill):
        situation["GSAE"] = situation["xGoals"] - situation["goals"]
        situation["Low_Danger"] = situation["lowDangerxGoals"] - situation["lowDangerGoals"]
        situation["Medium_Danger"] = situation["mediumDangerxGoals"] - situation["mediumDangerGoals"]
        situation["High_Danger"] = situation["highDangerxGoals"] - situation["highDangerGoals"]

    # Split Names and data for each game situation
    EV_name, EV_data = evenStrength.iloc[:, :5], evenStrength.iloc[:, 6:]
    PP_name, PP_data = powerPlay.iloc[:, :5], powerPlay.iloc[:, 6:]
    PK_name, PK_data = penaltyKill.iloc[:, :5], penaltyKill.iloc[:, 6:]

    # Add prefixes to differentiate stats. Casting to float up front keeps the in-place
    # ``iloc`` assignments below dtype-compatible even when a stat column is stored as integers.
    EV_data = EV_data.add_prefix("EV_").astype(float)
    PP_data = PP_data.add_prefix("PP_").astype(float)
    PK_data = PK_data.add_prefix("PK_").astype(float)

    # Divide every goalie's total stats by their seconds played
    EV_data.iloc[:, 1:] = EV_data.iloc[:, 1:].div(EV_data.EV_icetime, axis=0)
    PP_data.iloc[:, 1:] = PP_data.iloc[:, 1:].div(PP_data.PP_icetime, axis=0)
    PK_data.iloc[:, 1:] = PK_data.iloc[:, 1:].div(PK_data.PK_icetime, axis=0)

    # Calculate the average player
    EV_avg = EV_data.mean().drop('EV_icetime')
    PP_avg = PP_data.mean().drop("PP_icetime")
    PK_avg = PK_data.mean().drop("PK_icetime")

    # Calculate the "above-average" versions of the statistics
    EV_data.iloc[:, 1:] = (EV_data.iloc[:, 1:] - EV_avg).mul(EV_data.EV_icetime, axis=0)
    PP_data.iloc[:, 1:] = (PP_data.iloc[:, 1:] - PP_avg).mul(PP_data.PP_icetime, axis=0)
    PK_data.iloc[:, 1:] = (PK_data.iloc[:, 1:] - PK_avg).mul(PK_data.PK_icetime, axis=0)

    # Merge the 3 situations into one dataframe
    evenStrength = pd.concat([EV_name, EV_data], axis=1)
    powerPlay = pd.concat([PP_name, PP_data], axis=1)
    penaltyKill = pd.concat([PK_name, PK_data], axis=1)

    # Left merges: every even-strength row is kept; missing PP/PK stats become 0
    full_df = pd.merge(evenStrength, powerPlay, how='left', on=id_cols)
    full_df = pd.merge(full_df, penaltyKill, how='left', on=id_cols)
    full_df = full_df.fillna(0)

    new_df = full_df[id_cols + ['EV_icetime', 'PP_icetime', 'PK_icetime']].copy()

    # Calculate the total above average versions of the stats
    new_df["GSAE"] = full_df["EV_GSAE"] + full_df["PP_GSAE"] + full_df["PK_GSAE"]
    new_df["Rebound_Control"] = full_df["EV_rebounds"] + full_df["PP_rebounds"] + full_df["PK_rebounds"]
    new_df["Low_Danger"] = full_df["EV_Low_Danger"] + full_df["PP_Low_Danger"] + full_df["PK_Low_Danger"]
    new_df["Medium_Danger"] = full_df["EV_Medium_Danger"] + full_df["PP_Medium_Danger"] + full_df["PK_Medium_Danger"]
    new_df["High_Danger"] = full_df["EV_High_Danger"] + full_df["PP_High_Danger"] + full_df["PK_High_Danger"]

    return new_df

# Calculate the "above-average" stats for goalies.
# The three input frames are the ones restored with `%store -r` in the first cell.
modelG = calculateAboveAverageGoalieStats(evenStrengthG, powerPlayG, penaltyKillG)


In [137]:
# Calculate team strength, which is the sum of each above average statistic
def get_team_strength_above_avg(modelF, modelD, modelG=pd.DataFrame()):
    """Stack the player tables and sum every remaining stat column per team.

    The identifier and ice-time columns are removed before grouping, so the
    result is indexed by ``team`` and holds one summed column per statistic.
    A stat that is missing for every row of a team sums to 0.
    """
    non_stat_columns = ["playerId", "season", "name", "position", "EV_icetime", "PP_icetime", "PK_icetime"]
    stacked = pd.concat([modelF, modelD, modelG])
    per_player_stats = stacked.drop(columns=non_stat_columns)
    return per_player_stats.groupby(["team"]).sum()

Split the data

In [138]:
def get_split_data(X, y):
    """Split features and target 75% / 25% into train and test sets.

    The split is seeded with ``random_state=20`` so it is reproducible.
    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    return tuple(train_test_split(X, y, test_size=0.25, train_size=0.75, random_state=20))

In [139]:
# Calculate team strength: per-team sums of the forward, defence and goalie above-average stats
team_strength = get_team_strength_above_avg(modelF, modelD, modelG)

# Team points, hard-coded per team: regular-season points (2023-2024) and playoff points.
# The three lists are positional, so entry i of each list belongs to team i.
team_points = {"team": ["NYR", "DAL", "CAR", "WPG", "FLA", "VAN", "BOS", "COL", "EDM", "TOR", "NSH", "LAK", "TBL",
                        "VGK", "NYI", "STL", "WSH", "DET", "PIT", "MIN", "PHI", "BUF", 'NJD', "CGY", "SEA", "OTT",
                        "ARI", "MTL", "CBJ", "ANA", "CHI", "SJS"],
                "regular_season_points": [114, 113, 111, 110, 110, 109, 109, 107, 104, 102, 99, 99, 98, 98, 94, 92, 91, 91, 88, 87, 87, 84, 81, 81, 81, 78, 77, 76, 66, 59, 52, 47],
                "playoff_points": [20, 20, 12, 2, 32, 14, 12, 12, 30, 6, 4, 2, 2, 6, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
                }

# Calculate the average regular-season points (used later to scale the player ratings)
avg_points = sum(team_points["regular_season_points"]) / len(team_points["regular_season_points"])

print(avg_points)

# Attach the points to the team-strength table; the regression target is regular-season + playoff points
team_points = pd.DataFrame.from_dict(team_points)
team_points["total_points"] = team_points["regular_season_points"] + team_points["playoff_points"]
team_strength = pd.merge(team_strength, team_points, on=["team"])
# Writes into ./pwaaRankings, which must already exist
team_strength.to_csv('./pwaaRankings/teamStrength.csv', index=False)
transformed_training_set = team_strength.copy()
# Model inputs: eight skater stats, three goalie stats, and the target as the last column
team_strength_train = transformed_training_set[["Individual_Goals", "Individual_primaryAssists", "EV_Chances", "PP_Chances", "Penalty_Differential", 
                                                "EV_xGoals_Against", "PK_xGoals_Against", 
                                                "EV_Shots_Against", "GSAE",
                                                "Low_Danger", "High_Danger", "total_points"]]

# Scale the data and set the team points as our target variable.
# The transformer is fitted on features and target together, so the target is power-transformed too.
trans = transformer(team_strength_train)
target = (trans["total_points"]).to_numpy()

data = trans.drop(columns=["total_points"]).to_numpy()

# Hold-out split; the cross-validation further down is run on the full `data` / `target` instead
X_train, X_test, y_train, y_test = get_split_data(data, target)

print(data.shape)
print(target.shape)




90.46875
(32, 11)
(32,)


Ridge regression objective: minimize $\lVert y - Xw \rVert^2 + \alpha \lVert w \rVert^2$

Closed-form solution: $w = (X^\top X + \alpha I)^{-1} X^\top y$

In [140]:
# Calculate the ridge regression weights based on the minimization function above and the derived formula for the weights
def get_ridge_regression_params(X, y, alpha):
    """Return the closed-form ridge regression weights ``(X^T X + alpha * I)^-1 X^T y``.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_features)
    y : numpy.ndarray of shape (n_samples,)
    alpha : float
        L2 penalty strength. No intercept term is fitted.

    Returns
    -------
    numpy.ndarray of shape (n_features,)
    """
    gram = X.T @ X
    penalised_gram = gram + alpha * np.identity(gram.shape[0])
    # Solve the linear system directly instead of forming the inverse:
    # same weights, but cheaper and numerically more accurate.
    return np.linalg.solve(penalised_gram, X.T @ y)

# Ridge weights for the training split with alpha=5; the array is shown as the cell output
get_ridge_regression_params(X_train, y_train, 5)

array([0.19079252, 0.1845149 , 0.09272865, 0.13882376, 0.09149657,
       0.23409686, 0.08623812, 0.0789853 , 0.19261909, 0.14344997,
       0.04719806])

In [126]:
# Return the mean squared error
def get_mean_squared_error(y_truth, y_pred):
    """Average of the squared element-wise differences between truth and prediction."""
    residuals = y_truth - y_pred
    return np.mean(np.square(residuals))

In [146]:
# Complete an n-fold cross validation and return the most accurate weights
def n_fold_cross_validation(X, y, n_splits=15, alpha=12):
    """Fit ridge weights on each K-fold training split and keep the best-scoring set.

    For every fold the weights are fitted on the training part and scored by the
    mean squared error on the held-out part; the weights of the fold with the
    lowest held-out error are returned. The best error and weights are printed.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_features)
    y : numpy.ndarray of shape (n_samples,)
    n_splits : int, optional
        Number of folds passed to ``KFold`` (default 15).
    alpha : float, optional
        Ridge penalty passed to ``get_ridge_regression_params`` (default 12).

    Returns
    -------
    numpy.ndarray of shape (n_features,)

    Raises
    ------
    ValueError
        If no fold produces a comparable (non-NaN) error.
    """
    partitions = KFold(n_splits=n_splits)

    # Start from +inf so the first valid fold always wins, whatever the scale of the target
    best_mse = float("inf")
    best_w = None

    for train, test in partitions.split(X, y):
        # Split the data
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

        # Calculate the weights based on the trained data
        w = get_ridge_regression_params(X_train, y_train, alpha)

        # Predict the held-out targets
        y_pred = X_test @ w

        # Calculate the mean squared error
        mse = get_mean_squared_error(y_test, y_pred)

        # Check to see which are the best weights
        if mse < best_mse:
            best_mse, best_w = mse, w

    if best_w is None:
        raise ValueError("n_fold_cross_validation: no fold produced a valid mean squared error")

    print(best_mse)
    print(best_w)

    return best_w


# Fit the weights on the full scaled feature matrix and target
weights = n_fold_cross_validation(data, target)

# Bare list shown as the cell output, for reading the printed weights against the feature names.
# NOTE(review): the last entry is not a feature (there are 11 weights) and its name differs from the
# `total_points` column used when the training frame is built - confirm it is just a stale label.
["Individual_Goals", "Individual_primaryAssists", "EV_Chances", "PP_Chances", "Penalty_Differential", 
                                                "EV_xGoals_Against", "PK_xGoals_Against", 
                                                "EV_Shots_Against", "GSAE",
                                                "Low_Danger", "High_Danger", "team_points_above_avg"]



0.0017832394355512754
[0.15710837 0.18700035 0.07896175 0.15226094 0.08259379 0.18942745
 0.07700952 0.08218665 0.14194735 0.19054812 0.08303829]


['Individual_Goals',
 'Individual_primaryAssists',
 'EV_Chances',
 'PP_Chances',
 'Penalty_Differential',
 'EV_xGoals_Against',
 'PK_xGoals_Against',
 'EV_Shots_Against',
 'GSAE',
 'Low_Danger',
 'High_Danger',
 'team_points_above_avg']

In [144]:
# Calculate the PWAA based on the weights: dot product of each team's scaled features with the weights
transformed_training_set["PWAA"] = trans.drop(columns=["total_points"]).dot(weights)
# Overwrites the teamStrength.csv written earlier, now including the PWAA column
transformed_training_set.to_csv('./pwaaRankings/teamStrength.csv', index=False)

In [114]:
# Calculate the PWAA, offence, and defence stats for forwards and defencemen, and the PWAA for goalies
skater_stats = ["Individual_Goals", "Individual_primaryAssists", "EV_Chances", "PP_Chances", "Penalty_Differential",
                "EV_xGoals_Against", "PK_xGoals_Against", "EV_Shots_Against"]
offence = ["Individual_Goals", "Individual_primaryAssists", "EV_Chances", "PP_Chances"]
defence = ["EV_xGoals_Against", "PK_xGoals_Against", "EV_Shots_Against"]
goalie_stats = ["GSAE", "Low_Danger", "High_Danger"]

# `weights` follows the training-feature order: the eight skater stats, then the three goalie stats
skater_coef, goalie_coef = weights[:8], weights[8:]
# Index 4 (Penalty_Differential) counts towards the overall PWAA only, not offence or defence
offence_coef, defence_coef = skater_coef[:4], skater_coef[5:]

# Common factor applied to every weighted score
points_scale = avg_points / 20

# Scale the stat columns of each table (everything after the 8 identifier / ice-time columns)
scaledModelF = transformer(modelF.iloc[:, 8:])
scaledModelD = transformer(modelD.iloc[:, 8:])
scaledModelG = transformer(modelG.iloc[:, 8:])


def _add_skater_ratings(model, scaled_model):
    """Add the offence, defence and PWAA scores plus percentile rankings to ``model`` in place.

    ``scaled_model`` is the power-transformed version of the stat columns of
    ``model``; the scores are weighted sums of its columns, while the penalty,
    physicality, production and finishing rankings use the unscaled columns.
    """
    model["Offence"] = scaled_model[offence].dot(offence_coef) * points_scale
    model["Offence_Ranking"] = get_percentile_rank(model["Offence"])
    model["Defence"] = scaled_model[defence].dot(defence_coef) * points_scale
    model["Defence_Ranking"] = get_percentile_rank(model["Defence"])
    model["Penalty_Ranking"] = get_percentile_rank(model["Penalty_Differential"])
    model["Physicality_Ranking"] = get_percentile_rank(model["Physicality"])
    model["Production_Ranking"] = get_percentile_rank(model["Production"])
    model["Finishing_Ranking"] = get_percentile_rank(model["Finishing"])
    model["PWAA"] = scaled_model[skater_stats].dot(skater_coef) * points_scale
    model["PWAA_Ranking"] = get_percentile_rank(model["PWAA"])


# Forwards and defencemen are rated identically, each against their own position group
_add_skater_ratings(modelF, scaledModelF)
_add_skater_ratings(modelD, scaledModelD)

modelG["PWAA"] = scaledModelG[goalie_stats].dot(goalie_coef) * points_scale
modelG[["GSAE_Ranking", "Low_Danger_Ranking", "Medium_Danger_Ranking", "High_Danger_Ranking", "Rebound_Control_Ranking"]] = get_percentile_rank(modelG[["GSAE", "Low_Danger", "Medium_Danger", "High_Danger", "Rebound_Control"]])
modelG["PWAA_Ranking"] = get_percentile_rank(modelG["PWAA"])

# Writes into ./pwaaRankings, which must already exist
modelF.to_csv("./pwaaRankings/aboveAvgF.csv", index=False)
modelD.to_csv("./pwaaRankings/aboveAvgD.csv", index=False)
modelG.to_csv("./pwaaRankings/aboveAvgG.csv", index=False)

In [115]:
# Add data to the Postgres database.
# The connection URL contains the database password, so it is read from the environment
# instead of being stored in the notebook, e.g.
#   PWAA_DATABASE_URL=postgresql://<user>:<password>@127.0.0.1:5432/hockey_data
database_url = os.environ.get("PWAA_DATABASE_URL")
if not database_url:
    raise RuntimeError(
        "Set the PWAA_DATABASE_URL environment variable to the Postgres connection URL "
        "before running this cell."
    )

engine = create_engine(database_url)

# Each table in the "PWAA" schema is replaced with the freshly computed frame
pwaa_tables = (
    ("PWAA_forwards", modelF),
    ("PWAA_defence", modelD),
    ("PWAA_goalies", modelG),
    ("PWAA_teams", transformed_training_set),
)
for table_name, frame in pwaa_tables:
    frame.to_sql(table_name, engine, if_exists="replace", index=False, schema="PWAA")

32