In [1]:
#import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import jenkspy

#import and read cleaned_2022
import pandas as pd
players_df = pd.read_csv("model_2022-2023.csv")
players_df.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,position,games_played,icetime,minutes/game,onIce_xGoalsPercentage,onIce_corsiPercentage,onIce_fenwickPercentage,iceTimeRank,...,OnIce_A_scoreAdjustedShotsAttempts,OnIce_A_unblockedShotAttempts,OnIce_A_scoreAdjustedUnblockedShotAttempts,OnIce_A_xGoalsFromxReboundsOfShots,OnIce_A_xGoalsFromActualReboundsOfShots,OnIce_A_reboundxGoals,OnIce_A_xGoals_with_earned_rebounds,OnIce_A_xGoals_with_earned_rebounds_scoreAdjusted,OnIce_A_xGoals_with_earned_rebounds_scoreFlurryAdjusted,Line_or_Pair
0,0,0,R,73,40825,9.32,0.45,0.45,0.46,814,...,674.73,491,489.13,4.67,6.2,6.2,27.06,26.92,26.43,3rd line
1,1,1,D,68,61236,15.01,0.39,0.42,0.41,329,...,1104.67,863,864.75,10.65,15.01,14.72,62.63,62.82,60.81,3rd pair
2,2,2,D,61,47720,13.04,0.44,0.45,0.46,338,...,791.6,562,565.64,5.69,5.62,5.62,34.16,34.3,33.73,3rd pair
3,3,3,D,42,35331,14.02,0.56,0.53,0.52,242,...,508.74,378,379.33,3.76,7.03,7.03,21.02,21.1,20.84,3rd pair
4,4,4,C,28,26877,16.0,0.52,0.46,0.46,139,...,425.03,316,313.93,3.19,2.47,2.47,20.77,20.86,20.65,2nd line


In [2]:
#drop unnamed columns
players_df.drop(['Unnamed: 0', 'Unnamed: 1'], axis='columns', inplace=True)
players_df.head()

Unnamed: 0,position,games_played,icetime,minutes/game,onIce_xGoalsPercentage,onIce_corsiPercentage,onIce_fenwickPercentage,iceTimeRank,I_F_xOnGoal,I_F_xGoals,...,OnIce_A_scoreAdjustedShotsAttempts,OnIce_A_unblockedShotAttempts,OnIce_A_scoreAdjustedUnblockedShotAttempts,OnIce_A_xGoalsFromxReboundsOfShots,OnIce_A_xGoalsFromActualReboundsOfShots,OnIce_A_reboundxGoals,OnIce_A_xGoals_with_earned_rebounds,OnIce_A_xGoals_with_earned_rebounds_scoreAdjusted,OnIce_A_xGoals_with_earned_rebounds_scoreFlurryAdjusted,Line_or_Pair
0,R,73,40825,9.32,0.45,0.45,0.46,814,49.47,6.7,...,674.73,491,489.13,4.67,6.2,6.2,27.06,26.92,26.43,3rd line
1,D,68,61236,15.01,0.39,0.42,0.41,329,37.1,1.36,...,1104.67,863,864.75,10.65,15.01,14.72,62.63,62.82,60.81,3rd pair
2,D,61,47720,13.04,0.44,0.45,0.46,338,69.79,2.44,...,791.6,562,565.64,5.69,5.62,5.62,34.16,34.3,33.73,3rd pair
3,D,42,35331,14.02,0.56,0.53,0.52,242,32.17,1.21,...,508.74,378,379.33,3.76,7.03,7.03,21.02,21.1,20.84,3rd pair
4,C,28,26877,16.0,0.52,0.46,0.46,139,39.69,6.81,...,425.03,316,313.93,3.19,2.47,2.47,20.77,20.86,20.65,2nd line


In [3]:
#value count the I_F_points column to see if we need to bin the data
points_count = players_df['I_F_points'].value_counts()
points_count

3      26
1      24
14     24
8      23
16     22
       ..
85      1
84      1
71      1
77      1
153     1
Name: I_F_points, Length: 102, dtype: int64

In [4]:
#use jenks_breaks to find the natural breaks in the data, these breaks will become our bin values
point_breaks = jenkspy.jenks_breaks(players_df['I_F_points'], n_classes=10)
point_breaks

[0, 7, 15, 24, 34, 46, 58, 70, 87, 113, 153]

In [5]:
#create function to cycle through the data and place them in the bins defined in the previous step
def point_bucket(column):
    if column['I_F_points'] < 7:
        return 0.1
    elif column['I_F_points'] < 15:
        return 0.2
    elif column['I_F_points'] < 24:
        return 0.3
    elif column['I_F_points'] < 34:
        return 0.4
    elif column['I_F_points'] < 46:
        return 0.5
    elif column['I_F_points'] < 58:
        return 0.6
    elif column['I_F_points'] < 70:
        return 0.7
    elif column['I_F_points'] < 87:
        return 0.8
    elif column['I_F_points'] < 113:
        return 0.9
    else:
        return 1
    

# Apply the function to create the new column
players_df['point_bucket'] = players_df.apply(point_bucket, axis=1)

#value count to see the distribution of the data in your bins
players_df['point_bucket'].value_counts()

0.3    142
0.2    137
0.1    129
0.4    116
0.5     97
0.6     53
0.8     40
0.7     40
0.9     16
1.0      4
Name: point_bucket, dtype: int64

In [6]:
players_df.drop(['I_F_points'], axis='columns', inplace=True)
players_df.head()

Unnamed: 0,position,games_played,icetime,minutes/game,onIce_xGoalsPercentage,onIce_corsiPercentage,onIce_fenwickPercentage,iceTimeRank,I_F_xOnGoal,I_F_xGoals,...,OnIce_A_unblockedShotAttempts,OnIce_A_scoreAdjustedUnblockedShotAttempts,OnIce_A_xGoalsFromxReboundsOfShots,OnIce_A_xGoalsFromActualReboundsOfShots,OnIce_A_reboundxGoals,OnIce_A_xGoals_with_earned_rebounds,OnIce_A_xGoals_with_earned_rebounds_scoreAdjusted,OnIce_A_xGoals_with_earned_rebounds_scoreFlurryAdjusted,Line_or_Pair,point_bucket
0,R,73,40825,9.32,0.45,0.45,0.46,814,49.47,6.7,...,491,489.13,4.67,6.2,6.2,27.06,26.92,26.43,3rd line,0.3
1,D,68,61236,15.01,0.39,0.42,0.41,329,37.1,1.36,...,863,864.75,10.65,15.01,14.72,62.63,62.82,60.81,3rd pair,0.2
2,D,61,47720,13.04,0.44,0.45,0.46,338,69.79,2.44,...,562,565.64,5.69,5.62,5.62,34.16,34.3,33.73,3rd pair,0.1
3,D,42,35331,14.02,0.56,0.53,0.52,242,32.17,1.21,...,378,379.33,3.76,7.03,7.03,21.02,21.1,20.84,3rd pair,0.2
4,C,28,26877,16.0,0.52,0.46,0.46,139,39.69,6.81,...,316,313.93,3.19,2.47,2.47,20.77,20.86,20.65,2nd line,0.3


In [7]:
#convert categorical data into numeric with 'get_dummies'
players_df = pd.get_dummies(players_df)
players_df.head()

Unnamed: 0,games_played,icetime,minutes/game,onIce_xGoalsPercentage,onIce_corsiPercentage,onIce_fenwickPercentage,iceTimeRank,I_F_xOnGoal,I_F_xGoals,I_F_xRebounds,...,position_C,position_D,position_L,position_R,Line_or_Pair_1st line,Line_or_Pair_1st pair,Line_or_Pair_2nd line,Line_or_Pair_2nd pair,Line_or_Pair_3rd line,Line_or_Pair_3rd pair
0,73,40825,9.32,0.45,0.45,0.46,814,49.47,6.7,3.8,...,0,0,0,1,0,0,0,0,1,0
1,68,61236,15.01,0.39,0.42,0.41,329,37.1,1.36,1.83,...,0,1,0,0,0,0,0,0,0,1
2,61,47720,13.04,0.44,0.45,0.46,338,69.79,2.44,3.85,...,0,1,0,0,0,0,0,0,0,1
3,42,35331,14.02,0.56,0.53,0.52,242,32.17,1.21,1.81,...,0,1,0,0,0,0,0,0,0,1
4,28,26877,16.0,0.52,0.46,0.46,139,39.69,6.81,2.85,...,1,0,0,0,0,0,1,0,0,0


In [8]:
#split data into features and target arrays
y = players_df.point_bucket.values
X = players_df.drop(columns='point_bucket').values

#split data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [9]:
#create standardscaler instance
scaler = StandardScaler()

#fit the standardscaler
X_scaler = scaler.fit(X_train)

#scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=20, activation="relu", input_dim=number_input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=20, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=20, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=20, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=20, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=20, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 20)                2860      
                                                                 
 dense_1 (Dense)             (None, 20)                420       
                                                                 
 dense_2 (Dense)             (None, 20)                420       
                                                                 
 dense_3 (Dense)             (None, 20)                420       
                                                                 
 dense_4 (Dense)             (None, 20)                420       
                                                                 
 dense_5 (Dense)             (None, 20)                420       
                                                                 
 dense_6 (Dense)             (None, 1)                 2

In [11]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [12]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [13]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

7/7 - 0s - loss: 0.5578 - accuracy: 0.0052 - 91ms/epoch - 13ms/step
Loss: 0.5578039288520813, Accuracy: 0.005154639016836882
