### Import Libraries

In [117]:
import numpy as np
import pandas as pd

import statsmodels.api as sm

from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder

### Load Dataset

In [17]:
# Read the datathon CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the file.
dataset_path = "/Users/yexuanshen/Desktop/UTMMA/UTMMA_Datathon/Rotman MMA Summer Datathon NWHL.csv"
df = pd.read_csv(dataset_path)

In [18]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,game_date,Home Team,Away Team,Period,Clock,Home Team Skaters,Away Team Skaters,Home Team Goals,Away Team Goals,Team,...,Event,X Coordinate,Y Coordinate,Detail 1,Detail 2,Detail 3,Detail 4,Player 2,X Coordinate 2,Y Coordinate 2
0,2021-01-23,Minnesota Whitecaps,Boston Pride,1,20:00,5,5,0,0,Boston Pride,...,Faceoff Win,100,43,Backhand,,,,Stephanie Anderson,,
1,2021-01-23,Minnesota Whitecaps,Boston Pride,1,19:58,5,5,0,0,Boston Pride,...,Puck Recovery,107,40,,,,,,,
2,2021-01-23,Minnesota Whitecaps,Boston Pride,1,19:57,5,5,0,0,Boston Pride,...,Zone Entry,125,28,Carried,,,,Maddie Rowe,,
3,2021-01-23,Minnesota Whitecaps,Boston Pride,1,19:55,5,5,0,0,Boston Pride,...,Shot,131,28,Snapshot,On Net,t,f,,,
4,2021-01-23,Minnesota Whitecaps,Boston Pride,1,19:53,5,5,0,0,Boston Pride,...,Faceoff Win,169,21,Backhand,,,,Stephanie Anderson,,


-------

### Preprocessing from Alvin's `draft.ipynb`

Consider the "Event" column only, instead of combining "Event" with "Detail 1".

In [97]:
# Build the modelling frame from the raw events: add a per-game id, a binary
# "Goal Target" label, and one indicator column per event type.
TEST_new_df = df.copy()

# Number the games 1..N in order of first appearance; a game is identified by
# its (game_date, Home Team, Away Team) triple. Placed as the first column.
game_ids = df.groupby(['game_date', 'Home Team', 'Away Team'], sort=False).ngroup() + 1
TEST_new_df.insert(0, 'game_id', game_ids)

# "Goal Target" is 1 on the row immediately preceding a 'Goal' event, else 0.
# shift(-1) exposes the next row's event on the current row, so the whole
# column is computed at once. Unlike a row-by-row `.at[i-1, ...]` write, a
# 'Goal' on the very first row has no preceding row and is simply ignored
# (the label-based write would have appended a spurious row labelled -1).
next_event = TEST_new_df['Event'].shift(-1)
TEST_new_df['Goal Target'] = (next_event == 'Goal').astype(int)

# One indicator column per distinct event type, named after the event.
LB = LabelBinarizer()
labels = pd.DataFrame(
    LB.fit_transform(TEST_new_df['Event']),
    columns=LB.classes_,
    index=TEST_new_df.index,  # align on the source rows explicitly
)
TEST_new_df = pd.concat([TEST_new_df, labels], axis=1)

# Persist the engineered frame, then show a preview as the cell output.
TEST_new_df.to_csv('TEST_new_df.csv', index=False)

TEST_new_df.head()

In [89]:
# Target: the "Goal Target" label. Features: the event indicator columns,
# i.e. every column that sits after "Goal Target" in the frame.
# Selecting by name keeps this correct if columns are added to or removed
# from the raw data (hard-coded positions would silently shift).
# Note: the 'Goal' indicator is kept among the features here.
target_pos = TEST_new_df.columns.get_loc('Goal Target')

y2 = TEST_new_df['Goal Target']
x2 = TEST_new_df.iloc[:, target_pos + 1:]

x2.columns

Index(['Dump In/Out', 'Faceoff Win', 'Goal', 'Incomplete Play',
       'Penalty Taken', 'Play', 'Puck Recovery', 'Shot', 'Takeaway',
       'Zone Entry'],
      dtype='object')

------------------------------------------------------------------------------------------------------

### Split training and test sets

In [108]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split repeatable.
train, test = train_test_split(
    TEST_new_df,
    test_size=0.3,
    random_state=42,
)

# Logistic Regression using statsmodels

- LR in sklearn

In [119]:
# Re-create the 70/30 split (same seed as before) and fit a scikit-learn
# logistic regression of "Goal Target" on the event indicator columns.
train, test = train_test_split(
    TEST_new_df,
    test_size=0.3,
    random_state=42,
)

Y_train = train['Goal Target']
X_train = train.loc[:, [
    'Dump In/Out', 'Goal', 'Incomplete Play', 'Penalty Taken',
    'Play', 'Puck Recovery', 'Shot', 'Takeaway', 'Zone Entry', 'Faceoff Win',
]]

# fit() returns the estimator itself, so LR is the fitted model.
LR = LogisticRegression().fit(X_train, Y_train)

# One coefficient per feature, in the column order of X_train.
print(LR.coef_)

[[-0.8864907  -0.08707315 -1.18195158 -0.14060616  1.10072918  0.66385079
   0.25150056 -0.16185597  0.98690904 -0.54562116]]


--------------

### using training set to fit model

In [121]:
# Fit a statsmodels Logit of "Goal Target" on the event indicators,
# using the training split only.
Y_train = train['Goal Target']
X_train = train.loc[:, [
    'Dump In/Out', 'Goal', 'Incomplete Play', 'Penalty Taken',
    'Play', 'Puck Recovery', 'Shot', 'Takeaway', 'Zone Entry', 'Faceoff Win',
]]

# TODO: with test_size = 0.2 this fit raised an error involving "Faceoff Win";
# with test_size = 0.3 it runs without error.
# NOTE(review): the summary below reports `converged: False` with very large
# standard errors -- possibly (quasi-)separation in the indicators; confirm.

log_reg = sm.Logit(Y_train, X_train).fit()

print(log_reg.summary())

         Current function value: 0.019329
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:            Goal Target   No. Observations:                18817
Model:                          Logit   Df Residuals:                    18807
Method:                           MLE   Df Model:                            9
Date:                Sun, 05 Jun 2022   Pseudo R-squ.:                 0.04709
Time:                        19:18:20   Log-Likelihood:                -363.70
converged:                      False   LL-Null:                       -381.68
Covariance Type:            nonrobust   LLR p-value:                 4.053e-05
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Dump In/Out       -22.0364   1680.792     -0.013      0.990   -3316.328    3272.255
Goal              -17.4552    810.295     -0.022  



- performing predictions on the test dataset

In [114]:
# Dependent variable and feature matrix for the held-out test split
# (same feature columns, in the same order, as used for fitting).
Y_test = test['Goal Target']
X_test = test.loc[:, [
    'Dump In/Out', 'Goal', 'Incomplete Play', 'Penalty Taken',
    'Play', 'Puck Recovery', 'Shot', 'Takeaway', 'Zone Entry', 'Faceoff Win',
]]

# Predicted probabilities on the test dataset, rounded to 0/1 class labels.
yhat = log_reg.predict(X_test)
prediction = [round(p) for p in yhat]

# comparing original and predicted values of y
#print('Actual values', list(Y_test.values))
#print('Predictions :', prediction)

- testing accuracy of model: (confusion matrix)

In [115]:
# Evaluate the thresholded test predictions against the true labels.
# confusion_matrix and accuracy_score come from the notebook's import cell.

# Confusion matrix: rows are the actual classes, columns the predicted classes.
cm = confusion_matrix(Y_test, prediction)
print("Confusion Matrix : \n", cm)

# Accuracy score of the model.
# NOTE: the classes are heavily imbalanced, so accuracy is dominated by the
# majority (no-goal) class; read it together with the confusion matrix, which
# shows whether any goal rows are actually predicted.
print('Test accuracy = ', accuracy_score(Y_test, prediction))

Confusion Matrix : 
 [[8045    0]
 [  20    0]]
Test accuracy =  0.9975201487910725


---------

### using the full dataset to fit model

In [109]:
# Fit the same statsmodels Logit specification on every row (no train/test split).
Y = TEST_new_df['Goal Target']
X = TEST_new_df.loc[:, [
    'Dump In/Out', 'Goal', 'Incomplete Play', 'Penalty Taken',
    'Play', 'Puck Recovery', 'Shot', 'Takeaway', 'Zone Entry', 'Faceoff Win',
]]

# No issue with "Faceoff Win" here when using the full dataset.
# Note: this rebinds `log_reg`, replacing the model fitted on the training split.

log_reg = sm.Logit(Y, X).fit()

print(log_reg.summary())

         Current function value: 0.018538
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:            Goal Target   No. Observations:                26882
Model:                          Logit   Df Residuals:                    26872
Method:                           MLE   Df Model:                            9
Date:                Sun, 05 Jun 2022   Pseudo R-squ.:                 0.04513
Time:                        18:50:54   Log-Likelihood:                -498.34
converged:                      False   LL-Null:                       -521.90
Covariance Type:            nonrobust   LLR p-value:                 3.753e-07
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Dump In/Out       -23.6999   3245.317     -0.007      0.994   -6384.404    6337.004
Goal              -19.0545   1574.829     -0.012  



----------