In [1]:
import pandas as pd
import numpy as np
from scipy.stats import norm

In [2]:
df = pd.read_csv('Cleaned_shot_data_01-2020--04-2021.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)

In [3]:
df.head()

Unnamed: 0,shot_type,shot_description,secondary_shot_type,game_date,opponent,player_name,player_id,period,period_time,shot_x_coordinate,shot_y_coordinate,home_game,head_coach,Location_Cluster,game_date_opponent,period_time_seconds,shifted_period_time,time_between_shots,rebound,goal_scored
0,Blocked Shot,Ryan Lindgren shot blocked shot by Matt Martin,,2021-01-14,New York Islanders,Ryan Lindgren,8479324,1,00:32,-72.0,10.0,Away,David Quinn,Right Circle,2021-01-14 vs. New York Islanders,32.0,63.0,31,0,0
1,Missed Shot,Artemi Panarin Wide of Net,,2021-01-14,New York Islanders,Artemi Panarin,8478550,1,01:03,-86.0,-11.0,Away,David Quinn,Slot,2021-01-14 vs. New York Islanders,63.0,247.0,184,0,0
2,Shot,Filip Chytil Wrist Shot saved by Semyon Varlamov,Wrist Shot,2021-01-14,New York Islanders,Filip Chytil,8480078,1,04:07,-84.0,6.0,Away,David Quinn,Slot,2021-01-14 vs. New York Islanders,247.0,330.0,83,0,0
3,Shot,Mika Zibanejad Slap Shot saved by Semyon Varlamov,Slap Shot,2021-01-14,New York Islanders,Mika Zibanejad,8476459,1,05:30,-70.0,17.0,Away,David Quinn,Right Circle,2021-01-14 vs. New York Islanders,330.0,441.0,111,0,0
4,Shot,Filip Chytil Wrist Shot saved by Semyon Varlamov,Wrist Shot,2021-01-14,New York Islanders,Filip Chytil,8480078,1,07:21,-60.0,-21.0,Away,David Quinn,Left Circle,2021-01-14 vs. New York Islanders,441.0,451.0,10,0,0


In [13]:
player_names = [player for player in df.player_name.unique()]
clusters = [cluster for cluster in df.Location_Cluster.unique()]
num_of_players = len(player_names)

shots_array = np.array([["shooter", "cluster", 'num_shots', "num_makes", "expected_val",
                         "Player_p", "Team_p", "Location_p"]])

for player in player_names:
    player_shots = df[df['player_name'] == player]

    for cluster in player_shots['Location_Cluster'].unique():
        player_shot_in_cluster = player_shots[player_shots['Location_Cluster'] == cluster]
        num_of_shots = len(player_shot_in_cluster)
        goals_scored = player_shot_in_cluster['goal_scored'].sum()
        
        expected_value_goal = goals_scored/num_of_shots
        # Player p-value
        avg_goal_per_shot = player_shots.goal_scored.sum() / len(player_shots)

        mu = num_of_shots * avg_goal_per_shot
        sigma = np.sqrt(mu*(1-avg_goal_per_shot))
        player_p = 1-norm.cdf(goals_scored, loc=mu, scale=sigma)
    
        # Team p-value
        avg_team_goal_per_shot = df.goal_scored.sum() / len(df)
        mu = num_of_shots * avg_team_goal_per_shot
        sigma = np.sqrt(mu*(1-avg_team_goal_per_shot))
        team_p = 1-norm.cdf(goals_scored, loc=mu, scale=sigma)
        
        # Location p-value
        avg_goal_per_location = df[df['Location_Cluster'] == cluster].goal_scored.sum() / len(df[df['Location_Cluster'] == cluster])
        mu = num_of_shots * avg_goal_per_location
        sigma = np.sqrt(mu*(1-avg_goal_per_location))
        location_p = 1-norm.cdf(goals_scored, loc=mu, scale=sigma)
        
        shots_array = np.append(shots_array, [[player, cluster, num_of_shots, goals_scored,
                                              expected_value_goal, player_p, team_p, location_p]], axis=0)
        



In [25]:
df_2 = pd.DataFrame(shots_array)
df_2.columns = df_2.iloc[0]
df_2 = df_2.iloc[1:, :]

In [29]:
df_2

Unnamed: 0,shooter,cluster,num_shots,num_makes,expected_val,Player_p,Team_p,Location_p
1,Ryan Lindgren,Right Circle,6,0,0.0,,0.7336561423302342,0.7083954461003916
2,Ryan Lindgren,Left Circle,17,0,0.0,,0.8531856281090275,0.8395785362865366
3,Ryan Lindgren,Slot,21,0,0.0,,0.8784404061775957,0.943106324212207
4,Ryan Lindgren,High Slot,9,0,0.0,,0.7776047143547695,0.7126859647052084
5,Ryan Lindgren,Right Point,5,0,0.0,,0.7155078062743347,0.5626613093282291
...,...,...,...,...,...,...,...,...
139,Libor Hajek,Left Point,5,0,0.0,0.6536836079790199,0.7155078062743347,0.6322689280834715
140,Libor Hajek,Left Circle,1,0,0.0,0.5701581024006669,0.6005262821729567,0.595133970103405
141,Tarmo Reunanen,Left Point,1,0,0.0,,0.6005262821729567,0.5600513752245466
142,Vitali Kravtsov,Left Point,2,0,0.0,,0.640656337601732,0.5846043945757227


In [32]:
numeric_cols = df_2.columns[-3:]

In [33]:
numeric_cols

Index(['Player_p', 'Team_p', 'Location_p'], dtype='object', name=0)

In [39]:
df_2 = df_2[df_2['Player_p'] != 'nan']

In [40]:
df_2[numeric_cols] = df_2[numeric_cols].apply(pd.to_numeric)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [44]:
df_2[df_2['Player_p'] <= 0.05].sort_values('expected_val', ascending=False)

Unnamed: 0,shooter,cluster,num_shots,num_makes,expected_val,Player_p,Team_p,Location_p
133,Jonny Brodzinski,Slot,1,1,1.0,0.007153,4.3e-05,0.001878935
102,Pavel Buchnevich,Left Circle,15,4,0.2666666666666666,0.012651,0.000432,0.0001556651
38,Alexis Lafrenière,Left Circle,13,3,0.2307692307692307,0.010269,0.00523,0.002651859
13,Filip Chytil,Slot,27,6,0.2222222222222222,0.027443,0.000229,0.02550275
7,Artemi Panarin,Slot,35,6,0.1714285714285714,0.030831,0.003136,0.1061404
44,K'Andre Miller,Right Point,6,1,0.1666666666666666,0.039265,0.139432,8.310082e-09
47,K'Andre Miller,High Slot,13,2,0.1538461538461538,0.009703,0.080653,0.00832753
49,Adam Fox,Slot,29,3,0.1034482758620689,0.016995,0.169187,0.5206527
34,Jacob Trouba,High Slot,16,1,0.0625,0.008281,0.489489,0.2627972


What the chart above tells me is that Buchnevich and Lafreniere shoot best from the left circle, and that Buchy and Laf's expected goal value of .27 and .23 respectively are in fact statistically significant.
Chytil, Panarin and Fox all shoot their best shots from the slot, and their expected values from the slot are statistically significant.
Millers shots from the Right Point and Left Points expected values are both statistically significant.
Trouba's shots from the High Slot are his best shots, and his expected value from this area is statistically significant.

In [45]:
df_2[df_2['Team_p'] <= 0.05].sort_values('expected_val', ascending=False)

Unnamed: 0,shooter,cluster,num_shots,num_makes,expected_val,Player_p,Team_p,Location_p
133,Jonny Brodzinski,Slot,1,1,1.0,0.007153,4.3e-05,0.001879
89,Ryan Strome,Left Point,3,1,0.3333333333333333,0.051831,0.024271,0.000133
102,Pavel Buchnevich,Left Circle,15,4,0.2666666666666666,0.012651,0.000432,0.000156
38,Alexis Lafrenière,Left Circle,13,3,0.2307692307692307,0.010269,0.00523,0.002652
13,Filip Chytil,Slot,27,6,0.2222222222222222,0.027443,0.000229,0.025503
120,Colin Blackwell,Slot,27,6,0.2222222222222222,0.073001,0.000229,0.025503
67,Chris Kreider,Slot,77,14,0.1818181818181818,0.082383,5e-06,0.015948
7,Artemi Panarin,Slot,35,6,0.1714285714285714,0.030831,0.003136,0.10614
62,Kevin Rooney,Slot,26,4,0.1538461538461538,0.169301,0.023802,0.216427
88,Ryan Strome,Slot,61,7,0.1147540983606557,0.153266,0.0394,0.416351


What the dataframe above tells me is that Ryan Strome's shots from the left point have an expected value that is statistically significant when compared to the teams overall expected goals from a shot anywhere. In other words Stromer is shooting better from the Left Point than the team shoots on average from anywhere.

The same could then be said about Buchnevich and Lafreniere shooting from the left circle, and Chytil, Blackwell, Kreider, Panarin, Rooney and Strome from the slot

In [46]:
df_2[df_2['Location_p'] <= 0.05].sort_values('expected_val', ascending=False)

Unnamed: 0,shooter,cluster,num_shots,num_makes,expected_val,Player_p,Team_p,Location_p
133,Jonny Brodzinski,Slot,1,1,1.0,0.007153,4.3e-05,0.001878935
89,Ryan Strome,Left Point,3,1,0.3333333333333333,0.051831,0.024271,0.0001329079
102,Pavel Buchnevich,Left Circle,15,4,0.2666666666666666,0.012651,0.000432,0.0001556651
38,Alexis Lafrenière,Left Circle,13,3,0.2307692307692307,0.010269,0.00523,0.002651859
13,Filip Chytil,Slot,27,6,0.2222222222222222,0.027443,0.000229,0.02550275
120,Colin Blackwell,Slot,27,6,0.2222222222222222,0.073001,0.000229,0.02550275
67,Chris Kreider,Slot,77,14,0.1818181818181818,0.082383,5e-06,0.01594762
44,K'Andre Miller,Right Point,6,1,0.1666666666666666,0.039265,0.139432,8.310082e-09
47,K'Andre Miller,High Slot,13,2,0.1538461538461538,0.009703,0.080653,0.00832753
107,Pavel Buchnevich,High Slot,13,2,0.1538461538461538,0.240905,0.080653,0.00832753


The dataframe above tells me that Strome, Fox, Kakko and Kreider are shooting better than the average shot% from the Left Point, Buch, and Laf are shooting better than the team average shot% from the left circle, Chytil, Blackwell and Kreids are shooting better than anyone else on the team from the slow, Buch and K'Andre are shooting better than the team average from the High Slot, and K'Ander is shooting better than the team average from the Right Point.