In [100]:
# Import modules
# Data preprocessing

import statsbomb as sb
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
%matplotlib inline
import seaborn as sns
#import scikitplot as skplt

# Machine learning
from sklearn import preprocessing, model_selection, metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix 


In [9]:
# Get a list of the available competitions
comps = sb.Competitions()

# Get underlying json data
json_data = comps.data

# Convert to a pandas dataframe
df = comps.get_dataframe()
df

Unnamed: 0,competition_id,competition_name,country_name,match_available,match_updated,season_id,season_name
0,37,FA Women's Super League,England,2019-04-20T13:46:55.933080,2019-04-20T13:46:55.933080,4,2018/2019
1,43,FIFA World Cup,International,2019-02-25T16:37:57.785859,2019-02-25T16:37:57.785859,3,2018
2,49,NWSL,United States of America,2019-02-25T16:37:57.785859,2019-04-23T15:34:53.624264,3,2018


In [15]:
fawsl = sb.Matches('37').get_dataframe()
nswl = sb.Matches('49').get_dataframe()

data = pd.concat([fawsl,nswl])
data.tail()

Unnamed: 0,away_score,away_team,competition,data_version,home_score,home_team,kick_off,last_updated,match_date,match_id,match_status,referee_name,season,stadium_name
29,3,761,49,1.0.2,1,763,01:00:00.000,2019-02-25T16:37:57.785859,2018-07-08,7476,available,J. Peralta,3,Yurcak Field
30,0,759,49,1.0.2,2,767,03:00:00.000,2019-02-25T16:37:57.785859,2018-05-06,7445,available,F. Benchabane,3,Groupama Stadium
31,4,766,49,1.0.2,3,764,01:30:00.000,2019-02-25T16:37:57.785859,2018-05-24,7457,available,,3,
32,2,767,49,1.0.2,1,763,01:00:00.000,2019-02-25T16:37:57.785859,2018-06-03,7521,available,R. Touchan,3,Stade Sébastien-Charléty
33,2,761,49,1.0.2,0,759,03:00:00.000,2019-02-25T16:37:57.785859,2018-06-03,7520,available,N. Kinseley,3,Stade Gaston Petit


In [17]:
len(data)

127

In [22]:
%time
shots_df = pd.DataFrame()

for i in data['match_id']:
    events = sb.Events(event_id=i)
    df = events.get_dataframe(event_type='shot')
    shots_df = shots_df.append(df)

shots_df.head()
    


CPU times: user 8 µs, sys: 6 µs, total: 14 µs
Wall time: 30 µs


Unnamed: 0,event_type,id,index,period,timestamp,minute,second,possession,possession_team,play_pattern,...,follows_dribble,redirect,one_on_one,open_goal,deflected,start_location_x,start_location_y,end_location_x,end_location_y,end_location_z
0,shot,76e721e0-de7e-47f3-afe4-22a560ad1112,174,1,00:04:26.290,4,26,7,Manchester City WFC,Regular Play,...,True,,,,,101.0,29.0,117.0,39.3,0.4
1,shot,70c14db1-b244-4677-a56c-bc3011cb01eb,269,1,00:08:06.827,8,6,16,Everton LFC,From Corner,...,,,,,,97.0,51.0,99.0,50.0,
2,shot,0f7f5a17-2085-4278-9cd6-9dd5a848ec17,334,1,00:10:46.495,10,46,22,Everton LFC,Regular Play,...,,,,,,94.0,38.0,119.0,44.9,0.1
3,shot,8dc2ffd4-cc09-4507-af60-04dc1310da44,346,1,00:11:17.596,11,17,24,Everton LFC,Regular Play,...,,,,True,,115.0,42.0,120.0,41.6,0.6
4,shot,da4ade9f-63ad-42dd-a407-a68d0b1319a3,375,1,00:12:47.733,12,47,27,Manchester City WFC,From Free Kick,...,,,,,,112.0,40.0,118.0,40.5,2.0


In [23]:
len(shots_df)

3442

In [25]:
print(shots_df.columns)

Index(['event_type', 'id', 'index', 'period', 'timestamp', 'minute', 'second',
       'possession', 'possession_team', 'play_pattern', 'off_camera', 'team',
       'player', 'position', 'duration', 'under_pressure', 'statsbomb_xg',
       'key_pass_id', 'body_part', 'type', 'outcome', 'technique',
       'first_time', 'follows_dribble', 'redirect', 'one_on_one', 'open_goal',
       'deflected', 'start_location_x', 'start_location_y', 'end_location_x',
       'end_location_y', 'end_location_z'],
      dtype='object')


In [27]:
print(shots_df['event_type'].unique())

['shot']


In [29]:
print(shots_df['play_pattern'].unique())

['Regular Play' 'From Corner' 'From Free Kick' 'From Throw In'
 'From Goal Kick' 'From Counter' 'Other' 'From Kick Off' 'From Keeper']


In [31]:
print(shots_df['type'].unique())

['Open Play' 'Free Kick' 'Penalty']


In [35]:
#Removing penalties

np_shots = shots_df[shots_df['type']!='Penalty']
print(np_shots['type'].unique())

['Open Play' 'Free Kick']


In [38]:
np_shots['outcome'].unique()

array(['Saved', 'Blocked', 'Off T', 'Goal', 'Wayward', 'Post', None],
      dtype=object)

In [41]:
#creating a column with goals
np_shots['Goal'] = [ 1 if x =='Goal' else 0 for x in np_shots['outcome']]
np_shots['Goal']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0     0
1     0
2     0
3     1
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    1
14    0
15    1
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    1
28    0
0     0
     ..
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     1
8     0
9     0
10    0
11    0
12    1
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
Name: Goal, Length: 3413, dtype: int64

In [50]:
#feature engeeing for conversion
total_attempts = len(np_shots)
convert = len(np_shots[np_shots['Goal']==1])
conversion = convert/total_attempts*100
conversion

9.932610606504541

In [65]:
#Feature engineering 
np_shots['key_pass_id'].head()

0    a3e2394d-2ba5-4786-9411-3dcb5c5eca27
1                                    None
2    0062b0b7-d0b3-499c-9d40-816edd5e855f
3                                    None
4    18b14e45-3b84-4d1e-a3be-97953fcc5fee
Name: key_pass_id, dtype: object

In [74]:
np_shots['assisted'] = [1 if x !=None else 0 for x in np_shots['key_pass_id']]
np_shots['assisted'].head()

0    1
1    0
2    1
3    0
4    1
Name: assisted, dtype: int64

In [75]:
np_shots.columns

Index(['event_type', 'id', 'index', 'period', 'timestamp', 'minute', 'second',
       'possession', 'possession_team', 'play_pattern', 'off_camera', 'team',
       'player', 'position', 'duration', 'under_pressure', 'statsbomb_xg',
       'key_pass_id', 'body_part', 'type', 'outcome', 'technique',
       'first_time', 'follows_dribble', 'redirect', 'one_on_one', 'open_goal',
       'deflected', 'start_location_x', 'start_location_y', 'end_location_x',
       'end_location_y', 'end_location_z', 'Goal', 'assisted'],
      dtype='object')

In [73]:
np_shots = np_shots.reset_index().drop('level_0',axis=1)

In [81]:
#Creating new variables
np_shots['X_distance'] = 120-(np_shots['start_location_x'])
np_shots['y_distance'] = abs(40 - (np_shots['start_location_y']))
np_shots['distance'] = np.sqrt((np_shots['X_distance'])**2+(np_shots['y_distance'])**2)
np_shots.head()

Unnamed: 0,event_type,id,index,period,timestamp,minute,second,possession,possession_team,play_pattern,...,start_location_x,start_location_y,end_location_x,end_location_y,end_location_z,Goal,assisted,X_distance,y_distance,distance
0,shot,76e721e0-de7e-47f3-afe4-22a560ad1112,174,1,00:04:26.290,4,26,7,Manchester City WFC,Regular Play,...,101.0,29.0,117.0,39.3,0.4,0,1,19.0,11.0,21.954498
1,shot,70c14db1-b244-4677-a56c-bc3011cb01eb,269,1,00:08:06.827,8,6,16,Everton LFC,From Corner,...,97.0,51.0,99.0,50.0,,0,0,23.0,11.0,25.495098
2,shot,0f7f5a17-2085-4278-9cd6-9dd5a848ec17,334,1,00:10:46.495,10,46,22,Everton LFC,Regular Play,...,94.0,38.0,119.0,44.9,0.1,0,1,26.0,2.0,26.07681
3,shot,8dc2ffd4-cc09-4507-af60-04dc1310da44,346,1,00:11:17.596,11,17,24,Everton LFC,Regular Play,...,115.0,42.0,120.0,41.6,0.6,1,0,5.0,2.0,5.385165
4,shot,da4ade9f-63ad-42dd-a407-a68d0b1319a3,375,1,00:12:47.733,12,47,27,Manchester City WFC,From Free Kick,...,112.0,40.0,118.0,40.5,2.0,0,1,8.0,0.0,8.0


In [84]:
np_shots['angle']=np.degrees(np.arctan((np_shots['y_distance'] / np_shots['X_distance'])))
np_shots.head()

Unnamed: 0,event_type,id,index,period,timestamp,minute,second,possession,possession_team,play_pattern,...,start_location_y,end_location_x,end_location_y,end_location_z,Goal,assisted,X_distance,y_distance,distance,angle
0,shot,76e721e0-de7e-47f3-afe4-22a560ad1112,174,1,00:04:26.290,4,26,7,Manchester City WFC,Regular Play,...,29.0,117.0,39.3,0.4,0,1,19.0,11.0,21.954498,30.068583
1,shot,70c14db1-b244-4677-a56c-bc3011cb01eb,269,1,00:08:06.827,8,6,16,Everton LFC,From Corner,...,51.0,99.0,50.0,,0,0,23.0,11.0,25.495098,25.559965
2,shot,0f7f5a17-2085-4278-9cd6-9dd5a848ec17,334,1,00:10:46.495,10,46,22,Everton LFC,Regular Play,...,38.0,119.0,44.9,0.1,0,1,26.0,2.0,26.07681,4.398705
3,shot,8dc2ffd4-cc09-4507-af60-04dc1310da44,346,1,00:11:17.596,11,17,24,Everton LFC,Regular Play,...,42.0,120.0,41.6,0.6,1,0,5.0,2.0,5.385165,21.801409
4,shot,da4ade9f-63ad-42dd-a407-a68d0b1319a3,375,1,00:12:47.733,12,47,27,Manchester City WFC,From Free Kick,...,40.0,118.0,40.5,2.0,0,1,8.0,0.0,8.0,0.0


In [87]:
# Select model features and labels
feature_cols = ['play_pattern', 'under_pressure','technique', 'first_time',
                'follows_dribble', 'redirect', 'one_on_one', 'open_goal', 'deflected',
                'assisted', 'distance', 'angle']

features = np_shots[feature_cols]
labels = np_shots['Goal']

features = features.fillna(0)
labels = labels.fillna(0)

In [90]:
features.head()

Unnamed: 0,play_pattern,under_pressure,technique,first_time,follows_dribble,redirect,one_on_one,open_goal,deflected,assisted,distance,angle
0,Regular Play,0,Normal,0,True,0,0,0,0,1,21.954498,30.068583
1,From Corner,0,Normal,0,0,0,0,0,0,0,25.495098,25.559965
2,Regular Play,0,Normal,0,0,0,0,0,0,1,26.07681,4.398705
3,Regular Play,0,Half Volley,True,0,0,0,True,0,0,5.385165,21.801409
4,From Free Kick,0,Normal,0,0,0,0,0,0,1,8.0,0.0


In [92]:
#for categorical feature
cat_cols = ['play_pattern', 'under_pressure', 'technique', 'first_time',
                'follows_dribble', 'redirect', 'one_on_one', 'open_goal', 'deflected']

cat_features = features[cat_cols]
features = features.drop(cat_cols, axis=1)

# Use label encoding to convert categorical features to numeric
le = preprocessing.LabelEncoder()
cat_features = cat_features.apply(le.fit_transform)

# Merge with numeric features
features = features.merge(cat_features, left_index=True, right_index=True)

In [94]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.20, shuffle=True, random_state=42)

# Scale X so as to minimize error
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)



  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [108]:
dt = DecisionTreeClassifier()
lr = LogisticRegression()
knn = KNeighborsClassifier()

classifier = [('DecisionTree',dt), ('Logistic Reg',lr), ('K nearest',knn)]

for cl_name,clf in classifier:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred_prob = clf.predict_proba(X_test)[:,1]
    print('for :' + cl_name)
    print('Confusion_matrix \n')
    print(confusion_matrix(y_test,y_pred))
    print('Classification report\n')
    print(classification_report(y_test,y_pred))
    print('accuracy score:', accuracy_score(y_test,y_pred)*100)
    print("Predicted goals (test set):", sum(y_pred))
    print("Sum of predicted goal probabilities (aka xG):", "{0:.2f}".format(sum(y_pred_prob)))
    print("Actual goals (test set):", sum(y_test))
    print('')


    

for :DecisionTree
Confusion_matrix 

[[561  55]
 [ 57  10]]
Classification report

              precision    recall  f1-score   support

           0       0.91      0.91      0.91       616
           1       0.15      0.15      0.15        67

   micro avg       0.84      0.84      0.84       683
   macro avg       0.53      0.53      0.53       683
weighted avg       0.83      0.84      0.83       683

accuracy score: 83.601756954612
Predicted goals (test set): 65
Sum of predicted goal probabilities (aka xG): 75.97
Actual goals (test set): 67

for :Logistic Reg
Confusion_matrix 

[[615   1]
 [ 58   9]]
Classification report

              precision    recall  f1-score   support

           0       0.91      1.00      0.95       616
           1       0.90      0.13      0.23        67

   micro avg       0.91      0.91      0.91       683
   macro avg       0.91      0.57      0.59       683
weighted avg       0.91      0.91      0.88       683

accuracy score: 91.36163982430455
Pr

