In [None]:
import json
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import swifter
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn import tree, svm
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, StackingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,roc_auc_score
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from statsbombpy import sb
from xgboost import XGBClassifier
#from sklearn.datasets import load_iris
#from sympy import symbols, Eq, solve
#import sympy as sp
#from scipy.integrate import quad

# Prepare and download data

## Download the data

In [None]:
# Every entry of 'three-sixty' is expected to be a "<match_id>.json" 360 file.
json_list = os.listdir('three-sixty')

# Collect the per-match frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies the accumulated frame on every iteration.
tracking_frames = []
for i in json_list:
    # skip the json files which do not have the right format
    try:
        tracking_frames.append(pd.read_json(f'three-sixty/{i}'))
    except Exception as err:
        # Best effort: report the unreadable file and carry on. Unlike a bare
        # `except:`, this does not swallow KeyboardInterrupt / SystemExit.
        print(i, err)
tracking_df = pd.concat(tracking_frames) if tracking_frames else pd.DataFrame()

# The match id is the file name without its extension.
event_frames = [sb.events(match_id=os.path.splitext(i)[0]) for i in json_list]
event_df = pd.concat(event_frames) if event_frames else pd.DataFrame()

# One row per visible player: explode the freeze-frame list, then flatten each
# player dict into "freezeFrame_*" columns next to the event columns.
tracking_df = tracking_df.explode("freeze_frame", ignore_index=True)
freeze_frame = pd.json_normalize(tracking_df["freeze_frame"]).add_prefix("freezeFrame_")
tracking_df = pd.concat([tracking_df, freeze_frame], axis=1)

## Data preparation

In [None]:
# Split the [x, y] location lists into scalar columns.
tracking_df['freezeFrame_x'] = tracking_df['freezeFrame_location'].swifter.apply(lambda x: x[0])
tracking_df['freezeFrame_y'] = tracking_df['freezeFrame_location'].swifter.apply(lambda x: x[1])
event_df['location_x'] = event_df['location'].swifter.apply(lambda x: x[0] if isinstance(x, list) else np.nan)
event_df['location_y'] = event_df['location'].swifter.apply(lambda x: x[1] if isinstance(x, list) else np.nan)

# Single end location per event: the first non-missing value among shot, pass,
# carry and goalkeeper end locations. The previous nested np.where tested the
# same mask at every level, so the carry / goalkeeper columns were never used.
end_location = event_df['shot_end_location']
for fallback_col in ('pass_end_location', 'carry_end_location', 'goalkeeper_end_location'):
    if fallback_col in event_df.columns:
        end_location = end_location.where(end_location.notna(), event_df[fallback_col])
event_df['end_location'] = end_location

event_df['end_location_x'] = event_df['end_location'].swifter.apply(lambda x: x[0] if isinstance(x, list) else np.nan)
event_df['end_location_y'] = event_df['end_location'].swifter.apply(lambda x: x[1] if isinstance(x, list) else np.nan)
# Only three-element end locations carry a height component.
event_df['end_location_z'] = event_df['end_location'].swifter.apply(lambda x: x[2] if isinstance(x, list) and len(x)==3 else np.nan)

In [None]:
# Chronological order inside each match and period; the goal_success cell
# below looks ahead with shift(-i) and depends on this ordering.
event_df = event_df.sort_values(by=['match_id', 'period', 'timestamp'])

In [None]:
# goal_success = 1 when one of the next events (the event itself included) is a
# goal in the same match, period and play pattern, at most 20 seconds later.
# The previous version subtracted the *later* timestamp from the current one,
# so the difference was never positive and the 20 s limit never applied; it
# also let the look-ahead cross match and period boundaries.
event_time = pd.to_datetime(event_df['timestamp'])  # parsed once, not per iteration
is_goal = event_df['shot_outcome'] == 'Goal'
goal_follows = np.zeros(len(event_df), dtype=bool)
for i in range(20):
    seconds_until = (event_time.shift(-i) - event_time).dt.total_seconds()
    same_sequence = (
        (event_df['match_id'].shift(-i) == event_df['match_id'])
        & (event_df['period'].shift(-i) == event_df['period'])
        & (event_df['play_pattern'].shift(-i) == event_df['play_pattern'])
    )
    # between() is False for the NaN gaps that shift() leaves at the end.
    in_window = seconds_until.between(0, 20)
    goal_follows |= (is_goal.shift(-i, fill_value=False) & same_sequence & in_window).to_numpy()
event_df['goal_success'] = goal_follows.astype('int64')

In [None]:
# Numeric stand-in for the categorical pass height; any other value
# (including a missing one) maps to NaN.
event_df['pass_height_number'] = event_df['pass_height'].map(
    {'High Pass': 2.2, 'Low Pass': 1.2, 'Ground Pass': 0.1}
)

In [None]:
# Where no pass height is known, fall back to the z component of the end location.
known_height = event_df['pass_height_number']
event_df['pass_height_number'] = known_height.where(known_height.notna(), event_df['end_location_z'])

In [None]:
# Fill the remaining gaps with the mean height of rows whose shot_type is
# 'Free Kick', then with the mean of rows whose shot_type is 'Throw-in'.
# NOTE(review): after the first fill the second one only matters when the
# free-kick mean is NaN — confirm this is the intended fallback order.
for set_piece in ('Free Kick', 'Throw-in'):
    fallback_height = event_df.loc[event_df['shot_type'] == set_piece, 'pass_height_number'].mean()
    event_df['pass_height_number'] = event_df['pass_height_number'].fillna(fallback_height)

In [None]:
# Events without a recorded pass_length get the straight-line distance
# between their start and end locations.
delta_x = event_df['end_location_x'] - event_df['location_x']
delta_y = event_df['end_location_y'] - event_df['location_y']
straight_line = np.sqrt((delta_x**2) + (delta_y**2))
event_df['pass_length'] = event_df['pass_length'].mask(event_df['pass_length'].isna(), straight_line)

In [None]:
# Pitch zones: zone id -> [x_min, x_max, y_min, y_max].
# The lookup below treats the lower bounds as exclusive and the upper bounds as
# inclusive. Negative ids describe the zones at y > 49; the notebook later
# takes abs() of the ids, which merges them with the matching positive ids.
zone_dict={15:[0,16.5,0,20],12:[16.5,45.5,0,20],9:[45.5,74.5,0,20],6:[74.5,103.5,0,20],3:[103.5,121,0,20],
           14:[0,16.5,20,31],11:[16.5,45.5,20,31],8:[45.5,74.5,20,31],5:[74.5,103.5,20,31],2:[103.5,121,20,31],
           13:[0,16.5,31,49],10:[16.5,45.5,31,49],7:[45.5,74.5,31,49],4:[74.5,103.5,31,49],1:[103.5,121,31,49],
           -14:[0,16.5,49,60],-11:[16.5,45.5,49,60],-8:[45.5,74.5,49,60],-5:[74.5,103.5,49,60],-2:[103.5,121,49,60],
           -15:[0,16.5,60,81],-12:[16.5,45.5,60,81],-9:[45.5,74.5,60,81],-6:[74.5,103.5,60,81],-3:[103.5,121,60,81]}

In [None]:
# Tag the start and end location of every event with its zone id;
# locations that fall in no rectangle (or are missing) keep zone 0.
event_df['zone'] = 0
event_df['end_zone'] = 0
for zone_id, (x_min, x_max, y_min, y_max) in zone_dict.items():
    for prefix, target in (('location', 'zone'), ('end_location', 'end_zone')):
        xs, ys = event_df[f'{prefix}_x'], event_df[f'{prefix}_y']
        inside = (xs > x_min) & (xs <= x_max) & (ys > y_min) & (ys <= y_max)
        event_df[target] = np.where(inside, zone_id, event_df[target])

In [None]:
# Drop the sign of the zone ids so that a negative id and its positive
# counterpart end up in the same zone.
for zone_col in ('zone', 'end_zone'):
    event_df[zone_col] = event_df[zone_col].abs()

In [None]:
# Constant 1 on every freeze-frame row; density() sums this column to count players.
tracking_df['tmp_cnt']=1

In [None]:
# Keep set pieces only: every corner, plus free kicks and throw-ins that
# start at x >= 80.
set_piece_types = ['Free Kick', 'Throw-in']
is_corner = event_df['pass_type'].eq('Corner') | event_df['shot_type'].eq('Corner')
is_kick_or_throw = event_df['pass_type'].isin(set_piece_types) | event_df['shot_type'].isin(set_piece_types)
starts_high = event_df['location_x'] >= 80
event_df = event_df[is_corner | (is_kick_or_throw & starts_high)]

### Density

In [None]:
event_id_list = event_df['id'].unique().tolist()
# Only the freeze frames of the remaining events are needed; the smaller
# frame uses less memory and makes the density lookup faster.
tracking_df = tracking_df.loc[tracking_df['event_uuid'].isin(event_id_list)]

In [None]:
def density(row, radius=5, frames=None):
    """Count the freeze-frame players close to where the event ends.

    Parameters
    ----------
    row : mapping / pandas.Series
        Event row providing 'id', 'end_location_x' and 'end_location_y'.
    radius : float, default 5
        Maximum Euclidean distance (in pitch units) for a player to be counted.
    frames : pandas.DataFrame, optional
        Freeze-frame table with 'event_uuid', 'freezeFrame_x' and
        'freezeFrame_y' columns. Defaults to the notebook-level `tracking_df`.

    Returns
    -------
    int
        Number of freeze-frame rows of this event within `radius` of the end
        location; 0 when the event has no rows or no end location.
    """
    if frames is None:
        frames = tracking_df

    event_frames = frames[frames['event_uuid'] == row['id']]

    # Work on a standalone Series instead of writing a 'tmp' column onto the
    # filtered slice (that raised SettingWithCopyWarning on every call).
    distance = ((event_frames['freezeFrame_x'] - row['end_location_x'])**2
                + (event_frames['freezeFrame_y'] - row['end_location_y'])**2)**0.5

    # NaN distances compare False, so they are not counted.
    return int((distance <= radius).sum())
    

In [None]:
# Events without any freeze frame cannot get a density, so drop them first.
tracking_id_list = tracking_df['event_uuid'].unique().tolist()
event_df = event_df.loc[event_df['id'].isin(tracking_id_list)]
event_df['density'] = event_df.swifter.apply(density, axis=1)

### Compute the parabolic arc length

In [None]:
# Length of a parabolic arc with base b (pass_length) and peak height h
# (pass_height_number):
#     s = 0.5*sqrt(b^2 + 16 h^2) + b^2/(8 h) * ln((4 h + sqrt(b^2 + 16 h^2)) / b)
# The previous version used log10 instead of the natural logarithm and divided
# by 8*h^2 instead of 8*h, so the second term was not a length at all.
base = event_df['pass_length']
height = event_df['pass_height_number']
diagonal = np.sqrt(base**2 + 16*height**2)
with np.errstate(divide='ignore', invalid='ignore'):
    arc = 0.5*diagonal + (base**2/(8*height))*np.log((4*height + diagonal)/base)
# Degenerate cases where the formula divides by zero: a flat path has length b,
# a zero-length base is the path straight up and back down (2h).
arc = arc.where(height != 0, base)
arc = arc.where(base != 0, 2*height)
event_df['arc'] = arc
event_df['arc']

# Visualizations

In [None]:
# Plot colour per event: red when the event itself is a goal, orange when
# goal_success is 1, black otherwise (the first matching rule wins).
event_df['color'] = np.select(
    [event_df['shot_outcome'] == 'Goal', event_df['goal_success'] == 1],
    ['red', 'orange'],
    default='black',
)

In [None]:
# Free kicks, whether recorded as a shot or as a pass.
free_kicks_df = event_df[event_df[['shot_type', 'pass_type']].eq('Free Kick').any(axis=1)]

In [None]:
# Colours that actually occur, in order of first appearance.
all_colors = event_df['color'].unique().tolist()

In [None]:
# Free-kick start locations on the attacking half, one scatter call per colour.
# Removed: three never-used lists and a leftover debug print of the colour.
fig, ax = plt.subplots()
for current_color in all_colors:
    current_df = free_kicks_df[free_kicks_df['color']==current_color]
    # Marker area scales with the StatsBomb xG of the event; passed through the
    # documented `s` argument rather than the collection-level `sizes` kwarg.
    ax.scatter(current_df['location_x'], current_df['location_y'], c=current_color,
               marker="o",
               s=current_df['shot_statsbomb_xg']*1000,
               )

img = plt.imread("half_pitch.png")
ax.imshow(img, extent=[60, 120, 0, 80])
ax.set_xlim(60, 120)
ax.set_ylim(0, 80)

plt.show()

In [None]:
# Corners, whether recorded as a pass or as a shot.
corners_df = event_df[event_df[['pass_type', 'shot_type']].eq('Corner').any(axis=1)]

In [None]:
# One arrow per corner, from where it is taken to where it ends,
# drawn in the colour of the event.
fig, ax = plt.subplots()

corner_arrows = zip(corners_df['location_x'], corners_df['location_y'],
                    corners_df['end_location_x'], corners_df['end_location_y'],
                    corners_df['color'])
for start_x, start_y, end_x, end_y, arrow_color in corner_arrows:
    ax.annotate("", xy=(end_x, end_y), xytext=(start_x, start_y),
                arrowprops=dict(arrowstyle="->", color=arrow_color))

img = plt.imread("half_pitch.png")
ax.imshow(img, extent=[60, 120, 0, 80])

ax.set_xlim(60, 120)
ax.set_ylim(0, 80)

plt.show()

In [None]:
# Split the corners by the side they are taken from (y below 40 vs. y of 40 and above).
corner_y = corners_df['location_y']
corners_right, corners_left = corners_df[corner_y.lt(40)], corners_df[corner_y.ge(40)]

In [None]:
# End locations of the corners in corners_left, coloured by outcome.
fig, ax = plt.subplots()
for current_color in all_colors:
    same_color = corners_left.loc[corners_left['color'].eq(current_color)]
    ax.scatter(same_color['end_location_x'], same_color['end_location_y'],
               c=current_color, linewidths=2, marker="o")
img = plt.imread("half_pitch.png")
ax.imshow(img, extent=[60, 120, 0, 80])
ax.set_xlim(60, 120)
ax.set_ylim(0, 80)

plt.show()

In [None]:
# End locations of the corners in corners_right, coloured by outcome.
fig, ax = plt.subplots()
for current_color in all_colors:
    same_color = corners_right.loc[corners_right['color'].eq(current_color)]
    ax.scatter(same_color['end_location_x'], same_color['end_location_y'],
               c=current_color, linewidths=2, marker="o")
img = plt.imread("half_pitch.png")
ax.imshow(img, extent=[60, 120, 0, 80])
ax.set_xlim(60, 120)
ax.set_ylim(0, 80)

plt.show()

In [None]:
# Throw-ins, whether recorded as a pass or as a shot.
throw_in_df = event_df[event_df[['pass_type', 'shot_type']].eq('Throw-in').any(axis=1)]

In [None]:
# One arrow per throw-in from start to end location, drawn colour by colour.
fig, ax = plt.subplots()

for current_color in all_colors:
    same_color = throw_in_df.loc[throw_in_df['color'].eq(current_color)]
    throw_arrows = zip(same_color['location_x'], same_color['location_y'],
                       same_color['end_location_x'], same_color['end_location_y'])
    for start_x, start_y, end_x, end_y in throw_arrows:
        ax.annotate("", xy=(end_x, end_y), xytext=(start_x, start_y),
                    arrowprops=dict(arrowstyle="->", color=current_color))

img = plt.imread("full_pitch.png")
ax.imshow(img, extent=[0, 120, 0, 80])

ax.set_xlim(0, 120)
ax.set_ylim(0, 80)

plt.show()

In [None]:
# Split the throw-ins by touchline (y below 40 vs. y of 40 and above).
throw_in_y = throw_in_df['location_y']
throw_in_right, throw_in_left = throw_in_df[throw_in_y.lt(40)], throw_in_df[throw_in_y.ge(40)]

In [None]:
# End locations of the throw-ins in throw_in_left, coloured by outcome.
fig, ax = plt.subplots()
for current_color in all_colors:
    same_color = throw_in_left.loc[throw_in_left['color'].eq(current_color)]
    ax.scatter(same_color['end_location_x'], same_color['end_location_y'],
               c=current_color, linewidths=2, marker="o")
img = plt.imread("full_pitch.png")
ax.imshow(img, extent=[0, 120, 0, 80])
ax.set_xlim(0, 120)
ax.set_ylim(0, 80)

plt.show()

In [None]:
# End locations of the throw-ins in throw_in_right, coloured by outcome.
# Each scatter now carries a label: the legend was previously built from
# artists without labels and therefore came out empty.
color_labels = {'red': 'Goal', 'orange': 'Goal later in the sequence', 'black': 'No goal'}

fig, ax = plt.subplots()
for current_color in all_colors:
    current_df = throw_in_right[throw_in_right['color']==current_color]
    ax.scatter(current_df['end_location_x'], current_df['end_location_y'], c=current_color,
               linewidths=2,
               marker="o",
               label=color_labels.get(current_color, current_color),
               )
img = plt.imread("full_pitch.png")
ax.imshow(img, extent=[0, 120, 0, 80])
ax.set_xlim(0, 120)
ax.set_ylim(0, 80)
ax.legend()

plt.show()

# Modelling

## Throw ins

In [None]:
# Modelling table for throw-ins: target plus engineered features, complete rows only.
df_nan = throw_in_df.loc[:, ['goal_success', 'pass_height_number', 'pass_length',
                             'zone', 'end_zone', 'density', 'arc']].dropna()
# The height is cast to an integer, which truncates its fractional part.
df_nan = df_nan.assign(pass_height_number=df_nan['pass_height_number'].astype('int64'))
Train = df_nan
yr_p = Train['goal_success']
Xr_p = Train.drop('goal_success', axis=1)

In [None]:
# Hold out the test set BEFORE oversampling. SMOTE builds synthetic rows by
# interpolating between neighbours, so resampling first lets information
# from test rows leak into the training data and inflates every test metric.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    Xr_p, yr_p, test_size=0.33, random_state=42, stratify=yr_p)
sm = SMOTE(random_state=42, k_neighbors=3)
X_train, y_train = sm.fit_resample(X_train_raw, y_train_raw)
# X_res / y_res feed the cross-validation cells below; they now hold the
# balanced training data only, so cross-validation never touches the test set.
# NOTE(review): the folds still contain synthetic rows, so CV scores stay optimistic.
X_res, y_res = X_train, y_train

In [None]:
# Standardise with statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

In [None]:
# Logistic regression on the standardised features.
clf = LogisticRegression(random_state=0).fit(X_train_scaled, y_train)
y_pred_p=clf.predict(X_test_scaled)
# Probability of the positive class: log-loss and ROC AUC are defined on
# scores, not on hard 0/1 predictions (which is what they were given before).
y_proba_p = clf.predict_proba(X_test_scaled)[:, 1]

score=clf.score(X_train_scaled, y_train)  # accuracy on the training split
rmse=np.sqrt(metrics.mean_squared_error(y_test,y_pred_p))
# Local names chosen so the imported r2_score function is not overwritten.
r2=metrics.r2_score(y_test,y_pred_p)
test_log_loss=metrics.log_loss(y_test,y_proba_p)
f1 = f1_score(y_test, y_pred_p)
roc_a = roc_auc_score(y_test, y_proba_p)
print('score: ',score)
print('rmse: ',rmse)
print('r2_score: ',r2)
print('log_loss ',test_log_loss)
print('f1 score: ', f1)
print('roc auc score', roc_a)

# One coefficient per (standardised) feature, printed as (index, value).
importance=clf.coef_[0]

for val in enumerate(importance):
    print("importance score  : {} ".format(val))

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred_p)

In [None]:
# Cross-validate the same kind of model that was trained above: the scaler is
# fitted inside every fold instead of feeding unscaled features to a logistic
# regression that was trained on standardised ones.
cv_clf = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))
crossV_accuracy = cross_val_score(cv_clf, X_res, y_res, cv=5, scoring='accuracy')
crossV_f1 = cross_val_score(cv_clf, X_res, y_res, cv=5, scoring='f1')
crossV_predict = cross_val_predict(cv_clf, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Random forest on the raw features; the seed is fixed so that a re-run of the
# notebook reproduces the same forest and the same scores.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

predictions = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
print('accuracy: ',accuracy)
print('f1 score: ',f1)

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, predictions)

In [None]:
# 5-fold cross-validation of the random forest: accuracy, F1 and out-of-fold predictions.
crossV_accuracy, crossV_f1 = [
    cross_val_score(rf_model, X_res, y_res, cv=5, scoring=metric)
    for metric in ('accuracy', 'f1')
]
crossV_predict = cross_val_predict(rf_model, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Gradient-boosted trees with default settings, fitted on the raw features.
xgbc_model = XGBClassifier().fit(X_train, y_train)
xgbc_model

In [None]:
# Test-split accuracy and F1 of the XGBoost model.
y_pred = xgbc_model.predict(X_test)
accuracy, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
print('accuracy: ',accuracy)
print('f1 score: ',f1)

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred)

In [None]:
# 5-fold cross-validation of the XGBoost model: accuracy, F1 and out-of-fold predictions.
crossV_accuracy, crossV_f1 = [
    cross_val_score(xgbc_model, X_res, y_res, cv=5, scoring=metric)
    for metric in ('accuracy', 'f1')
]
crossV_predict = cross_val_predict(xgbc_model, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Single decision tree on the raw features; seeded so that tie-breaking
# between equally good splits is reproducible across runs.
dtc = tree.DecisionTreeClassifier(random_state=42)
dtc = dtc.fit(X_train, y_train)

predictions = dtc.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
print('accuracy: ',accuracy)
print('f1 score: ',f1)

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, predictions)

In [None]:
# 5-fold cross-validation of the decision tree: accuracy, F1 and out-of-fold predictions.
crossV_accuracy, crossV_f1 = [
    cross_val_score(dtc, X_res, y_res, cv=5, scoring=metric)
    for metric in ('accuracy', 'f1')
]
crossV_predict = cross_val_predict(dtc, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Linear-kernel SVM on the standardised features.
svm_model = svm.SVC(kernel='linear').fit(X_train_scaled, y_train)
predictions = svm_model.predict(X_test_scaled)
accuracy, f1 = accuracy_score(y_test, predictions), f1_score(y_test, predictions)
print('accuracy: ',accuracy)
print('f1 score: ',f1)

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, predictions)

In [None]:
# Cross-validate the SVM the way it was trained: on standardised features.
# The scaler is fitted inside every fold; the unscaled features used before
# did not match the trained model.
cv_svm = make_pipeline(StandardScaler(), svm.SVC(kernel='linear'))
crossV_accuracy = cross_val_score(cv_svm, X_res, y_res, cv=5, scoring='accuracy')
crossV_f1 = cross_val_score(cv_svm, X_res, y_res, cv=5, scoring='f1')
crossV_predict = cross_val_predict(cv_svm, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Stack the five models. The stack is fitted on the raw features, so the two
# scale-sensitive members (logistic regression, linear SVM) get their own
# scaler; before, they received unscaled inputs inside the stack.
# StackingClassifier fits clones, so the standalone models stay untouched.
stck_model = StackingClassifier(
    estimators=[('rf', rf_model), ('xgb', xgbc_model),
                ('log reg', make_pipeline(StandardScaler(), clf)),
                ('decision tree', dtc),
                ('svm', make_pipeline(StandardScaler(), svm_model))],
    final_estimator=xgbc_model
)
stck_model.fit(X_train, y_train)
stck_predictions = stck_model.predict(X_test)
# Scored against the true labels. The removed `score(X_test, stck_predictions)`
# compared the model with its own predictions (always 1.0) and overwrote the
# imported r2_score function.
accuracy = accuracy_score(y_test, stck_predictions)
f1 = f1_score(y_test, stck_predictions)
print('accuracy: ',accuracy)
print('f1 score: ',f1)

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, stck_predictions)

In [None]:
# 5-fold cross-validation of the stacked model: accuracy, F1 and out-of-fold predictions.
crossV_accuracy, crossV_f1 = [
    cross_val_score(stck_model, X_res, y_res, cv=5, scoring=metric)
    for metric in ('accuracy', 'f1')
]
crossV_predict = cross_val_predict(stck_model, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Per-feature importance of every model, side by side. The linear models
# expose signed coefficients, the tree models expose importances.
log_reg_importance, svm_importance = clf.coef_[0], svm_model.coef_[0]
rf_importance, xgbc_importance, dtc_importance = (
    fitted.feature_importances_ for fitted in (rf_model, xgbc_model, dtc)
)
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'log_reg': log_reg_importance,
    'random_forest': rf_importance,
    'XGBClassifier': xgbc_importance,
    'decision_tree': dtc_importance,
    'svm': svm_importance,
})
# Turn the coefficients into shares of the total absolute weight.
for linear_col in ('log_reg', 'svm'):
    magnitude = feature_importance_df[linear_col].abs()
    feature_importance_df[linear_col] = magnitude/(magnitude.sum())
# Unweighted mean of the five importance columns.
feature_importance_df['feature_avg'] = np.array([
    feature_importance_df['log_reg'].values, rf_importance, xgbc_importance,
    dtc_importance, feature_importance_df['svm'].values]).mean(axis=0)
feature_importance_df

## Free kicks

In [None]:
# Modelling table for free kicks: target plus engineered features and the
# StatsBomb xG, complete rows only.
# NOTE(review): dropna() removes every row without shot_statsbomb_xg —
# presumably that leaves only free kicks taken as shots; confirm.
df_nan = free_kicks_df.loc[:, ['goal_success', 'pass_height_number', 'pass_length',
                               'zone', 'end_zone', 'density', 'arc', 'shot_statsbomb_xg']].dropna()
# The height is cast to an integer, which truncates its fractional part.
df_nan = df_nan.assign(pass_height_number=df_nan['pass_height_number'].astype('int64'))
Train = df_nan
yr_p = Train['goal_success']
Xr_p = Train.drop('goal_success', axis=1)

In [None]:
# Hold out the test set BEFORE oversampling. SMOTE builds synthetic rows by
# interpolating between neighbours, so resampling first lets information
# from test rows leak into the training data and inflates every test metric.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    Xr_p, yr_p, test_size=0.33, random_state=42, stratify=yr_p)
sm = SMOTE(random_state=42, k_neighbors=3)
X_train, y_train = sm.fit_resample(X_train_raw, y_train_raw)
# X_res / y_res feed the cross-validation cells below; they now hold the
# balanced training data only, so cross-validation never touches the test set.
# NOTE(review): the folds still contain synthetic rows, so CV scores stay optimistic.
X_res, y_res = X_train, y_train

In [None]:
# Standardise with statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

In [None]:
# Logistic regression on the standardised features.
clf = LogisticRegression(random_state=0).fit(X_train_scaled, y_train)
y_pred_p=clf.predict(X_test_scaled)
# Probability of the positive class: log-loss and ROC AUC are defined on
# scores, not on hard 0/1 predictions (which is what they were given before).
y_proba_p = clf.predict_proba(X_test_scaled)[:, 1]

score=clf.score(X_train_scaled, y_train)  # accuracy on the training split
rmse=np.sqrt(metrics.mean_squared_error(y_test,y_pred_p))
# Local names chosen so the imported r2_score function is not overwritten.
r2=metrics.r2_score(y_test,y_pred_p)
test_log_loss=metrics.log_loss(y_test,y_proba_p)
f1 = f1_score(y_test, y_pred_p)
roc_a = roc_auc_score(y_test, y_proba_p)
print('score: ',score)
print('rmse: ',rmse)
print('r2_score: ',r2)
print('log_loss ',test_log_loss)
print('f1 score: ', f1)
print('roc auc score', roc_a)

# One coefficient per (standardised) feature, printed as (index, value).
importance=clf.coef_[0]

for val in enumerate(importance):
    print("importance score  : {} ".format(val))

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred_p)

In [None]:
# Cross-validate the same kind of model that was trained above: the scaler is
# fitted inside every fold instead of feeding unscaled features to a logistic
# regression that was trained on standardised ones.
cv_clf = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))
crossV_accuracy = cross_val_score(cv_clf, X_res, y_res, cv=5, scoring='accuracy')
crossV_f1 = cross_val_score(cv_clf, X_res, y_res, cv=5, scoring='f1')
crossV_predict = cross_val_predict(cv_clf, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Random forest on the raw features; the seed is fixed so that a re-run of the
# notebook reproduces the same forest and the same scores.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

predictions = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
print('accuracy: ',accuracy)
print('f1 score: ',f1)

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, predictions)

In [None]:
# 5-fold cross-validation of the random forest: accuracy, F1 and out-of-fold predictions.
crossV_accuracy, crossV_f1 = [
    cross_val_score(rf_model, X_res, y_res, cv=5, scoring=metric)
    for metric in ('accuracy', 'f1')
]
crossV_predict = cross_val_predict(rf_model, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Gradient-boosted trees with default settings, fitted on the raw features.
xgbc_model = XGBClassifier().fit(X_train, y_train)
xgbc_model

In [None]:
# Test-split accuracy and F1 of the XGBoost model.
y_pred = xgbc_model.predict(X_test)
accuracy, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
print('accuracy: ',accuracy)
print('f1 score: ',f1)

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred)

In [None]:
# 5-fold cross-validation of the XGBoost model: accuracy, F1 and out-of-fold predictions.
crossV_accuracy, crossV_f1 = [
    cross_val_score(xgbc_model, X_res, y_res, cv=5, scoring=metric)
    for metric in ('accuracy', 'f1')
]
crossV_predict = cross_val_predict(xgbc_model, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Single decision tree on the raw features; seeded so that tie-breaking
# between equally good splits is reproducible across runs.
dtc = tree.DecisionTreeClassifier(random_state=42)
dtc = dtc.fit(X_train, y_train)

predictions = dtc.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
print('accuracy: ',accuracy)
print('f1 score: ',f1)

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, predictions)

In [None]:
# 5-fold cross-validation of the decision tree fitted above. This cell used to
# cross-validate xgbc_model again (copy-paste slip), so the tree itself was
# never cross-validated in the free-kick section.
crossV_accuracy = cross_val_score(dtc, X_res, y_res, cv=5, scoring='accuracy')
crossV_f1 = cross_val_score(dtc, X_res, y_res, cv=5, scoring='f1')
crossV_predict = cross_val_predict(dtc, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Linear-kernel SVM on the standardised features.
svm_model = svm.SVC(kernel='linear').fit(X_train_scaled, y_train)
predictions = svm_model.predict(X_test_scaled)
accuracy, f1 = accuracy_score(y_test, predictions), f1_score(y_test, predictions)
print('accuracy: ',accuracy)
print('f1 score: ',f1)

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, predictions)

In [None]:
# 5-fold cross-validation of the linear SVM. This cell used to cross-validate
# the decision tree (copy-paste slip). The scaler is fitted inside every fold
# so the SVM sees standardised features, as it did when it was trained.
cv_svm = make_pipeline(StandardScaler(), svm.SVC(kernel='linear'))
crossV_accuracy = cross_val_score(cv_svm, X_res, y_res, cv=5, scoring='accuracy')
crossV_f1 = cross_val_score(cv_svm, X_res, y_res, cv=5, scoring='f1')
crossV_predict = cross_val_predict(cv_svm, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Stack the five models. The stack is fitted on the raw features, so the two
# scale-sensitive members (logistic regression, linear SVM) get their own
# scaler; before, they received unscaled inputs inside the stack.
# StackingClassifier fits clones, so the standalone models stay untouched.
stck_model = StackingClassifier(
    estimators=[('rf', rf_model), ('xgb', xgbc_model),
                ('log reg', make_pipeline(StandardScaler(), clf)),
                ('decision tree', dtc),
                ('svm', make_pipeline(StandardScaler(), svm_model))],
    final_estimator=xgbc_model
)
stck_model.fit(X_train, y_train)
stck_predictions = stck_model.predict(X_test)
# Scored against the true labels. The removed `score(X_test, stck_predictions)`
# compared the model with its own predictions (always 1.0) and overwrote the
# imported r2_score function.
accuracy = accuracy_score(y_test, stck_predictions)
f1 = f1_score(y_test, stck_predictions)
print('accuracy: ',accuracy)
print('f1 score: ',f1)

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, stck_predictions)

In [None]:
# 5-fold cross-validation of the stacked model: accuracy, F1 and out-of-fold predictions.
crossV_accuracy, crossV_f1 = [
    cross_val_score(stck_model, X_res, y_res, cv=5, scoring=metric)
    for metric in ('accuracy', 'f1')
]
crossV_predict = cross_val_predict(stck_model, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Per-feature importance of every model, side by side. The linear models
# expose signed coefficients, the tree models expose importances.
log_reg_importance, svm_importance = clf.coef_[0], svm_model.coef_[0]
rf_importance, xgbc_importance, dtc_importance = (
    fitted.feature_importances_ for fitted in (rf_model, xgbc_model, dtc)
)
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'log_reg': log_reg_importance,
    'random_forest': rf_importance,
    'XGBClassifier': xgbc_importance,
    'decision_tree': dtc_importance,
    'svm': svm_importance,
})
# Turn the coefficients into shares of the total absolute weight.
for linear_col in ('log_reg', 'svm'):
    magnitude = feature_importance_df[linear_col].abs()
    feature_importance_df[linear_col] = magnitude/(magnitude.sum())
# Unweighted mean of the five importance columns.
feature_importance_df['feature_avg'] = np.array([
    feature_importance_df['log_reg'].values, rf_importance, xgbc_importance,
    dtc_importance, feature_importance_df['svm'].values]).mean(axis=0)
feature_importance_df

## Corners

In [None]:
# Modelling table for corners: target plus engineered features, complete rows only.
df_nan = corners_df.loc[:, ['goal_success', 'pass_height_number', 'pass_length',
                            'zone', 'end_zone', 'density', 'arc']].dropna()
# The height is cast to an integer, which truncates its fractional part.
df_nan = df_nan.assign(pass_height_number=df_nan['pass_height_number'].astype('int64'))
Train = df_nan
yr_p = Train['goal_success']
Xr_p = Train.drop('goal_success', axis=1)

In [None]:
# Hold out the test set BEFORE oversampling. SMOTE builds synthetic rows by
# interpolating between neighbours, so resampling first lets information
# from test rows leak into the training data and inflates every test metric.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    Xr_p, yr_p, test_size=0.33, random_state=42, stratify=yr_p)
sm = SMOTE(random_state=42, k_neighbors=3)
X_train, y_train = sm.fit_resample(X_train_raw, y_train_raw)
# X_res / y_res feed the cross-validation cells below; they now hold the
# balanced training data only, so cross-validation never touches the test set.
# NOTE(review): the folds still contain synthetic rows, so CV scores stay optimistic.
X_res, y_res = X_train, y_train

In [None]:
# Standardise with statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

In [None]:
# Logistic regression on the standardised features.
clf = LogisticRegression(random_state=0).fit(X_train_scaled, y_train)
y_pred_p=clf.predict(X_test_scaled)
# Probability of the positive class: log-loss and ROC AUC are defined on
# scores, not on hard 0/1 predictions (which is what they were given before).
y_proba_p = clf.predict_proba(X_test_scaled)[:, 1]

score=clf.score(X_train_scaled, y_train)  # accuracy on the training split
rmse=np.sqrt(metrics.mean_squared_error(y_test,y_pred_p))
# Local names chosen so the imported r2_score function is not overwritten.
r2=metrics.r2_score(y_test,y_pred_p)
test_log_loss=metrics.log_loss(y_test,y_proba_p)
f1 = f1_score(y_test, y_pred_p)
roc_a = roc_auc_score(y_test, y_proba_p)
print('score: ',score)
print('rmse: ',rmse)
print('r2_score: ',r2)
print('log_loss ',test_log_loss)
print('f1 score: ', f1)
print('roc auc score', roc_a)

# One coefficient per (standardised) feature, printed as (index, value).
importance=clf.coef_[0]

for val in enumerate(importance):
    print("importance score  : {} ".format(val))

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred_p)

In [None]:
# Cross-validate the same kind of model that was trained above: the scaler is
# fitted inside every fold instead of feeding unscaled features to a logistic
# regression that was trained on standardised ones.
cv_clf = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))
crossV_accuracy = cross_val_score(cv_clf, X_res, y_res, cv=5, scoring='accuracy')
crossV_f1 = cross_val_score(cv_clf, X_res, y_res, cv=5, scoring='f1')
crossV_predict = cross_val_predict(cv_clf, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Random forest on the raw features; the seed is fixed so that a re-run of the
# notebook reproduces the same forest and the same scores.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

predictions = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
print('accuracy: ',accuracy)
print('f1 score: ',f1)

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, predictions)

In [None]:
# 5-fold cross-validation of the random forest: accuracy, F1 and out-of-fold predictions.
crossV_accuracy, crossV_f1 = [
    cross_val_score(rf_model, X_res, y_res, cv=5, scoring=metric)
    for metric in ('accuracy', 'f1')
]
crossV_predict = cross_val_predict(rf_model, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Gradient-boosted trees with default settings, fitted on the raw features.
xgbc_model = XGBClassifier().fit(X_train, y_train)
xgbc_model

In [None]:
# Test-split accuracy and F1 of the XGBoost model.
y_pred = xgbc_model.predict(X_test)
accuracy, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
print('accuracy: ',accuracy)
print('f1 score: ',f1)

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred)

In [None]:
# 5-fold cross-validation of the XGBoost model: accuracy, F1 and out-of-fold predictions.
crossV_accuracy, crossV_f1 = [
    cross_val_score(xgbc_model, X_res, y_res, cv=5, scoring=metric)
    for metric in ('accuracy', 'f1')
]
crossV_predict = cross_val_predict(xgbc_model, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Single decision tree on the raw features; seeded so that tie-breaking
# between equally good splits is reproducible across runs.
dtc = tree.DecisionTreeClassifier(random_state=42)
dtc = dtc.fit(X_train, y_train)

predictions = dtc.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
print('accuracy: ',accuracy)
print('f1 score: ',f1)

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, predictions)

In [None]:
# 5-fold cross-validation of the decision tree fitted above. This cell used to
# cross-validate xgbc_model again (copy-paste slip), so the tree itself was
# never cross-validated in the corner section.
crossV_accuracy = cross_val_score(dtc, X_res, y_res, cv=5, scoring='accuracy')
crossV_f1 = cross_val_score(dtc, X_res, y_res, cv=5, scoring='f1')
crossV_predict = cross_val_predict(dtc, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Linear-kernel SVM on the standardised features.
svm_model = svm.SVC(kernel='linear').fit(X_train_scaled, y_train)
predictions = svm_model.predict(X_test_scaled)
accuracy, f1 = accuracy_score(y_test, predictions), f1_score(y_test, predictions)
print('accuracy: ',accuracy)
print('f1 score: ',f1)

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, predictions)

In [None]:
# Cross-validate the SVM the way it was trained: on standardised features.
# The scaler is fitted inside every fold; the unscaled features used before
# did not match the trained model.
cv_svm = make_pipeline(StandardScaler(), svm.SVC(kernel='linear'))
crossV_accuracy = cross_val_score(cv_svm, X_res, y_res, cv=5, scoring='accuracy')
crossV_f1 = cross_val_score(cv_svm, X_res, y_res, cv=5, scoring='f1')
crossV_predict = cross_val_predict(cv_svm, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Stack the five models. The stack is fitted on the raw features, so the two
# scale-sensitive members (logistic regression, linear SVM) get their own
# scaler; before, they received unscaled inputs inside the stack.
# StackingClassifier fits clones, so the standalone models stay untouched.
stck_model = StackingClassifier(
    estimators=[('rf', rf_model), ('xgb', xgbc_model),
                ('log reg', make_pipeline(StandardScaler(), clf)),
                ('decision tree', dtc),
                ('svm', make_pipeline(StandardScaler(), svm_model))],
    final_estimator=xgbc_model
)
stck_model.fit(X_train, y_train)
stck_predictions = stck_model.predict(X_test)
# Scored against the true labels. The removed `score(X_test, stck_predictions)`
# compared the model with its own predictions (always 1.0) and overwrote the
# imported r2_score function.
accuracy = accuracy_score(y_test, stck_predictions)
f1 = f1_score(y_test, stck_predictions)
print('accuracy: ',accuracy)
print('f1 score: ',f1)

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, stck_predictions)

In [None]:
# 5-fold cross-validation of the stacked model: accuracy, F1 and out-of-fold predictions.
crossV_accuracy, crossV_f1 = [
    cross_val_score(stck_model, X_res, y_res, cv=5, scoring=metric)
    for metric in ('accuracy', 'f1')
]
crossV_predict = cross_val_predict(stck_model, X_res, y_res, cv=5, method='predict')
print('accuracy: ', crossV_accuracy)
print('f1 score: ', crossV_f1)

In [None]:
# Per-feature importance of every model, side by side. The linear models
# expose signed coefficients, the tree models expose importances.
log_reg_importance, svm_importance = clf.coef_[0], svm_model.coef_[0]
rf_importance, xgbc_importance, dtc_importance = (
    fitted.feature_importances_ for fitted in (rf_model, xgbc_model, dtc)
)
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'log_reg': log_reg_importance,
    'random_forest': rf_importance,
    'XGBClassifier': xgbc_importance,
    'decision_tree': dtc_importance,
    'svm': svm_importance,
})
# Turn the coefficients into shares of the total absolute weight.
for linear_col in ('log_reg', 'svm'):
    magnitude = feature_importance_df[linear_col].abs()
    feature_importance_df[linear_col] = magnitude/(magnitude.sum())
# Unweighted mean of the five importance columns.
feature_importance_df['feature_avg'] = np.array([
    feature_importance_df['log_reg'].values, rf_importance, xgbc_importance,
    dtc_importance, feature_importance_df['svm'].values]).mean(axis=0)
feature_importance_df