# Expected goals (xG) modelling

In [None]:
import os, sys
# Make the parent directory importable; the `src.utils` import below
# presumably relies on it (the `src` package is not visible from here).
module_path = os.path.abspath(os.pardir)
# Guard against duplicates: re-running this cell must not keep growing sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)
from src.utils import split_dataset, evaluation_metrics, print_evaluation_metrics, plot_evaluation
import numpy
import pandas
import xgboost as xgb
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import LabelBinarizer, StandardScaler
import matplotlib.pyplot as plt
import scikitplot as skplt

## Load and split data

In [None]:
# Read the shot data CSV (path is relative to the working directory) and
# preview its first rows as the cell output.
data = pandas.read_csv(filepath_or_buffer="../data/train/shot_data.csv")
data.head()

In [None]:
# Split the loaded shots into train/test features and targets.
# NOTE(review): 0.2 is presumably the held-out test fraction and "is_goal" the
# target column — confirm against src.utils.split_dataset, including the order
# of the four returned values.
(x_train, y_train, x_test, y_test) = split_dataset(
    data,
    0.2,
    "is_goal",
)

## Feature Engineering

In [None]:
# Input columns used by the model, grouped by the preprocessing they receive.
numeric_features = [
    "minute",
    "second",
    "x_shot",
    "y_shot",
    "goal_distance",
]
categorical_features = [
    "previous_type_name",
]

In [None]:
# Column-wise preprocessing:
#   * all numeric columns are standardised together by a single StandardScaler;
#   * every categorical column gets its own LabelBinarizer. The column is
#     selected by a plain string (not a list) so the transformer receives a
#     1-D array, which is the input LabelBinarizer expects.
# The categorical entries are built from the whole `categorical_features` list,
# so columns added there are picked up automatically instead of only the first
# one being encoded (and an empty list no longer raises IndexError).
feature_engineering = DataFrameMapper(
    [(numeric_features, StandardScaler())]
    + [(column, LabelBinarizer()) for column in categorical_features]
)

## Modelling

In [None]:
# Gradient-boosted tree classifier with a fixed seed for reproducible runs.
# NOTE(review): scale_pos_weight=9 presumably compensates for goals being the
# minority class (roughly 1 positive for 9 negatives) — confirm against the data.
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=10,
    scale_pos_weight=9,
    random_state=42,
)

## Pipeline

In [None]:
# Chain preprocessing and the classifier so both are fitted and applied together.
pipeline_steps = [
    ("feature_engineering", feature_engineering),
    ("model", model),
]
pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Fit preprocessing + model on the training split, restricted to the columns
# the feature mapper knows about.
training_columns = numeric_features + categorical_features
pipeline.fit(x_train[training_columns], y_train)

## Evaluation

In [None]:
# Select the model's input columns from the test split once and reuse them
# for both the class predictions and the predicted probabilities.
x_test_features = x_test[numeric_features + categorical_features]
y_pred = pipeline.predict(x_test_features)
y_pred_proba = pipeline.predict_proba(x_test_features)

# Compute, print and plot the evaluation using the project helpers.
metrics = evaluation_metrics(y_test, y_pred)
print_evaluation_metrics(metrics)
plot_evaluation(y_test, y_pred, y_pred_proba)