In [None]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn import metrics

import shap
import matplotlib.pyplot as plt

In [None]:
# Load the flight records and keep a reproducible 10% sample of the rows.
data = pd.read_csv("flights_red.csv")
data = data.sample(frac=0.1, random_state=10)

# Keep only the modelling columns and drop every row with a missing value.
# dropna() is chained and followed by an explicit copy (instead of
# dropna(inplace=True) on a column selection) so the column assignments
# below act on an independent frame and cannot hit pandas'
# chained-assignment warnings.
data = data[["MONTH","DAY","DAY_OF_WEEK","AIRLINE","FLIGHT_NUMBER","DESTINATION_AIRPORT",
             "ORIGIN_AIRPORT","AIR_TIME", "DEPARTURE_TIME","DISTANCE","ARRIVAL_DELAY"]].dropna().copy()

# Binary target: 1 when ARRIVAL_DELAY is greater than 10, otherwise 0
# (presumably minutes -- TODO confirm against the dataset documentation).
data["ARRIVAL_DELAY"] = (data["ARRIVAL_DELAY"] > 10) * 1

# Integer-encode the categorical columns; the +1 shifts pandas' category
# codes up by one.
cols = ["AIRLINE","FLIGHT_NUMBER","DESTINATION_AIRPORT","ORIGIN_AIRPORT"]
for item in cols:
    data[item] = data[item].astype("category").cat.codes + 1

# Train/test split: 25% of the rows are held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(["ARRIVAL_DELAY"], axis=1),
    data["ARRIVAL_DELAY"],
    random_state=10,
    test_size=0.25,
)

# Split the training rows again into the final training set and a validation
# set (20%), used for validation when fitting XGBoost.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

def auc(m, train, test, train_labels=None, test_labels=None):
    """Return the ROC AUC of a fitted classifier on a train and a test set.

    Parameters
    ----------
    m : fitted classifier exposing ``predict_proba``.
    train : feature matrix scored against ``train_labels``.
    test : feature matrix scored against ``test_labels``.
    train_labels : true labels for ``train``. When omitted, the notebook-level
        ``y_train`` is used, which matches the original behaviour.
    test_labels : true labels for ``test``. When omitted, the notebook-level
        ``y_test`` is used, which matches the original behaviour.

    Returns
    -------
    tuple
        ``(train_auc, test_auc)``.
    """
    # Fall back to the notebook globals so existing three-argument calls keep
    # working; passing labels explicitly lets the helper score other splits
    # (e.g. the validation set) without depending on hidden state.
    if train_labels is None:
        train_labels = y_train
    if test_labels is None:
        test_labels = y_test

    # Column 1 of predict_proba is used as the score for the positive class.
    train_scores = m.predict_proba(train)[:, 1]
    test_scores = m.predict_proba(test)[:, 1]

    return (metrics.roc_auc_score(train_labels, train_scores),
            metrics.roc_auc_score(test_labels, test_scores))

In [None]:
# XGBoost classifier for the binary delay target. Keyword arguments are
# grouped by purpose; the values are exactly those of the original call.
# NOTE(review): `silent`, `seed`, and the fit-time `eval_metric` /
# `early_stopping_rounds` arguments are deprecated or removed in newer
# xgboost releases -- confirm against the installed version before upgrading.
model = xgb.XGBClassifier(
    # objective and seeding
    objective='binary:logistic',
    seed=1,
    # boosting schedule
    n_estimators=100,
    learning_rate=0.1,
    # tree structure
    max_depth=10,
    min_child_weight=1,
    gamma=0,
    max_delta_step=0,
    # row / column sampling
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    # regularisation terms
    reg_alpha=0,
    reg_lambda=0,
    # class weighting
    scale_pos_weight=1,
    # missing-value marker and logging
    missing=None,
    silent=False,
)

# Both the training and the validation split are evaluated during fitting.
eval_set = [(X_train, y_train), (X_val, y_val)]
model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric=["error", "logloss"],
    early_stopping_rounds=50,
    verbose=True,
)

# (train AUC, test AUC) of the fitted model, displayed as the cell output.
auc(model, X_train, X_test)

# Observando o logloss de validação

In [None]:
# Plot the logloss recorded during fitting, one figure per evaluation set.
# evals_result() is fetched once instead of on every iteration, and each
# figure gets a title and axis labels so the evaluation sets can be told apart.
evals_history = model.evals_result()
for key, history in evals_history.items():
    # Naming the column gives the legend a meaningful entry instead of "0".
    ax = pd.DataFrame({"logloss": history["logloss"]}).plot(
        kind='line', title="{} - logloss".format(key)
    )
    ax.set_xlabel("boosting round")
    ax.set_ylabel("logloss")

# Feature Importance

In [None]:
# NOTE(review): matplotlib.pyplot is already imported in the first cell, so
# this re-import is redundant.
import matplotlib.pyplot as plt

# Plot the feature importances of the fitted model; no options are passed, so
# plot_importance's default settings apply.
xgb.plot_importance(model)

# SHAP Values

In [None]:
# Load SHAP's JavaScript visualisation code into the notebook
shap.initjs()

In [None]:
# Computing the SHAP values takes a while.
# Build a tree explainer for the fitted model and compute SHAP values for
# every row of the test set.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)


In [None]:
# Force plot for a single prediction: the first row of X_test
shap.force_plot(explainer.expected_value, shap_values[0,:], X_test.iloc[0,:])

In [None]:
# Summary plot of the SHAP values computed for the whole test set
shap.summary_plot(shap_values, X_test)