
## Import 

In [None]:
import os, sys, pickle

import requests

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
import seaborn as sns
sns.set()
import plotly.express as px

from sklearn.model_selection import * 
from sklearn.compose import *
from sklearn.pipeline import *
from sklearn.impute import * 
from sklearn.preprocessing import * 
from sklearn.ensemble import * 
# from xgboost import XGBClassifier



# import shap

## Load data

In [None]:
# Relative path of the cleaned dataset CSV (a local file path, despite the name).
url = "./data/cleaned/df.csv"

In [None]:
# Load the cleaned dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Preview the first rows of the loaded data.
df.head()

## Prepare

In [None]:
# Preview the data again before splitting it.
df.head()

In [None]:
# Separate the target column from the remaining columns of the full frame.
y = df["Survived"]
X = df.drop(columns=["Survived"])

In [None]:
# Split the rows into a training and a test frame.
# A fixed seed keeps the split reproducible, so the CSV files and the pickled
# model exported further down describe the same partition on every run.
df_train, df_test = train_test_split(df, random_state=42)

In [None]:
# Preview the first rows of the training split.
df_train.head()

In [None]:
# (rows, columns) of the training split.
df_train.shape

In [None]:
# (rows, columns) of the test split.
df_test.shape

In [None]:
# Pull the labels and passenger ids out of the training split as arrays; every
# other column becomes a model feature.
y_train = df_train["Survived"].values
X_train_PassengerId = df_train["PassengerId"].values
X_train = df_train.drop(["Survived", "PassengerId"], axis="columns")
X_train.head()

In [None]:
# Peek at the first ten training labels.
y_train[:10]

In [None]:
# Number of training labels; should match the number of training rows.
len(y_train)

In [None]:
# Number of training rows.
len(X_train)

In [None]:
# Pull the labels and passenger ids out of the test split as arrays; every
# other column becomes a model feature.
y_test = df_test["Survived"].values
X_test_PassengerId = df_test["PassengerId"].values
X_test = df_test.drop(["Survived", "PassengerId"], axis="columns")
X_test.head()

## Model

In [None]:
# Fill missing values with a KNN imputer that is fitted on the training
# features only and then applied to both splits.
imputer = KNNImputer()
imputer.fit(X_train)

# `transform` returns plain arrays, so both results are wrapped back into
# DataFrames carrying the training column names. Both frames are built before
# either name is rebound.
X_train, X_test = [
    pd.DataFrame(imputer.transform(part), columns=X_train.columns)
    for part in (X_train, X_test)
]
X_train.info()

In [None]:
# 10-fold cross-validated search around a default random forest. The parameter
# grid is empty, so only the default hyper-parameters are evaluated.
grid = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid={},
    cv=10,
    return_train_score=True,
    n_jobs=-1,
)

## Train

In [None]:
# Run the cross-validated search on the training features and labels.
grid.fit(X_train, y_train)

In [None]:
# Tabulate the cross-validation results, hiding the per-split columns so only
# the aggregated ones remain.
res = pd.DataFrame(grid.cv_results_)
cols = [name for name in res.columns if "split" not in name]
res = res[cols]
res

## Test

In [None]:
# Score the fitted search on the same data it was trained on.
# NOTE(review): this is a training-set score, not a held-out one, even though
# it sits under the "Test" heading.
grid.score(X_train, y_train)

In [None]:
# Keep the estimator selected by the search and display it.
model = grid.best_estimator_
model

In [None]:
# Fit the selected estimator on the full training set.
# NOTE(review): with GridSearchCV's default refit=True the best estimator has
# presumably already been fitted on this data - confirm this second fit is wanted.
model.fit(X_train, y_train)

In [None]:
# Score the model on the held-out test split.
model.score(X_test, y_test)

## Export

In [None]:
# Serialize the fitted model to disk with pickle.
with open("./models/model.pk", "wb") as f:
    pickle.dump(model, f)

In [None]:
from IPython.display import display

In [None]:
# Show the lengths of the training features, ids and labels; the three numbers
# must agree before the columns are stitched back together.
for _sized in (X_train, X_train_PassengerId, y_train):
    display(len(_sized))

In [None]:
# Number of training labels.
len(y_train)

In [None]:
# Re-attach the labels and passenger ids so the imputed training set can be
# saved as a single table.
# NOTE: this adds the two columns to X_train itself, so every later cell that
# uses X_train sees them in addition to the model's feature columns.
X_train["Survived"] = y_train
X_train["PassengerId"] = X_train_PassengerId
X_train.head()

In [None]:
# Count missing values per column of the training table before saving it.
X_train.isna().sum()

In [None]:
# Write the training table to CSV without the index column.
X_train.to_csv("./data/cleaned/df_train.csv", index=False)

In [None]:
# Re-attach the labels and passenger ids so the imputed test set can be saved
# as a single table.
# NOTE: this adds the two columns to X_test itself, so every later cell that
# uses X_test sees them in addition to the model's feature columns.
X_test["Survived"] = y_test
X_test["PassengerId"] = X_test_PassengerId
X_test.head()

In [None]:
# Count missing values per column of the test table before saving it.
X_test.isna().sum()

In [None]:
# Write the test table to CSV without the index column.
X_test.to_csv("./data/cleaned/df_test.csv", index=False)

## Explain

In [None]:
estimator = model
feat = pd.Series(estimator.feature_importances_, index=X_test.columns)# .sort_values(ascending=False)
px.bar(feat)

In [None]:
# Bar chart of the model's feature importances, most important first.
# The importances are labelled with the column names the estimator was fitted
# on. X_test cannot be used for this any more: by this point it also carries
# the "Survived" and "PassengerId" columns, so its column index is longer than
# the importance vector and pd.Series would raise.
estimator = model
feat = pd.Series(
    estimator.feature_importances_, index=estimator.feature_names_in_
).sort_values(ascending=False)
px.bar(feat)

In [None]:
# Global SHAP summary (bar plot) of the model on the training data.
# shap is imported here because the import at the top of the notebook is
# commented out; without it this cell fails with a NameError.
import shap

# X_train also holds "Survived" and "PassengerId" by now, so restrict it to the
# columns the model was fitted on before handing it to the explainer.
feature_frame = X_train[list(model.feature_names_in_)]

explainer = shap.Explainer(model)
shap_values = explainer.shap_values(feature_frame)
shap.summary_plot(shap_values, feature_frame, plot_type="bar", show=True)

In [None]:
# Disabled cell (kept as a string literal, so nothing runs): draws the SHAP
# summary bar plot without showing it and saves the figure to 'grafic.png'.
"""
shap.summary_plot(shap_values, X_train, plot_type="bar",show=False)
plt.savefig('grafic.png')
"""

In [None]:
# Disabled cell (kept as a string literal, so nothing runs): waterfall plot
# for the first entry of shap_values.
"""
shap.plots.waterfall(shap_values[0])
"""

In [None]:
# Disabled cell (kept as a string literal, so nothing runs): alternative that
# builds a TreeExplainer with X_train as background data, calls it on X_train
# and draws the waterfall plot for the first entry.
"""

explainer = shap.TreeExplainer(model, X_train)
shap_values = explainer(X_train)
shap.plots.waterfall(shap_values[0])
"""

In [None]:
# Waterfall plot explaining the model's prediction for the first training row.
# shap is imported here because the import at the top of the notebook is
# commented out.
import shap

# shap.plots.waterfall only accepts a shap.Explanation for a single sample and
# a single output; the raw array returned by explainer.shap_values() is
# rejected. Calling the explainer itself yields an Explanation. X_train is
# restricted to the columns the model was fitted on, because it also carries
# "Survived" and "PassengerId" by now.
explanation = shap.Explainer(model)(X_train[list(model.feature_names_in_)])

# For a classifier the explanation has one slice per class, so pick row 0 and
# class index 1 (the second entry of model.classes_).
# NOTE(review): presumably that is Survived == 1 - confirm against model.classes_.
shap.plots.waterfall(explanation[0, :, 1])

## Save