
## Imports

In [None]:
cd ..

In [None]:
pwd

In [None]:
import os, sys, pickle

from IPython.display import display

import requests

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
import seaborn as sns
sns.set()
import plotly.express as px

from sklearn.model_selection import * 
from sklearn.compose import *
from sklearn.pipeline import *
from sklearn.impute import * 
from sklearn.preprocessing import * 
from sklearn.ensemble import * 
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.dummy import DummyClassifier

from category_encoders import TargetEncoder

# import shap

## Load data

In [None]:
cleaned = "./data/cleaned/"

In [None]:
df = pd.read_csv(cleaned+"df.csv")

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

## Prepare

In [None]:
# Same preview again; df has not been modified since it was loaded.
df.head()

In [None]:
# Features / target view of the complete dataset.
# NOTE(review): neither X nor y is referenced again in the visible part of
# this notebook — the split below works on df directly.
y = df["Survived"]
X = df.drop(columns=["Survived"])

In [None]:
# Hold out a test set (train_test_split's default test size).
# - stratify keeps the Survived class ratio close to identical in both parts,
#   consistent with the stratified CV used for the grid search.
# - a fixed random_state makes the split reproducible, and with it the
#   exported df_train.csv / df_test.csv and the reported test score.
df_train, df_test = train_test_split(
    df,
    stratify=df["Survived"],
    random_state=42,
)

In [None]:
# Sanity check of the split: first rows of the training part.
df_train.head()

In [None]:
# (rows, columns) of the training part.
df_train.shape

In [None]:
# (rows, columns) of the held-out part.
df_test.shape

In [None]:
y_train = df_train.Survived.values

In [None]:
# X_train_PassengerId = df_train.PassengerId.values

X_train = df_train.drop(columns=["Survived", ], inplace=False)
X_train.head()

In [None]:
# First ten training labels.
y_train[:10]

In [None]:
# Number of training labels ...
len(y_train)

In [None]:
# ... which should equal the number of training rows.
len(X_train)

In [None]:
y_test = df_test.Survived.values


In [None]:
# X_test_PassengerId = df_test.PassengerId.values
X_test = df_test.drop(columns=["Survived" ], inplace=False)
X_test.head()

In [None]:
# Names of the columns with a numeric dtype.
X_train.select_dtypes(include=np.number).columns

In [None]:
# Names of all remaining (non-numeric) columns.
X_train.select_dtypes(exclude=np.number).columns

## Model

In [None]:
# Column-wise preprocessing:
#   - the numeric columns are forwarded unchanged ("passthrough");
#   - the categorical columns are one-hot encoded, and a category unseen
#     during fit is ignored instead of raising (handle_unknown="ignore");
#   - every column not listed here is dropped (remainder="drop").
numeric_columns = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
categorical_columns = ["Sex", "Embarked"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_columns),
        ("oneHot", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    ],
    remainder="drop",
)

In [None]:
# Raw features, for comparison with the transformed output in the next cell.
X_train.head(10)

In [None]:
# Dry run of the preprocessing step on its own, to inspect its output.
# NOTE: this fits the shared `preprocessor` object in place. `tmp` is only
# used for the display below and is not referenced again in the visible
# part of this notebook.
tmp = preprocessor.fit_transform(X_train)
# No column names are passed here, so (assuming tmp is a plain array) the
# columns are labelled by position.
pd.DataFrame(tmp).head(10)

In [None]:
# Missing-value imputer with default settings; it is plugged into the
# pipeline as its "imputer" step rather than applied by hand.
imputer = KNNImputer()

# Disabled stand-alone imputation of the feature frames (note that no fit
# call is visible before these transform calls):
# X_train = pd.DataFrame(imputer.transform(X_train), columns=X_train.columns)
# X_test = pd.DataFrame(imputer.transform(X_test), columns=X_train.columns)
# X_train.info()

In [None]:
# End-to-end pipeline: column preprocessing -> imputation -> classifier.
# The "estimator" step given here acts as a placeholder: the grid search
# swaps in each candidate listed under the "estimator" key of param_grid.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("imputer", imputer),
        ("estimator", LGBMClassifier()),
    ]
)
pipe

In [None]:
# Candidates for the pipeline's "estimator" step: a DummyClassifier and an
# LGBMClassifier, both with default arguments.
param_grid = dict(
    estimator=[DummyClassifier(), LGBMClassifier()],
)


In [None]:
# Exhaustive search over param_grid, run on all available cores.
# Cross-validation: 10 stratified random splits, each holding out 33 % of
# the rows passed to fit(). The fixed random_state pins those splits so the
# CV scores are reproducible and comparable between runs.
# return_train_score=True adds the train-fold scores to cv_results_.
grid = GridSearchCV(
    pipe,
    param_grid,
    cv=StratifiedShuffleSplit(n_splits=10, test_size=0.33, random_state=42),
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

## Train

In [None]:
# Run the search on the training split only; X_test / y_test are not used here.
grid.fit(X_train, y_train)

In [None]:
# Tabulate the search results, hiding the per-split columns (any column
# whose name contains "split"), and rank the candidates by mean CV score.
res = pd.DataFrame(grid.cv_results_)
cols = [name for name in res.columns if "split" not in name]
res = res[cols]
res.sort_values(by="mean_test_score", ascending=False).round(2)

## Test

In [None]:
# Score on the very data the search was fitted on: a training score, not a
# held-out estimate.
grid.score(X_train, y_train)

In [None]:
# Best pipeline found by the search.
model = grid.best_estimator_
model

In [None]:
# NOTE(review): GridSearchCV is created without `refit`, and its documented
# default (refit=True) already fits best_estimator_ on the full data given to
# grid.fit(). This call appears to repeat that fit — confirm it is needed.
model.fit(X_train, y_train)

In [None]:
# Score on the held-out split, which took no part in the search.
model.score(X_test, y_test)

In [None]:
pre = grid.best_estimator_['preprocessor']
pre


In [None]:
imp = grid.best_estimator_['imputer']
imp

In [None]:
est = grid.best_estimator_['estimator']
est

## Export

In [None]:
with open("./models/model.pk", "wb") as f : 
    pk = pickle.dumps(model)
    f.write(pk)

In [None]:
with open("./models/preprocessor.pk", "wb") as f : 
    pk = pickle.dumps(pre)
    f.write(pk)

In [None]:
with open("./models/imputer.pk", "wb") as f : 
    pk = pickle.dumps(imp)
    f.write(pk)

In [None]:
with open("./models/estimator.pk", "wb") as f : 
    pk = pickle.dumps(est)
    f.write(pk)

In [None]:
# Row count of the training features and of the training labels; the two
# must match before the labels are attached as a column below.
display(len(X_train))
# display(len(X_train_PassengerId))
display(len(y_train))

In [None]:
# Number of training labels (same value as displayed above).
len(y_train)

In [None]:
# Re-attach the target so the training split can be exported as one table.
# NOTE: this adds the column to X_train in place, so from here on X_train
# also contains "Survived". The preprocessor selects its input columns by
# name with remainder="drop", so the extra column is not fed to the model.
X_train["Survived"] = y_train
# X_train["PassengerId"] = X_train_PassengerId
X_train.head()

In [None]:
# Missing values per column in the frame about to be exported.
X_train.isna().sum()

In [None]:
# Write the training split to the cleaned-data directory, without the index.
X_train.to_csv("./data/cleaned/df_train.csv", index=False)

In [None]:
# Re-attach the target so the held-out split can be exported as one table.
# NOTE: this adds the column to X_test in place, so from here on X_test
# also contains "Survived".
X_test["Survived"] = y_test
# X_test["PassengerId"] = X_test_PassengerId
X_test.head()

In [None]:
# Missing values per column in the frame about to be exported.
X_test.isna().sum()

In [None]:
# Write the held-out split to the cleaned-data directory, without the index.
X_test.to_csv("./data/cleaned/df_test.csv", index=False)