Find the markdown blocks marked "Interaction required" and fill them in; the notebook should take care of the rest!

# Import libs

In [3]:
import sys
import os
sys.path.append('..')
from eflow.foundation import DataPipeline,DataFrameTypes
from eflow.model_analysis import ClassificationAnalysis
from eflow.utils.modeling_utils import optimize_model_grid
from eflow.utils.eflow_utils import get_type_holder_from_pipeline, remove_unconnected_pipeline_segments
from eflow.utils.pandas_utils import data_types_table

import pandas as pd
import numpy as np
import scikitplot as skplt
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import copy
from IPython.display import clear_output

In [None]:
# # Additional add ons
# !pip install pandasgui
# !pip install pivottablejs
# clear_output()

In [None]:
# Select the matplotlib backend for this notebook. Both magics run in order,
# so the `inline` backend is the one left in effect.
# NOTE(review): the `notebook` line looks redundant -- confirm it is not a
# deliberate workaround before removing it.
%matplotlib notebook
%matplotlib inline

## Declare Project Variables

### Interaction required

In [None]:
# Location of the raw CSV that is loaded into the working dataframe.
dataset_path = "Datasets/titanic_train.csv"

# Names handed to eflow: the dataset name is also used when building the
# df_features.json path, and the pipeline name identifies the DataPipeline.
dataset_name, pipeline_name = "Titanic Data", "Titanic Pipeline"

# Forwarded as `notebook_mode=` to the eflow display and analysis calls.
notebook_mode = True

## Clean out segment space

In [None]:
# Housekeeping step run before the pipeline is built; called with no arguments.
# NOTE(review): presumably deletes saved pipeline segments that no pipeline
# references any more -- confirm against eflow.utils.eflow_utils.
remove_unconnected_pipeline_segments()

# Import dataset

In [None]:
# Read the raw CSV into the working dataframe.
df = pd.read_csv(dataset_path)

# One-row summary table holding the dataframe's dimensions.
shape_df = pd.DataFrame(
    {"Rows": [df.shape[0]], "Columns": [df.shape[1]]}
)
display(shape_df)

# Preview the first 30 records.
display(df.head(30))

In [None]:
# Show eflow's per-column data type table for the freshly loaded dataframe
# (the result is the cell's output; nothing is assigned).
data_types_table(df)

# Loading and init df_features

In [None]:
# Option: 1
# df_features = get_type_holder_from_pipeline(pipeline_name)

In [None]:
# Option: 2
# Rebuild the feature-type holder from the JSON file stored at
# "<cwd>/eflow Data/<dataset_name>/df_features.json".
df_features = DataFrameTypes()
# Join the path components with os.path.join instead of hand-writing "/"
# separators into the string.
df_features.init_on_json_file(
    os.path.join(os.getcwd(), "eflow Data", dataset_name, "df_features.json")
)

In [None]:
# Print the feature groupings held by df_features so they can be checked
# before the pipeline is built; `notebook_mode` is the flag set in the
# project-variables cell.
df_features.display_features(display_dataframes=True,
                             notebook_mode=notebook_mode)

# Any extra processing before eflow DataPipeline

# Setup pipeline structure

### Interaction Required

In [None]:
# Create the pipeline object identified by `pipeline_name`, passing it the
# working dataframe and its feature-type holder.
# NOTE(review): presumably this loads previously saved pipeline segments for
# that name -- confirm against eflow.foundation.DataPipeline.
main_pipe = DataPipeline(pipeline_name,
                         df,
                         df_features)

In [None]:
# Run the pipeline on the working dataframe.
# NOTE(review): the return value is discarded, so this presumably transforms
# `df` (and `df_features`) in place -- confirm.
main_pipe.perform_pipeline(df,
                           df_features)

In [None]:
# Display the dataframe as it stands after the pipeline run.
df

# Separate out data into train and test sets

In [None]:
# Feature matrix: every column except the target feature, taken as raw values.
X = df.drop(columns=df_features.target_feature()).values
# Target vector: the values of the target feature column only.
y = df[df_features.target_feature()].values

In [None]:
# Hold out 30% of the rows for testing. The fixed random_state keeps the
# split reproducible, and stratify=y keeps the class proportions of y the
# same in both the train and the test portion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=517, stratify=y,
)

In [None]:
# Column names that line up with the columns of X. X is built from df with
# the target column dropped, so the target name is left out here as well;
# otherwise this list would be one entry longer than X has columns.
feature_order = [
    column for column in df.columns
    if column != df_features.target_feature()
]

In [None]:
# Drop the notebook's reference to the dataframe; from here on only the
# X/y arrays and `feature_order` derived from it are used.
del df

# Fit Models and view results

In [None]:
# Hyper-parameter search space for the decision tree. Only max_depth is
# active; uncomment the other entries to widen the search.
param_grid = {
    "max_depth": [1, 2, 3],
#     "min_samples_leaf": list(range(80, 130, 5)),
#     "criterion": ["gini", "entropy"],
#     "n_splits": [20, 30]
}

# Search the grid on the training split, scored with micro-averaged F1.
# The helper's result is unpacked as (model, best_params).
model, best_params = optimize_model_grid(
    model=DecisionTreeClassifier(),
    X_train=X_train,
    y_train=y_train,
    param_grid=param_grid,
    scoring="f1_micro",
)

In [None]:
# Use the estimator's class name (e.g. "DecisionTreeClassifier") as the model
# label. Reading it from the type avoids parsing repr(), which only works
# when the repr happens to look like "ClassName(...)".
model_name = type(model).__name__

In [None]:
# Build the analysis object for the fitted model. It receives the names used
# for the dataset and model, the feature column order, the target feature,
# the model's two prediction callables, and one training row as sample data.
model_analysis = ClassificationAnalysis(
    dataset_name=dataset_name,
    model=model,
    model_name=model_name,
    feature_order=feature_order,
    target_feature=df_features.target_feature(),
    pred_funcs_dict={
        "Probabilities function": model.predict_proba,
        "Predict function": model.predict,
    },
    sample_data=X_train[0],
    notebook_mode=notebook_mode,
    df_features=df_features,
)

In [None]:
# Analyse the model on the training split, with both the error and the
# correct-prediction analyses switched on.
model_analysis.perform_analysis(
    X=X_train,
    y=y_train,
    dataset_name="Train Data",
    thresholds_matrix=[
        [0.0, 0.0],
        [0.12, 0.04],
    ],
    classification_error_analysis=True,
    classification_correct_analysis=True,
)

In [None]:
# Analyse the model on the held-out test split, with both the error and the
# correct-prediction analyses switched on.
model_analysis.perform_analysis(
    X=X_test,
    y=y_test,
    dataset_name="Test Data",
    thresholds_matrix=[
        [0.0, 0.0],
        [0.12, 0.04],
    ],
    classification_error_analysis=True,
    classification_correct_analysis=True,
)