Find the markdown blocks that say interaction required! The notebook should take care of the rest!

# Import libs

In [1]:
import sys
import os
sys.path.append('..')
from eflow.foundation import DataPipeline,DataFrameTypes
from eflow.data_analysis import FeatureAnalysis, NullAnalysis
from eflow.model_analysis import ClassificationAnalysis
from eflow.data_pipeline_segments import FeatureTransformer, TypeFixer, DataEncoder, FeatureDataCleaner
from eflow.utils.modeling_utils import optimize_model_grid
from eflow.utils.eflow_utils import get_type_holder_from_pipeline, remove_unconnected_pipeline_segments
from eflow.utils.math_utils import get_unbalanced_threshold
from eflow.utils.sys_utils import create_dir_structure
from eflow.utils.eflow_utils import create_color_dict_for_features
from eflow.utils.pandas_utils import missing_values_table,data_types_table, value_counts_table, suggest_removal_features 
from eflow.widgets import ColorLabelingWidget

import pandas as pd
import numpy as np
import scikitplot as skplt
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import copy
from IPython.display import clear_output
from IPython.core.getipython import get_ipython
import ipython_blocking

In [2]:
# # Additional add ons
# !pip install pandasgui
# !pip install pivottablejs
# clear_output()

In [3]:
%matplotlib notebook
%matplotlib inline

## Declare Project Variables

### Interaction required

In [4]:
dataset_path = "Datasets/titanic_train.csv"

# -----
dataset_name = "Titanic Data"
pipeline_name = "Titanic Pipeline"

# -----


# -----
notebook_mode = True

## Clean out segment space

In [5]:
remove_unconnected_pipeline_segments()

# Import dataset

In [6]:
df = pd.read_csv(dataset_path)
shape_df = pd.DataFrame.from_dict({'Rows': [df.shape[0]],
                                   'Columns': [df.shape[1]]})
display(shape_df)
display(df.head(30))

Unnamed: 0,Rows,Columns
0,891,12


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [7]:
data_types_table(df)

Unnamed: 0_level_0,Data Types
Features,Unnamed: 1_level_1
Age,float64
Fare,float64
PassengerId,int64
Survived,int64
Pclass,int64
SibSp,int64
Parch,int64
Name,object
Sex,object
Ticket,object


# Loading and init df_features

In [8]:
# Option: 1
# df_features = get_type_holder_from_pipeline(pipeline_name)

In [9]:
# Option: 2
df_features = DataFrameTypes()
df_features.init_on_json_file(os.getcwd() + f"/eflow Data/{dataset_name}/df_features.json")

In [10]:
df_features.display_features(display_dataframes=True,
                             notebook_mode=notebook_mode)

Unnamed: 0_level_0,Data Types
Features,Unnamed: 1_level_1
Pclass,string
Embarked,string
Sex,string
Cabin,string
Survived,bool
Parch,integer
SibSp,integer
Age,float
Fare,float


# Any extra processing before eflow DataPipeline

# Setup pipeline structure

### Interaction Required

In [11]:
main_pipe = DataPipeline(pipeline_name,
                         df,
                         df_features)

The file 'root_pipeline.json' exist!
Now configuring object with proper pipeline segments...
Removing the feature: "Name"
Removing the feature: "PassengerId"
Removing the feature: "Ticket"


In [12]:
main_pipe.perform_pipeline(df,
                           df_features)

In [13]:
df

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Embarked_Cherbourg,Embarked_Queenstown,Embarked_Southampton,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
0,0,22.000000,1,0,7.2500,False,False,True,False,False,True,False,True
1,1,38.000000,1,0,71.2833,True,False,False,True,False,False,True,False
2,1,26.000000,0,0,7.9250,False,False,True,False,False,True,True,False
3,1,35.000000,1,0,53.1000,False,False,True,True,False,False,True,False
4,0,35.000000,0,0,8.0500,False,False,True,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.000000,0,0,13.0000,False,False,True,False,True,False,False,True
887,1,19.000000,0,0,30.0000,False,False,True,True,False,False,True,False
888,0,29.085667,1,2,23.4500,False,False,True,False,False,True,True,False
889,1,26.000000,0,0,30.0000,True,False,False,True,False,False,False,True


# Seperate out data into train and test sets

In [14]:
X = df.drop(columns=df_features.target_feature()).values
y = df[df_features.target_feature()].values

In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.80, random_state=517, stratify=y,
)

# Fit Models and view results

In [16]:
# Find best parameters for model
param_grid = {
    "max_depth": list(range(1, 4)),
#     "min_samples_leaf": list(range(80, 130, 5)),
#     "criterion": ["gini", "entropy"],
#     "n_splits": [20, 30]
}

model, best_params = optimize_model_grid(
    model=DecisionTreeClassifier(),
    X_train=X_train,
    y_train=y_train,
    param_grid=param_grid,
    scoring="f1_micro"
)

Tuned Parameters: {'max_depth': 1}
Best score on trained data was 0.797753


In [17]:
model_name = repr(model).split("(")[0]

In [18]:
dt_model_analysis = ClassificationAnalysis(dataset_name=dataset_name,
                                           model=model,
                                           model_name=model_name,
                                           target_feature=df_features.target_feature(),
                                           pred_funcs_dict={"Probabilities function":model.predict_proba,
                                                            "Predict function":model.predict},
                                           sample_data=X_train[0],
                                           project_sub_dir='Classification Analysis',
                                           notebook_mode=notebook_mode,
                                           df_features=df_features)

In [19]:
dt_model_analysis.perform_analysis(X=X_train,
                                   y=y_train,
                                   dataset_name="Train Data",
                                   thresholds_matrix=[[.0,.0],],
                                   display_visuals=True)



------------------------------Train Data------------------------------
Now running classification on Probabilities function on thresholds:
	Target Value:0: Prediction weight: 0.0
	Target Value:1: Prediction weight: 0.0
----------------------------------------------------------------------

Now running classification on Probabilities function on no thresholds.
----------------------------------------------------------------------





Now running classification on Predict function
----------------------------------------------------------------------



In [20]:
dt_model_analysis.perform_analysis(X=X_test,
                                   y=y_test,
                                   dataset_name="Test Data",
                                   display_visuals=True)



------------------------------Test Data------------------------------
Now running classification on Probabilities function on no thresholds.
---------------------------------------------------------------------

Now running classification on Probabilities function on no thresholds.
---------------------------------------------------------------------





Now running classification on Predict function
---------------------------------------------------------------------

