Find the markdown blocks that say interaction required! The notebook should take care of the rest!

# Import libs

In [1]:
import sys
import os
sys.path.append('..')
from eflow.foundation import DataPipeline,DataFrameTypes
from eflow.data_analysis import FeatureAnalysis, NullAnalysis
from eflow.model_analysis import ClassificationAnalysis
from eflow.data_pipeline_segments import FeatureTransformer, TypeFixer, DataEncoder, FeatureDataCleaner
from eflow.utils.modeling_utils import optimize_model_grid
from eflow.utils.eflow_utils import get_type_holder_from_pipeline, remove_unconnected_pipeline_segments
from eflow.utils.math_utils import get_unbalanced_threshold
from eflow.utils.sys_utils import create_dir_structure
from eflow.utils.eflow_utils import create_color_dict_for_features
from eflow.utils.pandas_utils import missing_values_table,data_types_table, value_counts_table, suggest_removal_features 
from eflow.widgets import ColorLabelingWidget

import pandas as pd
import numpy as np
import scikitplot as skplt
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import copy
from IPython.display import clear_output
from IPython.core.getipython import get_ipython
import ipython_blocking

In [2]:
# # Additional add ons
# !pip install pandasgui
# !pip install pivottablejs
# clear_output()

In [3]:
%matplotlib notebook
%matplotlib inline

## Juypter notebook generating cells

### Important Note: Replace if set to True will remove all the contents of whatever cell it is called in. But it can be undone with a simple CMD + Z. 🙂

In [4]:
# Author: http://tinyurl.com/y6mghyzl
def create_new_cell(contents,
                    replace=False):
    """
    Desc:
        Creates a new jupyter cell.
    """
    shell = get_ipython()
    shell.set_next_input(contents,
                         replace=replace)

def __format_list_to_string(list_name,
                            list_contents):
    """
    Desc:
        Converts a list to a string and adds newlines for formating.
    """
    output_str = f"{list_name} = ["
    escape_seq_count = 0
    final_index = len(list_contents) - 1
    req_spacing = len(output_str)

    for i,element in enumerate(list_contents):
        if i == final_index:
            if isinstance(element,str):
                output_str += f'\"{element}\"'
            else:
                output_str += f'{element}'
        else:

            if isinstance(element,str):
                output_str += f'\"{element}\",'
            else:
                output_str += f'{element},'
        
        if len(output_str.split("\n")[escape_seq_count]) > 78:
            output_str += "\n"
            output_str += (" " * req_spacing)
            escape_seq_count += 1
    output_str += "]"
    return output_str

def create_new_cell_with_removal_features(df,
                                          replace=True):
    """
    Desc:
        Creates a new cell block with a list of suggested features to remove.
    
    Args:
        df:
            Pandas DataFrame object
            
        replace:
            Boolean to determine replacing the current cell.
    """
    
    # Get suggestions for removal
    cell_content = __format_list_to_string("removal_features",
                                           suggest_removal_features(df))
    # Add a sort of calling card of the function that created it
    cell_content = f"# create_new_cell_with_removal_features(df,replace={replace})\n" + cell_content
    create_new_cell(cell_content,
                    replace=replace)

def create_new_cell_with_null_removal_features(df,
                                               null_threshold=.25,
                                               replace=True):
    """
    Desc:
        Creates a new cell block with a list of suggested features to remove based on nulls.
    
    Args:
        df:
            Pandas DataFrame object
            
        null_threshold:
            Any features that contain x% percent of nulls are suggested.
            
        replace:
            Boolean to determine replacing the current cell.
    """
    mis_val = df.isnull().sum()
    mis_val_percent = df.isnull().sum() / len(df)
    
    cell_content = f"# create_new_cell_with_null_removal_features(df,null_threshold={null_threshold},replace={replace})\n"
    cell_content += __format_list_to_string("remove_null_features",
                                            mis_val_percent[mis_val_percent > null_threshold].index.to_list())
    # Add a calling card of the function that created it
    
    create_new_cell(cell_content,
                    replace=replace)

def create_new_cell_with_feature_value_color_dict(df,
                                                  df_features,
                                                  value_limit=50,
                                                  replace=True):
    """
    Desc:
        Creates a new cell block with a dict of suggested feature value colors.
    
    Args:
        df:
            Pandas DataFrame object
        
        df_features:
            DataFrameTypes object.
        
        null_threshold:
            Any features that contain x% percent of nulls are suggested.
            
        value_limit:
            Limit the amount of feature_values until the system will ignore
            the feature all together for dict generation.
            
        replace:
            Boolean to determine replacing the current cell.
    """
    feature_value_color_dict = create_color_dict_for_features(df,
                                                              df_features,
                                                              value_limit)
    # Add a sort of calling card of the function that created it
    cell_content = ""
    cell_content += f"# create_new_cell_with_feature_value_color_dict(df,df_features,value_limit={value_limit},replace={replace})\n"
    cell_content += "feature_value_color_dict=dict()"
    feature_count = 0
    for feature_name, feature_value_color in feature_value_color_dict.items():
        if feature_value_color_dict[feature_name].keys(): 
            cell_content += f"\nfeature_value_color_dict[\"{feature_name}\"] = dict()"
        else:
            cell_content += f"\n\n# The feature '{feature_name}' has to many values! Asserting assumption that you don't want to give colors to each!"
        
        for feature_value, color in feature_value_color.items():

            color = feature_value_color_dict[feature_name][feature_value]
            
            if feature_name in df_features.get_bool_features() or feature_name in df_features.get_categorical_features():
                try:
                    feature_value = int(float(feature_value))
                except:
                    pass
            
            if isinstance(feature_value,str):
                feature_value = f"\"{feature_value}\""
            else:
                feature_value = f"{feature_value}"
            
            if color is None:
                cell_content += f"\nfeature_value_color_dict[\"{feature_name}\"][{feature_value}] = None"
            else:
                cell_content += f"\nfeature_value_color_dict[\"{feature_name}\"][{feature_value}] = \"{color}\""
        cell_content += "\n"
        
    create_new_cell(cell_content,
                    replace=replace)

def create_new_cell_with_categorical_dict(df,
                                          df_features,
                                          value_limit=50,
                                          replace=True):
    """
    Desc:
        Creates a new cell block with a dict of
    
    Args:
        df:
            Pandas DataFrame object

        df_features:
            DataFrameTypes object.

        value_limit:
            Limit the amount of feature_values until the system will ignore
            the feature all together for dict generation.
            
        replace:
            Boolean to determine replacing the current cell.
    """

    cell_content = ""
    cell_content += f"# create_new_cell_with_categorical_dict(df,df_features,value_limit={value_limit},replace={replace})\n"
    cell_content += "categorical_value_dict = dict()\n"
    
    categorical_value_dict = dict()
    for feature_name in df_features.get_categorical_features():
        
        # Find and sort feature values
        feature_values = df[feature_name].value_counts(sort=False).index.to_list()
        feature_values = [str(val) for val in feature_values]
        feature_values.sort()
        
        # Create feature cat dict
        cat_found = False
        categorical_value_dict[feature_name] = dict()
        for val in feature_values:
            try:
                categorical_value_dict[feature_name][int(val)] = ""
                cat_found = True
            except ValueError:
                pass
        
        # Delete feature name if no categories are found
        if not cat_found:
            del categorical_value_dict[feature_name]
    
    for feature_name,cat_val_dict in categorical_value_dict.items():
        
        if len(cat_val_dict.keys()) < value_limit:
            cell_content += f"categorical_value_dict[\"{feature_name}\"]=dict()\n"
            for cat,val in cat_val_dict.items():

                if isinstance(val,str):
                    cell_content += f"categorical_value_dict[\"{feature_name}\"][{cat}] = \"{val}\"\n"
                else:
                    cell_content += f"categorical_value_dict[\"{feature_name}\"][{cat}] = {val}\n"
        else:
            cell_content += f"\n\n# The feature '{feature_name}' has to many values! Asserting assumption that you don't want to give encode to each!"

        

    create_new_cell(cell_content,
                    replace=replace)
    
    

def create_new_cell_with_value_representation(df,
                                              df_features,
                                              value_limit=50,
                                              replace=True):
    """
    Desc:
        Creates a new cell block with a dict of suggested feature value colors.
    
    Args:
        df:
            Pandas DataFrame object

        df_features:
            DataFrameTypes object.

        value_limit:
            Limit the amount of feature_values until the system will ignore
            the feature all together for dict generation.
            
        replace:
            Boolean to determine replacing the current cell.
    """
    feature_value_representation = dict()
    for feature_name in df_features.get_string_features():
        feature_value_representation[feature_name] = dict()
        for val in df[feature_name].dropna().value_counts(sort=False).index.to_list():
            if isinstance(val,str):
                if len(val) == 0:
                    continue
                if len(val) <= 3 or val not in words.words():
                    feature_value_representation[feature_name][val] = ""

                if len(feature_value_representation[feature_name].keys()) >= 50:
                    break

        if not len(feature_value_representation[feature_name].keys()):
            del feature_value_representation[feature_name]
    cell_content = ""
    cell_content += f"# create_new_cell_with_value_representation(df,df_features,value_limit={value_limit},replace={replace})\n"
    
    cell_content += "feature_value_representation = dict()\n"
    for feature_name,val_repr_dict in feature_value_representation.items():
        
        if len(val_repr_dict.keys()) < value_limit:
            cell_content += f"feature_value_representation[\"{feature_name}\"] = dict()\n"
            for val,reprs in val_repr_dict.items():

                if isinstance(val,str):
                    cell_content += f"feature_value_representation[\"{feature_name}\"][\"{val}\"] = "
                else:
                    cell_content += f"feature_value_representation[\"{feature_name}\"][{val}] = "
                
                if isinstance(reprs,str):
                    cell_content += f"\"{reprs}\"\n"
                else:
                    cell_content += f"{reprs}\n"
        else:
            cell_content += f"\n\n# The feature '{feature_name}' has to many values! Asserting assumption that you don't want to give representation to to each!"
        
        cell_content += "\n"
    create_new_cell(cell_content,
                    replace=replace)

def create_new_cell_with_binned_features(df,
                                         df_features,
                                         bins=5,
                                         replace=True):
    """
    Desc:
        Creates a new cell block with a list of suggested bins and labels for each feature.
    
    Args:
        df:pd.Dataframe
            Pandas DataFrame object.
        
        df_features:
            DataFrameTypes object.
            
        bins:int
            The amount of bins to give to apply to each feature
            
        replace:bool
            Boolean to determine replacing the current cell.
    """
    
    # Add a sort of calling card of the function that created it
    cell_content = f"# create_new_cell_with_binned_features(df,df_features,bins={bins},replace={replace})\n"
    
    for feature_name in df_features.continuous_numerical_features():
        bins,labels = auto_binning(df,
                                   df_features,
                                   feature_name,
                                   bins=5)
        print(bins)
        cell_content += f"feature_name = \"{feature_name}\"\n"
        cell_content += __format_list_to_string("bins",
                                                bins)
        cell_content += "\n"
        cell_content += __format_list_to_string("labels",
                                                labels)
        
        cell_content += f"\ndf_features.set_feature_binning(feature_name,\n"
        cell_content += "                                bins,\n"
        cell_content += "                                labels)\n"
        cell_content += "\n\n"
    
    create_new_cell(cell_content,
                    replace=replace)

## Declare Project Variables

### Interaction required

In [5]:
dataset_path = "Datasets/titanic_train.csv"

# -----
dataset_name = "Titanic Data"
pipeline_name = "Titanic Pipeline"

# -----


# -----
notebook_mode = True

## Clean out segment space

In [6]:
remove_unconnected_pipeline_segments()

# Import dataset

In [7]:
df = pd.read_csv(dataset_path)
shape_df = pd.DataFrame.from_dict({'Rows': [df.shape[0]],
                                   'Columns': [df.shape[1]]})
display(shape_df)
display(df.head(30))

Unnamed: 0,Rows,Columns
0,891,12


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## Import Messy data (Testing purposes only!!!)

In [8]:
# df["Sex"][0] = np.nan
# df["Survived"] = df["Survived"].astype("object")
# df["Age"] = df["Age"].astype('object')
# df["Age"][33] = "      "
# df["Pclass"][50] = "2sdf,asdqw"
# df["SibSp"][2] = np.nan

In [9]:
data_types_table(df)

Unnamed: 0_level_0,Data Types
Features,Unnamed: 1_level_1
Age,float64
Fare,float64
PassengerId,int64
Survived,int64
Pclass,int64
SibSp,int64
Parch,int64
Name,object
Sex,object
Ticket,object


# Loading and init df_features

In [10]:
# Option: 1
# df_features = get_type_holder_from_pipeline(pipeline_name)

In [11]:
# Option: 2
df_features = DataFrameTypes()
df_features.init_on_json_file(os.getcwd() + f"/eflow Data/{dataset_name}/df_features.json")

In [12]:
df_features.display_features(display_dataframes=True,
                             notebook_mode=notebook_mode)

Unnamed: 0_level_0,Data Types
Features,Unnamed: 1_level_1
Cabin,string
Sex,string
Embarked,string
Pclass,string
Survived,bool
Parch,integer
SibSp,integer
Fare,float
Age,float


# Any extra processing before eflow DataPipeline

In [13]:
display({val[0] for val in set(df["Cabin"].dropna().values)})
df["Cabin"] = [val[0] if isinstance(val,str) else val for val in df["Cabin"]]
df["Cabin"]

{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'}

0      NaN
1        C
2      NaN
3        C
4      NaN
      ... 
886    NaN
887      B
888    NaN
889      C
890    NaN
Name: Cabin, Length: 891, dtype: object

# Setup pipeline structure

### Interaction Required

In [14]:
main_pipe = DataPipeline(pipeline_name,
                         df,
                         df_features,
                         remove_past_contents=True)

The file 'root_pipeline.json' exist!
Moving past contents to eFlow's garbage.
Removing the feature: "Name"
Removing the feature: "Ticket"
Removing the feature: "PassengerId"


In [15]:
df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C,S
4,0,3,male,35.0,0,0,8.05,,S
5,0,3,male,,0,0,8.4583,,Q
6,0,1,male,54.0,0,0,51.8625,E,S
7,0,3,male,2.0,3,1,21.075,,S
8,1,3,female,27.0,0,2,11.1333,,S
9,1,2,female,14.0,1,0,30.0708,,C


## Remove Unwanted Columns due to illogical nulls

### Interaction required

### Any features that have to many nulls/we can't or shouldn't perform any special logic to determine the closest or actual value

In [16]:
# create_new_cell_with_null_removal_features(df,null_threshold=0.25,replace=True)
remove_null_features = ["Cabin"]

### Add to main pipeline

In [17]:
if len(remove_null_features):
    feature_transformer = FeatureTransformer()
    feature_transformer.remove_features(df,
                                        df_features,
                                        remove_null_features)
    main_pipe.add("Remove unresolvable null features",
                  feature_transformer)

    del feature_transformer

In [18]:
missing_table = missing_values_table(df)
display(missing_table)
nan_features = missing_table.index.to_list()
nan_features

Unnamed: 0,Missing Values,% of Total Values
Age,177,19.9
Embarked,2,0.2


['Age', 'Embarked']

In [19]:
df["Parch"].value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

# Data Cleaning

In [20]:
data_cleaner = FeatureDataCleaner()

In [21]:
data_cleaner.run_widget(df,
                        df_features,
                        nan_feature_names=df.columns.to_list())

interactive(children=(Select(description='Features', layout=Layout(height='175px', width='50%'), options=('Sur…

In [22]:
# 1/0

In [23]:
data_cleaner.perform_saved_widget_input(df,
                                        df_features)

******************************
Testing function
ignore_feature(df,df_features,"Survived",)
PASSED TEST!
******************************

******************************
Testing function
ignore_feature(df,df_features,"Pclass",)
PASSED TEST!
******************************

******************************
Testing function
ignore_feature(df,df_features,"Sex",)
PASSED TEST!
******************************

******************************
Testing function
ignore_feature(df,df_features,"Age",)
PASSED TEST!
******************************

******************************
Testing function
ignore_feature(df,df_features,"SibSp",)
PASSED TEST!
******************************

******************************
Testing function
ignore_feature(df,df_features,"Parch",)
PASSED TEST!
******************************

******************************
Testing function
ignore_feature(df,df_features,"Fare",)
PASSED TEST!
******************************

******************************
Testing function
ignore_feature(df,df_f

In [24]:
missing_table = missing_values_table(df)
display(missing_table)
remaing_nan_features = missing_table.index.to_list()
remaing_nan_features

Unnamed: 0,Missing Values,% of Total Values
Age,177,19.9
Embarked,2,0.2


['Age', 'Embarked']

In [25]:
# import datawig

# df_train, df_test = datawig.utils.random_split(df)

# #Initialize a SimpleImputer model
# imputer = datawig.SimpleImputer(
#     input_columns=['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Cabin','Embarked'], # column(s) containing information about the column we want to impute
#     output_column= 'Age', # the column we'd like to impute values for
#     output_path = 'imputer_model' # stores model data and metrics
#     )

# #Fit an imputer model on the train data
# imputer.fit(train_df=df, num_epochs=200)

In [26]:
# imputed_training=mice(df[df_features.numerical_features()].values)

In [27]:
main_pipe.add("Cleaning features with methods that only apply to that one feature.",
              data_cleaner)

## Remove any remaining nulls

In [28]:
data_cleaner.file_path

'/Users/ericcacciavillani/Desktop/Coding/Python_Files/Artificial_Intelligence/Data Mining/eFlow/testing/eflow Data/_Extras/Pipeline Structure/Data Pipeline Segments/FeatureDataCleaner/CC6E23EB34.json'

# String Cleaning

# Dummy encode data

In [29]:
data_encoder = DataEncoder()

In [30]:
data_encoder.apply_value_representation(df,
                                        df_features)

In [31]:
qualtative_features = df_features.string_features() | df_features.categorical_features()

In [32]:
data_encoder.make_dummies(df,
                          df_features,
                          qualtative_features=qualtative_features)

In [33]:
df_features.display_features()

Bool Features: {'Sex_female', 'Survived', 'Embarked_Southampton', 'Embarked_Queenstown', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_Cherbourg', 'Sex_male'}

------------------------------------------------------------------------------------------
Numerical Features: {'Fare', 'SibSp', 'Age', 'Parch'}

Integer Features: {'Parch', 'SibSp'}

Float Features: {'Fare', 'Age'}

------------------------------------------------------------------------------------------
Target Feature: Survived



In [34]:
data_encoder.make_values_bool(df,
                              df_features)

In [35]:
df

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_Cherbourg,Embarked_Queenstown,Embarked_Southampton,Pclass_1,Pclass_2,Pclass_3
0,0,22.0,1,0,7.2500,False,True,False,False,True,False,False,True
1,1,38.0,1,0,71.2833,True,False,True,False,False,True,False,False
2,1,26.0,0,0,7.9250,True,False,False,False,True,False,False,True
3,1,35.0,1,0,53.1000,True,False,False,False,True,True,False,False
4,0,35.0,0,0,8.0500,False,True,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,0,0,13.0000,False,True,False,False,True,False,True,False
887,1,19.0,0,0,30.0000,True,False,False,False,True,True,False,False
888,0,,1,2,23.4500,True,False,False,False,True,False,False,True
889,1,26.0,0,0,30.0000,False,True,True,False,False,True,False,False


In [36]:
main_pipe.add("Ensure values are in proper form; convert proper values to dummies!",
              data_encoder)

# Test if the pipeline structure works on samples of the data and the entire set of the data

In [37]:
main_tmp_df = pd.read_csv(dataset_path)

In [39]:
sizes = [1,main_tmp_df.shape[0],5,15,20]

In [40]:
for size in sizes:
    tmp_df_features = DataFrameTypes()
    tmp_df_features.init_on_json_file(os.getcwd() + f"/eflow Data/{dataset_name}/df_features.json")
    tmp_df = main_tmp_df[0:size]
    failure_found = False
    
    main_pipe.perform_pipeline(tmp_df,
                               tmp_df_features)
    
    print(f"size of test data: {size}")
    
    if df_features == tmp_df_features:
        print("df_features PASSED TEST! Properly changed in the pipeline!")
    else:
        print("df_features FAILED TEST! Didn't properly change in the pipeline")
        failure_found = True

    if set(df.columns) == set(tmp_df.columns):
        print("Field name check PASSED TEST! Field names were as expected.")
    else:
        print("Field name check FAILED TEST! Field names did differ from expected.")
        failure_found = True
        
    if failure_found:
        print("tmp_df")
        display(tmp_df)
        print()
        print("df")
        display(df[0:size])

    print("***" * 10)
    print()

Removing the feature: "Name"
Removing the feature: "Ticket"
Removing the feature: "PassengerId"
Ignore feature:  Survived
Ignore feature:  Pclass
Ignore feature:  Sex
Ignore feature:  Age
Ignore feature:  SibSp
Ignore feature:  Parch
Ignore feature:  Fare
Ignore feature:  Embarked
size of test data: 1
df_features PASSED TEST! Properly changed in the pipeline!
Field name check FAILED TEST! Field names did differ from expected.
tmp_df


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_feature] = df[cat_feature] == feature_value


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S



df


Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_Cherbourg,Embarked_Queenstown,Embarked_Southampton,Pclass_1,Pclass_2,Pclass_3
0,0,22.0,1,0,7.25,False,True,False,False,True,False,False,True


******************************

Removing the feature: "Name"
Removing the feature: "Ticket"
Removing the feature: "PassengerId"
Ignore feature:  Survived
Ignore feature:  Pclass
Ignore feature:  Sex
Ignore feature:  Age
Ignore feature:  SibSp
Ignore feature:  Parch
Ignore feature:  Fare
Ignore feature:  Embarked
size of test data: 891
df_features PASSED TEST! Properly changed in the pipeline!
Field name check FAILED TEST! Field names did differ from expected.
tmp_df


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C



df


Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_Cherbourg,Embarked_Queenstown,Embarked_Southampton,Pclass_1,Pclass_2,Pclass_3
0,0,22.0,1,0,7.2500,False,True,False,False,True,False,False,True
1,1,38.0,1,0,71.2833,True,False,True,False,False,True,False,False
2,1,26.0,0,0,7.9250,True,False,False,False,True,False,False,True
3,1,35.0,1,0,53.1000,True,False,False,False,True,True,False,False
4,0,35.0,0,0,8.0500,False,True,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,0,0,13.0000,False,True,False,False,True,False,True,False
887,1,19.0,0,0,30.0000,True,False,False,False,True,True,False,False
888,0,,1,2,23.4500,True,False,False,False,True,False,False,True
889,1,26.0,0,0,30.0000,False,True,True,False,False,True,False,False


******************************

Removing the feature: "Name"
Removing the feature: "Ticket"
Removing the feature: "PassengerId"
Ignore feature:  Survived
Ignore feature:  Pclass
Ignore feature:  Sex
Ignore feature:  Age
Ignore feature:  SibSp
Ignore feature:  Parch
Ignore feature:  Fare
Ignore feature:  Embarked
size of test data: 5
df_features PASSED TEST! Properly changed in the pipeline!
Field name check FAILED TEST! Field names did differ from expected.
tmp_df


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S



df


Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_Cherbourg,Embarked_Queenstown,Embarked_Southampton,Pclass_1,Pclass_2,Pclass_3
0,0,22.0,1,0,7.25,False,True,False,False,True,False,False,True
1,1,38.0,1,0,71.2833,True,False,True,False,False,True,False,False
2,1,26.0,0,0,7.925,True,False,False,False,True,False,False,True
3,1,35.0,1,0,53.1,True,False,False,False,True,True,False,False
4,0,35.0,0,0,8.05,False,True,False,False,True,False,False,True


******************************

Removing the feature: "Name"
Removing the feature: "Ticket"
Removing the feature: "PassengerId"
Ignore feature:  Survived
Ignore feature:  Pclass
Ignore feature:  Sex
Ignore feature:  Age
Ignore feature:  SibSp
Ignore feature:  Parch
Ignore feature:  Fare
Ignore feature:  Embarked
size of test data: 15
df_features PASSED TEST! Properly changed in the pipeline!
Field name check FAILED TEST! Field names did differ from expected.
tmp_df


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C



df


Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_Cherbourg,Embarked_Queenstown,Embarked_Southampton,Pclass_1,Pclass_2,Pclass_3
0,0,22.0,1,0,7.25,False,True,False,False,True,False,False,True
1,1,38.0,1,0,71.2833,True,False,True,False,False,True,False,False
2,1,26.0,0,0,7.925,True,False,False,False,True,False,False,True
3,1,35.0,1,0,53.1,True,False,False,False,True,True,False,False
4,0,35.0,0,0,8.05,False,True,False,False,True,False,False,True
5,0,,0,0,8.4583,False,True,False,True,False,False,False,True
6,0,54.0,0,0,51.8625,False,True,False,False,True,True,False,False
7,0,2.0,3,1,21.075,False,True,False,False,True,False,False,True
8,1,27.0,0,2,11.1333,True,False,False,False,True,False,False,True
9,1,14.0,1,0,30.0708,True,False,True,False,False,False,True,False


******************************

Removing the feature: "Name"
Removing the feature: "Ticket"
Removing the feature: "PassengerId"
Ignore feature:  Survived
Ignore feature:  Pclass
Ignore feature:  Sex
Ignore feature:  Age
Ignore feature:  SibSp
Ignore feature:  Parch
Ignore feature:  Fare
Ignore feature:  Embarked
size of test data: 20
df_features PASSED TEST! Properly changed in the pipeline!
Field name check FAILED TEST! Field names did differ from expected.
tmp_df


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C



df


Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_Cherbourg,Embarked_Queenstown,Embarked_Southampton,Pclass_1,Pclass_2,Pclass_3
0,0,22.0,1,0,7.25,False,True,False,False,True,False,False,True
1,1,38.0,1,0,71.2833,True,False,True,False,False,True,False,False
2,1,26.0,0,0,7.925,True,False,False,False,True,False,False,True
3,1,35.0,1,0,53.1,True,False,False,False,True,True,False,False
4,0,35.0,0,0,8.05,False,True,False,False,True,False,False,True
5,0,,0,0,8.4583,False,True,False,True,False,False,False,True
6,0,54.0,0,0,51.8625,False,True,False,False,True,True,False,False
7,0,2.0,3,1,21.075,False,True,False,False,True,False,False,True
8,1,27.0,0,2,11.1333,True,False,False,False,True,False,False,True
9,1,14.0,1,0,30.0708,True,False,True,False,False,False,True,False


******************************



In [None]:
df_features.display_features()

In [None]:
tmp_df

In [None]:
tmp_df_features.display_features()

In [None]:
from inspect import signature

def test(a=[1,23,1],b=12):
    print(a)
dict(signature(test).parameters)
import inspect

def get_parameters(f):
    return inspect.getfullargspec(f)[0]
magical_way(test)

In [None]:
df[0:3]["Embarked_Cherbourg"]

# Seperate out data into train and test sets

In [None]:
X = df.drop(columns=df_features.target_feature()).values
y = df[df_features.target_feature()].values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.80, random_state=517, stratify=y,
)

# Fit Models and view results

In [None]:
# Find best parameters for model
param_grid = {
    "max_depth": list(range(1, 4)),
#     "min_samples_leaf": list(range(80, 130, 5)),
#     "criterion": ["gini", "entropy"],
#     "n_splits": [20, 30]
}

model, best_params = optimize_model_grid(
    model=DecisionTreeClassifier(),
    X_train=X_train,
    y_train=y_train,
    param_grid=param_grid,
    scoring="f1_micro"
)

In [None]:
model_name = repr(model).split("(")[0]

In [None]:
dt_model_analysis = ClassificationAnalysis(dataset_name=dataset_name,
                                           model=model,
                                           model_name=model_name,
                                           target_feature=df_features.target_feature(),
                                           pred_funcs_dict={"Probabilities function":model.predict_proba,
                                                            "Predict function":model.predict},
                                           sample_data=X_train[0],
                                           project_sub_dir=f'Classification Analysis',
                                           notebook_mode=notebook_mode,
                                           df_features=df_features)

In [None]:
dt_model_analysis.perform_analysis(X=X_train,
                                   y=y_train,
                                   dataset_name="Train Data",
                                   thresholds_matrix=[[.0,.0],])

In [None]:
dt_model_analysis.perform_analysis(X=X_test,
                                   y=y_test,
                                   dataset_name="Test Data")