# Import libs

In [1]:
import sys
import os

from eflow.foundation import DataPipeline,DataFrameTypes
from eflow.data_analysis import FeatureAnalysis, NullAnalysis
from eflow.model_analysis import ClassificationAnalysis
from eflow.data_pipeline_segments import FeatureTransformer, DataEncoder
from eflow.utils.modeling_utils import optimize_model_grid
from eflow.utils.eflow_utils import get_type_holder_from_pipeline, remove_unconnected_pipeline_segments
from eflow.utils.math_utils import get_unbalanced_threshold
from eflow.utils.sys_utils import create_dir_structure
from eflow.utils.eflow_utils import create_color_dict_for_features
from eflow.utils.pandas_utils import data_types_table, value_counts_table, suggest_removal_features, missing_values_table, df_auto_binning
from eflow.widgets import ColorLabelingWidget

import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import copy
from IPython.display import clear_output
from IPython.core.getipython import get_ipython
import ipython_blocking
import nltk

ModuleNotFoundError: No module named 'eflow'

In [4]:
# # Additional add ons
# !pip install pandasgui
# !pip install pivottablejs
# clear_output()

In [5]:
# NOTE(review): two matplotlib backends are requested back to back; the second
# ('inline') call is the later one — confirm whether the 'notebook' line is
# still needed.
%matplotlib notebook
%matplotlib inline

#### Download natural language processing utils

In [6]:
# Download the NLTK packages 'wordnet', 'words' and 'punkt'.
# The value of the last call is what the cell displays.
nltk.download('wordnet')
nltk.download('words')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ericcacciavillani/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/ericcacciavillani/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ericcacciavillani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Jupyter notebook generating cells

### Important Note: When `replace` is set to True, the helper overwrites the entire contents of the cell it is called from. This can be undone with a simple CMD + Z. 🙂

In [5]:
# Author: http://tinyurl.com/y6mghyzl
def create_new_cell(contents,
                    replace=False):
    """
    Desc:
        Hands text to the running IPython shell as the input of the next
        jupyter cell.

    Args:
        contents:
            Text to place in the cell.

        replace:
            Forwarded unchanged to the shell's 'set_next_input'.
    """
    get_ipython().set_next_input(contents, replace=replace)

def __format_list_to_string(list_name,
                            list_contents):
    """
    Desc:
        Converts a list to the source text of a list assignment
        ('list_name = [...]'), breaking the line once it grows past 78
        characters and aligning continuation lines under the first element.

    Args:
        list_name:
            Name of the variable the generated text assigns to.

        list_contents:
            Elements to write out. Strings are wrapped in double quotes
            (with backslashes, double quotes and line breaks escaped so the
            generated text stays valid); anything else is written with its
            default string formatting.

    Returns:
        The generated assignment as a string.
    """
    output_str = f"{list_name} = ["
    final_index = len(list_contents) - 1
    req_spacing = len(output_str)

    # Length of the line currently being filled (continuation lines start
    # with 'req_spacing' blanks, which count towards the limit).
    current_line_len = req_spacing

    for i, element in enumerate(list_contents):
        if isinstance(element, str):
            # Escape characters that would otherwise terminate or split the
            # generated string literal.
            escaped = (element.replace("\\", "\\\\")
                              .replace("\"", "\\\"")
                              .replace("\n", "\\n")
                              .replace("\r", "\\r"))
            token = f'"{escaped}"'
        else:
            token = f'{element}'

        if i != final_index:
            token += ","

        output_str += token
        current_line_len += len(token)

        # Never wrap after the last element; that would leave the closing
        # bracket alone on an otherwise empty continuation line.
        if i != final_index and current_line_len > 78:
            output_str += "\n" + (" " * req_spacing)
            current_line_len = req_spacing

    output_str += "]"
    return output_str

def create_new_cell_with_removal_features(df,
                                          replace=True):
    """
    Desc:
        Generates a cell that assigns the features suggested for removal to
        a list named 'removal_features'.

    Args:
        df:
            Pandas DataFrame object

        replace:
            Boolean to determine replacing the current cell.
    """
    suggested_features = suggest_removal_features(df)
    list_source = __format_list_to_string("removal_features",
                                          suggested_features)

    # First line of the cell records the call that generated it
    calling_card = f"# create_new_cell_with_removal_features(df,replace={replace})\n"

    create_new_cell(calling_card + list_source,
                    replace=replace)

def create_new_cell_with_null_removal_features(df,
                                               null_threshold=.25,
                                               replace=True):
    """
    Desc:
        Creates a new cell block with a list of suggested features to remove based on nulls.

    Args:
        df:
            Pandas DataFrame object

        null_threshold:
            Features whose fraction of null values is strictly greater than
            this value are suggested (.25 means 25%).

        replace:
            Boolean to determine replacing the current cell.
    """
    # Fraction of null entries per feature (computed once)
    mis_val_percent = df.isnull().sum() / len(df)
    null_heavy_features = mis_val_percent[mis_val_percent > null_threshold].index.to_list()

    # Add a calling card of the function that created it
    cell_content = f"# create_new_cell_with_null_removal_features(df,null_threshold={null_threshold},replace={replace})\n"
    cell_content += __format_list_to_string("remove_null_features",
                                            null_heavy_features)

    create_new_cell(cell_content,
                    replace=replace)

def create_new_cell_with_feature_value_color_dict(df,
                                                  df_features,
                                                  value_limit=50,
                                                  replace=True):
    """
    Desc:
        Creates a new cell block with a dict of suggested feature value colors.
        Features listed by df_features.numerical_features() are skipped.

    Args:
        df:
            Pandas DataFrame object

        df_features:
            DataFrameTypes object.

        value_limit:
            Limit the amount of feature_values until the system will ignore
            the feature all together for dict generation.

        replace:
            Boolean to determine replacing the current cell.
    """
    feature_value_color_dict = create_color_dict_for_features(df,
                                                              df_features,
                                                              value_limit)
    # Add a sort of calling card of the function that created it
    cell_content = f"# create_new_cell_with_feature_value_color_dict(df,df_features,value_limit={value_limit},replace={replace})\n"
    cell_content += "feature_value_color_dict=dict()"

    for feature_name, feature_value_color in feature_value_color_dict.items():

        if feature_name in df_features.numerical_features():
            continue

        # A feature that came back without any values only gets a comment
        if feature_value_color.keys():
            cell_content += f"\nfeature_value_color_dict[\"{feature_name}\"] = dict()"
        else:
            cell_content += f"\n\n# The feature '{feature_name}' has to many values! Asserting assumption that you don't want to give colors to each!"

        for feature_value, color in feature_value_color.items():

            # Bool/categorical values that parse as numbers are written as
            # integer keys; everything else keeps its original value.
            if feature_name in df_features.bool_features() or feature_name in df_features.categorical_features():
                try:
                    feature_value = int(float(feature_value))
                except (TypeError, ValueError, OverflowError):
                    pass

            if isinstance(feature_value,str):
                feature_value = f"\"{feature_value}\""
            else:
                feature_value = f"{feature_value}"

            if color is None:
                cell_content += f"\nfeature_value_color_dict[\"{feature_name}\"][{feature_value}] = None"
            else:
                cell_content += f"\nfeature_value_color_dict[\"{feature_name}\"][{feature_value}] = \"{color}\""
        cell_content += "\n"

    create_new_cell(cell_content,
                    replace=replace)

def create_new_cell_with_categorical_dict(df,
                                          df_features,
                                          value_limit=50,
                                          replace=True):
    """
    Desc:
        Creates a new cell block with a dict that maps every integer value of
        each categorical feature to an empty string for the user to fill in.

    Args:
        df:
            Pandas DataFrame object

        df_features:
            DataFrameTypes object.

        value_limit:
            Limit the amount of feature_values until the system will ignore
            the feature all together for dict generation.

        replace:
            Boolean to determine replacing the current cell.
    """

    cell_content = ""
    cell_content += f"# create_new_cell_with_categorical_dict(df,df_features,value_limit={value_limit},replace={replace})\n"
    cell_content += "categorical_value_dict = dict()\n"

    categorical_value_dict = dict()
    for feature_name in df_features.categorical_features():

        # Collect the feature values whose text form parses as an integer.
        # NOTE(review): a value such as 1.0 has the text form "1.0", which
        # int() rejects, so float-typed categories are skipped — confirm that
        # this is intended.
        int_categories = set()
        for val in df[feature_name].value_counts(sort=False).index.to_list():
            try:
                int_categories.add(int(str(val)))
            except ValueError:
                pass

        # Features without any integer category are left out entirely.
        # Sorting the integers keeps the generated keys in numeric order
        # (1, 2, 10) rather than text order (1, 10, 2).
        if int_categories:
            categorical_value_dict[feature_name] = {cat: "" for cat in sorted(int_categories)}

    for feature_name,cat_val_dict in categorical_value_dict.items():

        if len(cat_val_dict) < value_limit:
            cell_content += f"categorical_value_dict[\"{feature_name}\"]=dict()\n"
            for cat,val in cat_val_dict.items():
                cell_content += f"categorical_value_dict[\"{feature_name}\"][{cat}] = \"{val}\"\n"
        else:
            # The trailing newline matters: without it the next feature's
            # '=dict()' line would be appended to this comment and never run.
            cell_content += f"\n\n# The feature '{feature_name}' has to many values! Asserting assumption that you don't want to give encode to each!\n"

    create_new_cell(cell_content,
                    replace=replace)
    
    

def create_new_cell_with_value_representation(df,
                                              df_features,
                                              value_limit=50,
                                              replace=True):
    """
    Desc:
        Creates a new cell block with a dict that maps each non-empty value of
        every string feature to an empty representation for the user to fill in.

    Args:
        df:
            Pandas DataFrame object

        df_features:
            DataFrameTypes object.

        value_limit:
            Limit the amount of feature_values until the system will ignore
            the feature all together for dict generation.

        replace:
            Boolean to determine replacing the current cell.
    """
    feature_value_representation = dict()
    for feature_name in df_features.string_features():
        value_reprs = dict()
        for val in df[feature_name].dropna().value_counts(sort=False).index.to_list():
            # Only non-empty strings get a representation slot
            if not isinstance(val,str) or len(val) == 0:
                continue

            value_reprs[val] = ""

            # Stop collecting at the caller's limit; a feature that reaches
            # it is reported below as having too many values.
            if len(value_reprs) >= value_limit:
                break

        # Features without a usable value are left out entirely
        if value_reprs:
            feature_value_representation[feature_name] = value_reprs

    cell_content = ""
    cell_content += f"# create_new_cell_with_value_representation(df,df_features,value_limit={value_limit},replace={replace})\n"

    cell_content += "feature_value_representation = dict()\n"
    for feature_name,val_repr_dict in feature_value_representation.items():

        if len(val_repr_dict) < value_limit:
            cell_content += f"feature_value_representation[\"{feature_name}\"] = dict()\n"
            # Keys are always strings and representations always start empty
            for val,reprs in val_repr_dict.items():
                cell_content += f"feature_value_representation[\"{feature_name}\"][\"{val}\"] = \"{reprs}\"\n"
        else:
            cell_content += f"\n\n# The feature '{feature_name}' has to many values! Asserting assumption that you don't want to give representation to to each!"

        cell_content += "\n"
    create_new_cell(cell_content,
                    replace=replace)

def create_new_cell_with_binned_features(df,
                                         df_features,
                                         bins=5,
                                         replace=True):
    """
    Desc:
        Creates a new cell block with a list of suggested bins and labels for each feature.

    Args:
        df:pd.Dataframe
            Pandas DataFrame object.

        df_features:
            DataFrameTypes object.

        bins:int
            The amount of bins to apply to each feature

        replace:bool
            Boolean to determine replacing the current cell.
    """

    # Add a sort of calling card of the function that created it
    cell_content = f"# create_new_cell_with_binned_features(df,df_features,bins={bins},replace={replace})\n"

    for feature_name in df_features.continuous_numerical_features():
        # Keep the results under their own names so the requested bin count
        # in 'bins' is still intact for the next feature.
        feature_bins,feature_labels = df_auto_binning(df,
                                                      df_features,
                                                      feature_name,
                                                      bins=bins)
        cell_content += f"feature_name = \"{feature_name}\"\n"
        cell_content += __format_list_to_string("bins",
                                                feature_bins)
        cell_content += "\n"
        cell_content += __format_list_to_string("labels",
                                                feature_labels)

        cell_content += f"\ndf_features.set_feature_binning(feature_name,\n"
        cell_content += "                                bins,\n"
        cell_content += "                                labels)\n"
        cell_content += "\n\n"

    create_new_cell(cell_content,
                    replace=replace)

## Declare Project Variables

### Interaction required

In [6]:
# CSV file that is read into `df` in the "Import dataset" section
dataset_path = "Datasets/train.csv"

# Name of the dataset; also the first component of the directory below
dataset_name = "Home Insurance Cross Sell Prediction"

# Project sub-directory for this "before cleaning" inspection
# (its use is outside this section — presumably where output is written)
inspect_data_project_dir = f"{dataset_name}/Before Cleaning"

In [7]:
# -----
notebook_mode = True

# -----
display_value_counts = True

# Import dataset

In [8]:
# Read the dataset, then show its dimensions followed by the first 30 rows.
df = pd.read_csv(dataset_path)
shape_df = pd.DataFrame({'Rows': [df.shape[0]],
                         'Columns': [df.shape[1]]})
display(shape_df, df.head(30))

Unnamed: 0,Rows,Columns
0,381109,12


Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0
5,6,Female,24,1,33.0,0,< 1 Year,Yes,2630.0,160.0,176,0
6,7,Male,23,1,11.0,0,< 1 Year,Yes,23367.0,152.0,249,0
7,8,Female,56,1,28.0,0,1-2 Year,Yes,32031.0,26.0,72,1
8,9,Female,24,1,3.0,1,< 1 Year,No,27619.0,152.0,28,0
9,10,Female,32,1,6.0,1,< 1 Year,No,28771.0,152.0,80,0


In [9]:
data_types_table(df)

Unnamed: 0_level_0,Data Types
Features,Unnamed: 1_level_1
Region_Code,float64
Annual_Premium,float64
Policy_Sales_Channel,float64
id,int64
Age,int64
Driving_License,int64
Previously_Insured,int64
Vintage,int64
Response,int64
Gender,object


## Remove/Declare any unwanted features

### Interaction required

Note: When starting a new project uncomment the function to get suggestions and then run the cell again.

In [10]:
# create_new_cell_with_removal_features(df,replace=True)
removal_features = ["id"]

In [11]:
# Drop the features listed in `removal_features`.
# Rebinding `df` with errors="ignore" (instead of dropping in place) keeps the
# cell safe to re-run: columns that are already gone are simply skipped.
# Note that a misspelled feature name is skipped silently as well.
df = df.drop(columns=removal_features,
             errors="ignore")

In [12]:
data_types_table(df)

Unnamed: 0_level_0,Data Types
Features,Unnamed: 1_level_1
Region_Code,float64
Annual_Premium,float64
Policy_Sales_Channel,float64
Age,int64
Driving_License,int64
Previously_Insured,int64
Vintage,int64
Response,int64
Gender,object
Vehicle_Age,object


## Gui tools for quick analysis dataframes

Great interface, but it pauses the program while open; comment these cells in or out as needed.
You will most likely need to restart the kernel after using it.

In [13]:
# from pandasgui import show as qt_display
# qt_display(df)
# %matplotlib inline

In [14]:
# pivot_ui(df,
#          outfile_path='Piviot_Table_JS.html')

# Any basic manipulation of features

#### What I mean by this: say you want to represent a feature slightly differently than it is currently displayed.
Note: whatever manipulation you do here should also be applied in each notebook's "Any basic manipulation of features" section.

## Skim through Value Counts

In [15]:
# For every feature: a header with its name and dtype, its value-count table,
# then a divider line.
if display_value_counts:
    for feature_name in df.columns:
        print(f'******* Feature: {feature_name} *******\n'
              f'Type: {df[feature_name].dtype}')
        display(value_counts_table(df, feature_name))
        print("-------" * 4 + "\n\n")

******* Feature: Gender *******
Type: object


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,206089,54.0761%
Female,175020,45.9239%


----------------------------


******* Feature: Age *******
Type: int64


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
24,25960,6.8117%
23,24256,6.3646%
22,20964,5.5008%
25,20636,5.4147%
21,16457,4.3182%
...,...,...
81,56,0.0147%
82,29,0.0076%
83,22,0.0058%
84,11,0.0029%


----------------------------


******* Feature: Driving_License *******
Type: int64


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
1,380297,99.7869%
0,812,0.2131%


----------------------------


******* Feature: Region_Code *******
Type: float64


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
28.0,106415,27.9225%
8.0,33877,8.8891%
46.0,19749,5.1820%
41.0,18263,4.7921%
15.0,13308,3.4919%
30.0,12191,3.1988%
29.0,11065,2.9034%
50.0,10243,2.6877%
3.0,9251,2.4274%
11.0,9232,2.4224%


----------------------------


******* Feature: Previously_Insured *******
Type: int64


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
0,206481,54.1790%
1,174628,45.8210%


----------------------------


******* Feature: Vehicle_Age *******
Type: object


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
1-2 Year,200316,52.5613%
< 1 Year,164786,43.2385%
> 2 Years,16007,4.2001%


----------------------------


******* Feature: Vehicle_Damage *******
Type: object


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
Yes,192413,50.4877%
No,188696,49.5123%


----------------------------


******* Feature: Annual_Premium *******
Type: float64


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
2630.0,64877,17.0232%
69856.0,140,0.0367%
39008.0,41,0.0108%
38287.0,38,0.0100%
45179.0,38,0.0100%
...,...,...
62326.0,1,0.0003%
59733.0,1,0.0003%
55934.0,1,0.0003%
75387.0,1,0.0003%


----------------------------


******* Feature: Policy_Sales_Channel *******
Type: float64


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
152.0,134784,35.3663%
26.0,79700,20.9127%
124.0,73995,19.4157%
160.0,21779,5.7146%
156.0,10661,2.7974%
...,...,...
144.0,1,0.0003%
149.0,1,0.0003%
84.0,1,0.0003%
143.0,1,0.0003%


----------------------------


******* Feature: Vintage *******
Type: int64


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
256,1418,0.3721%
73,1410,0.3700%
282,1397,0.3666%
158,1394,0.3658%
187,1392,0.3652%
...,...,...
205,1235,0.3241%
89,1234,0.3238%
32,1230,0.3227%
224,1227,0.3220%


----------------------------


******* Feature: Response *******
Type: int64


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
0,334399,87.7437%
1,46710,12.2563%


----------------------------




# Extra removal of seemingly unwanted or unneeded features

In [16]:
unwanted_features = ["Driving_License","Region_Code","Policy_Sales_Channel"]

In [17]:
# Drop the features listed in `unwanted_features`.
# Rebinding `df` with errors="ignore" (instead of dropping in place) keeps the
# cell safe to re-run: columns that are already gone are simply skipped.
# Note that a misspelled feature name is skipped silently as well.
df = df.drop(columns=unwanted_features,
             errors="ignore")

# Mark target feature; set to None if not needed

### Interaction required

### Supervised learning problems (Can be set to None)

In [18]:
target_feature = "Response"

In [19]:
# Fail fast when a target feature is configured but missing from the dataframe.
# A membership test is used rather than indexing and catching the failure, so
# the error is raised directly instead of as a chained traceback.
if target_feature and target_feature not in df.columns:
    raise KeyError(f"The target feature \'{target_feature}\' was not found in the dataframe!"
                   + " Please select a valid feature from the dataframe")

In [20]:
# Report how balanced the target feature is (skipped when no target is set).
if target_feature:
    print(f"Target feature '{target_feature}'")
    print("----" * 10)
    target_amount = len(df[target_feature].dropna().value_counts().index)
    value_count_df = value_counts_table(df,
                                        target_feature)
    if target_amount < 1 or target_amount > 25:
        # No values, or too many distinct values for a class-balance check
        display(value_count_df)
        if target_amount > 25:
            print("Value count is above 25 asserting that this is probably a continous data stream!")
    else:
        # Thresholds depend on the number of distinct target values
        max_unbalanced_class_threshold, min_unbalanced_class_threshold = get_unbalanced_threshold(target_amount)

        print(f"max_unbalanced_class_threshold = {max_unbalanced_class_threshold * 100:.3f}%")
        print(f"min_unbalanced_class_threshold = {min_unbalanced_class_threshold * 100:.3f}%")
        display(value_count_df)

        # Percentages arrive as text such as "87.7437%"; strip the sign and
        # rescale to a fraction before comparing against the thresholds.
        for target_value, percentage_text in zip(value_count_df.index.values,
                                                 value_count_df["Percantage"]):
            fraction = float(percentage_text[:-1])/100
            if fraction >= max_unbalanced_class_threshold or fraction <= min_unbalanced_class_threshold:
                print(f"The value '{target_value}' is causing the target feature to be unbalanced.\n" +
                      "This could cause a model to not properly generalize itself.")
                print("---" * 10 + "\n")

Target feature 'Response'
----------------------------------------
max_unbalanced_class_threshold = 65.000%
min_unbalanced_class_threshold = 35.000%


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
0,334399,87.7437%
1,46710,12.2563%


The value '0' is causing the target feature to be unbalanced.
This could cause a model to not properly generalize itself.
------------------------------

The value '1' is causing the target feature to be unbalanced.
This could cause a model to not properly generalize itself.
------------------------------



# Load/Init DataFrameTypes object.

This object is used to store an abstracted form of what a feature 'should be' rather than what the pandas dataframe object says it is. In this case we will be specifying the correct type for every feature.

Comment out/remove depending on how you want your design flow to be.

In [21]:
# Build the DataFrameTypes holder for `df`, registering `target_feature` as
# the target.
# NOTE(review): the exact effect of ignore_nulls / fix_numeric_features /
# fix_string_features is defined by eflow and not visible here — confirm
# against the library.
df_features = DataFrameTypes(df,
                             ignore_nulls=True,
                             fix_numeric_features=True,
                             fix_string_features=True,
                             target_feature=target_feature,
                             notebook_mode=notebook_mode)


Moving feature 'Vehicle_Damage' to type bool.


## Make any changes to 'df_features' that automated type assertions messed up.

Ex: Sometimes df_features will think a feature is a category when it isn't. Move to proper types.

In [22]:
print("df_features types:")
df_features.display_features(display_dataframes=True,
                             notebook_mode=notebook_mode)

df_features types:


Unnamed: 0_level_0,Data Types
Features,Unnamed: 1_level_1
Gender,string
Vehicle_Age,string
Previously_Insured,bool
Response,bool
Vehicle_Damage,bool
Annual_Premium,integer
Age,category
Vintage,category


In [23]:
# Manual corrections to the automatically asserted types: "Vintage" and "Age"
# were listed as category in the table above and are moved to integer here.
# The calls with an empty list are placeholders for the other types
# (presumably no-ops — confirm in eflow).
df_features.set_feature_to_bool(feature_name=[])
df_features.set_feature_to_integer(feature_name=["Vintage","Age",])
df_features.set_feature_to_float(feature_name=[])
df_features.set_feature_to_string(feature_name=[])
df_features.set_feature_to_datetime(feature_name=[])
df_features.set_feature_to_categorical(feature_name=[])

In [24]:
print("df_features types:")
df_features.display_features(display_dataframes=True,
                             notebook_mode=notebook_mode)

df_features types:


Unnamed: 0_level_0,Data Types
Features,Unnamed: 1_level_1
Gender,string
Vehicle_Age,string
Previously_Insured,bool
Response,bool
Vehicle_Damage,bool
Annual_Premium,integer
Age,integer
Vintage,integer


In [25]:
print("Dataframe's types:")
data_types_table(df)

Dataframe's types:


Unnamed: 0_level_0,Data Types
Features,Unnamed: 1_level_1
Annual_Premium,float64
Age,int64
Previously_Insured,int64
Vintage,int64
Response,int64
Gender,object
Vehicle_Age,object
Vehicle_Damage,object


In [26]:
# Cast every dataframe column to the dtype that matches the type df_features
# assigns to it. Set throw_error to True to abort on the first failed
# conversion instead of only reporting it.
throw_error = False

# (df_features type lookup, conversion, wording used in the error message).
# Order matters: the first type that lists the feature wins. Features that
# appear in none of these lookups (e.g. bool features) are left untouched.
type_conversions = [
    (df_features.integer_features, lambda series: series.astype("int"), "integer series"),
    (df_features.float_features, lambda series: series.astype("float"), "float series"),
    (df_features.categorical_features, lambda series: series.astype('category'), "category series"),
    (df_features.datetime_features, pd.to_datetime, "datetime"),
    (df_features.string_features, lambda series: series.astype("object"), "string"),
]

for feature in df.columns:
    for get_features, convert, type_label in type_conversions:
        if feature not in get_features():
            continue

        try:
            df[feature] = convert(df[feature])
        except Exception as conversion_error:
            # 'Exception' rather than a bare except, so that interrupting the
            # kernel is not reported as a conversion problem.
            error_msg = f"An error has occured when converting {feature} to an {type_label}."
            if throw_error:
                raise ValueError(error_msg) from conversion_error
            print(error_msg)

        # A feature is converted at most once
        break

In [27]:
df

Unnamed: 0,Gender,Age,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Vintage,Response
0,Male,44,0,> 2 Years,Yes,40454,217,1
1,Male,76,0,1-2 Year,No,33536,183,0
2,Male,47,0,> 2 Years,Yes,38294,27,1
3,Male,21,1,< 1 Year,No,28619,203,0
4,Female,29,1,< 1 Year,No,27496,39,0
...,...,...,...,...,...,...,...,...
381104,Male,74,1,1-2 Year,No,30170,88,0
381105,Male,30,1,< 1 Year,No,40016,131,0
381106,Male,21,1,< 1 Year,No,35118,161,0
381107,Female,68,0,> 2 Years,Yes,44617,74,0


# Colors and palettes for features

### Remove any unwanted values found or any unwanted features to be color coded.

In [28]:
# create_new_cell_with_feature_value_color_dict(df,df_features,value_limit=50,replace=True)
feature_value_color_dict=dict()
feature_value_color_dict["Vehicle_Age"] = dict()
feature_value_color_dict["Vehicle_Age"]["> 2 Years"] = "#ed1d25"
feature_value_color_dict["Vehicle_Age"]["1-2 Year"] = "#0065b2"
feature_value_color_dict["Vehicle_Age"]["< 1 Year"] = "#07a551"

feature_value_color_dict["Response"] = dict()
feature_value_color_dict["Response"][0] = "#ff8585"
feature_value_color_dict["Response"][1] = "#55a868"

feature_value_color_dict["Previously_Insured"] = dict()
feature_value_color_dict["Previously_Insured"][0] = "#ff8585"
feature_value_color_dict["Previously_Insured"][1] = "#55a868"

feature_value_color_dict["Vehicle_Damage"] = dict()
feature_value_color_dict["Vehicle_Damage"]["Yes"] = "#55a868"
feature_value_color_dict["Vehicle_Damage"]["No"] = "#ff8585"

feature_value_color_dict["Gender"] = dict()
feature_value_color_dict["Gender"]["Male"] = "#7eaed3"
feature_value_color_dict["Gender"]["Female"] = "#ffb6c1"

In [29]:
cleaning_widget = ColorLabelingWidget()
cleaning_widget.run_widget(feature_value_color_dict)

interactive(children=(Select(description='Features', layout=Layout(height='175px', width='50%'), options=('Veh…

### Reinitialize feature color dictionary

In [30]:
feature_value_color_dict = cleaning_widget.get_feature_value_color_dict()

In [31]:
feature_value_color_dict

{'Vehicle_Age': {'> 2 Years': '#ed1d25',
  '1-2 Year': '#0065b2',
  '< 1 Year': '#07a551'},
 'Response': {0: '#ff8585', 1: '#55a868'},
 'Previously_Insured': {0: '#ff8585', 1: '#55a868'},
 'Vehicle_Damage': {'Yes': '#55a868', 'No': '#ff8585'},
 'Gender': {'Male': '#7eaed3', 'Female': '#ffb6c1'}}

In [32]:
df_features.set_feature_colors(feature_value_color_dict)

# Value Representation

It's good practice to describe our data as clearly as possible. A value representation maps each full value to a shorter display form, so abbreviations are defined explicitly in one place.
Ex: "Male" is represented as "M".

In [33]:
# create_new_cell_with_value_representation(df,df_features,value_limit=50,replace=True)
feature_value_representation = dict()
feature_value_representation["Gender"] = dict()
feature_value_representation["Gender"]["Female"] = "F"
feature_value_representation["Gender"]["Male"] = "M"

In [34]:
df_features.set_feature_value_representation(feature_value_representation)

# Bin any numerical values

In [35]:
# create_new_cell_with_binned_features(df,df_features,bins=5,replace=True)
# Generated cell: registers five bins (six edges) and their labels per feature.
# `feature_name`, `bins` and `labels` are reassigned for every feature.
feature_name = "Vintage"
bins = [8.999999,67.000001,125.000001,183.000001,241.000001,299.000001]
labels = ["9 ⟷ 67","68 ⟷ 125","126 ⟷ 183","184 ⟷ 241","242 ⟷ 299"]
df_features.set_feature_binning(feature_name,
                                bins,
                                labels)


feature_name = "Age"
bins = [18.999999,33.000001,46.000001,59.000001,72.000001,85.000001]
labels = ["19 ⟷ 33","34 ⟷ 46","47 ⟷ 59","60 ⟷ 72","73 ⟷ 85"]
df_features.set_feature_binning(feature_name,
                                bins,
                                labels)


feature_name = "Annual_Premium"
bins = [2091.999999,110137.000001,217644.000001,325151.000001,432658.000001,540165.000001
        ]
labels = ["2092 ⟷ 110137","110138 ⟷ 217644","217645 ⟷ 325151","325152 ⟷ 432658",
          "432659 ⟷ 540165"]
df_features.set_feature_binning(feature_name,
                                bins,
                                labels)


# NOTE(review): "Policy_Sales_Channel" is listed in `unwanted_features` and
# dropped from `df` earlier in this notebook, so this binning refers to a
# feature the dataframe no longer has — confirm whether this block should
# be removed.
feature_name = "Policy_Sales_Channel"
bins = [-1e-06,33.000001,65.000001,98.000001,130.000001,163.000001]
labels = ["0 ⟷ 33","34 ⟷ 65","66 ⟷ 98","99 ⟷ 130","131 ⟷ 163"]
df_features.set_feature_binning(feature_name,
                                bins,
                                labels)




# Test encoding and value representation

In [36]:
data_encoder = DataEncoder(create_file=False)

In [37]:
df

Unnamed: 0,Gender,Age,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Vintage,Response
0,Male,44,0,> 2 Years,Yes,40454,217,1
1,Male,76,0,1-2 Year,No,33536,183,0
2,Male,47,0,> 2 Years,Yes,38294,27,1
3,Male,21,1,< 1 Year,No,28619,203,0
4,Female,29,1,< 1 Year,No,27496,39,0
...,...,...,...,...,...,...,...,...
381104,Male,74,1,1-2 Year,No,30170,88,0
381105,Male,30,1,< 1 Year,No,40016,131,0
381106,Male,21,1,< 1 Year,No,35118,161,0
381107,Female,68,0,> 2 Years,Yes,44617,74,0


In [38]:
# Apply the value representation registered on df_features to df
# (recorded output below: Gender Male/Female -> M/F).
data_encoder.apply_value_representation(df,
                                        df_features)

In [39]:
# Check: Gender should now read M / F.
df

Unnamed: 0,Gender,Age,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Vintage,Response
0,M,44,0,> 2 Years,Yes,40454,217,1
1,M,76,0,1-2 Year,No,33536,183,0
2,M,47,0,> 2 Years,Yes,38294,27,1
3,M,21,1,< 1 Year,No,28619,203,0
4,F,29,1,< 1 Year,No,27496,39,0
...,...,...,...,...,...,...,...,...
381104,M,74,1,1-2 Year,No,30170,88,0
381105,M,30,1,< 1 Year,No,40016,131,0
381106,M,21,1,< 1 Year,No,35118,161,0
381107,F,68,0,> 2 Years,Yes,44617,74,0


In [40]:
# Undo the value representation applied above
# (recorded output below: Gender M/F -> Male/Female).
data_encoder.revert_value_representation(df,
                                         df_features)

In [41]:
# Check: Gender should be back to Male / Female.
df

Unnamed: 0,Gender,Age,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Vintage,Response
0,Male,44,0,> 2 Years,Yes,40454,217,1
1,Male,76,0,1-2 Year,No,33536,183,0
2,Male,47,0,> 2 Years,Yes,38294,27,1
3,Male,21,1,< 1 Year,No,28619,203,0
4,Female,29,1,< 1 Year,No,27496,39,0
...,...,...,...,...,...,...,...,...
381104,Male,74,1,1-2 Year,No,30170,88,0
381105,Male,30,1,< 1 Year,No,40016,131,0
381106,Male,21,1,< 1 Year,No,35118,161,0
381107,Female,68,0,> 2 Years,Yes,44617,74,0


In [42]:
# Convert boolean-like features to 1/0
# (recorded output below: Vehicle_Damage Yes/No -> 1/0).
data_encoder.make_values_bool(df,
                              df_features)

In [43]:
# Check: Vehicle_Damage should now hold 1 / 0 instead of Yes / No.
df

Unnamed: 0,Gender,Age,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Vintage,Response
0,Male,44,0,> 2 Years,1,40454,217,1
1,Male,76,0,1-2 Year,0,33536,183,0
2,Male,47,0,> 2 Years,1,38294,27,1
3,Male,21,1,< 1 Year,0,28619,203,0
4,Female,29,1,< 1 Year,0,27496,39,0
...,...,...,...,...,...,...,...,...
381104,Male,74,1,1-2 Year,0,30170,88,0
381105,Male,30,1,< 1 Year,0,40016,131,0
381106,Male,21,1,< 1 Year,0,35118,161,0
381107,Female,68,0,> 2 Years,1,44617,74,0


In [44]:
# Revert again after make_values_bool. In the recorded output below nothing
# changes: Gender already holds Male/Female and Vehicle_Damage stays 1/0.
data_encoder.revert_value_representation(df,
                                         df_features)

In [45]:
# Check: expected to look identical to the previous display.
df

Unnamed: 0,Gender,Age,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Vintage,Response
0,Male,44,0,> 2 Years,1,40454,217,1
1,Male,76,0,1-2 Year,0,33536,183,0
2,Male,47,0,> 2 Years,1,38294,27,1
3,Male,21,1,< 1 Year,0,28619,203,0
4,Female,29,1,< 1 Year,0,27496,39,0
...,...,...,...,...,...,...,...,...
381104,Male,74,1,1-2 Year,0,30170,88,0
381105,Male,30,1,< 1 Year,0,40016,131,0
381106,Male,21,1,< 1 Year,0,35118,161,0
381107,Female,68,0,> 2 Years,1,44617,74,0


In [46]:
# Encode df using the feature metadata held by df_features.
# NOTE(review): the recorded output below shows no visible change to df after
# this call, even with apply_value_representation=True -- confirm that this is
# the expected behaviour.
data_encoder.encode_data(df,
                         df_features,
                         apply_value_representation=True)

In [47]:
# Check the result of encode_data (recorded output: unchanged from before).
df

Unnamed: 0,Gender,Age,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Vintage,Response
0,Male,44,0,> 2 Years,1,40454,217,1
1,Male,76,0,1-2 Year,0,33536,183,0
2,Male,47,0,> 2 Years,1,38294,27,1
3,Male,21,1,< 1 Year,0,28619,203,0
4,Female,29,1,< 1 Year,0,27496,39,0
...,...,...,...,...,...,...,...,...
381104,Male,74,1,1-2 Year,0,30170,88,0
381105,Male,30,1,< 1 Year,0,40016,131,0
381106,Male,21,1,< 1 Year,0,35118,161,0
381107,Female,68,0,> 2 Years,1,44617,74,0


In [48]:
# Decode df and apply the value representation in the same call
# (recorded output below: Gender becomes M/F, Vehicle_Damage stays 1/0).
data_encoder.decode_data(df,
                         df_features,
                         apply_value_representation=True)

In [49]:
# Check: Gender should read M / F after decoding with value representation.
df

Unnamed: 0,Gender,Age,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Vintage,Response
0,M,44,0,> 2 Years,1,40454,217,1
1,M,76,0,1-2 Year,0,33536,183,0
2,M,47,0,> 2 Years,1,38294,27,1
3,M,21,1,< 1 Year,0,28619,203,0
4,F,29,1,< 1 Year,0,27496,39,0
...,...,...,...,...,...,...,...,...
381104,M,74,1,1-2 Year,0,30170,88,0
381105,M,30,1,< 1 Year,0,40016,131,0
381106,M,21,1,< 1 Year,0,35118,161,0
381107,F,68,0,> 2 Years,1,44617,74,0


In [50]:
# Show which features df_features currently tracks as string-typed.
df_features.string_features()

{'Gender', 'Vehicle_Age'}

In [51]:
# Features to one-hot encode: union of the string and categorical feature sets.
# In the recorded output this equals the string features alone.
qualitative_features = df_features.string_features() | df_features.categorical_features()
qualitative_features

{'Gender', 'Vehicle_Age'}

In [52]:
# One-hot encode the qualitative features (recorded output below: Gender and
# Vehicle_Age are replaced by one True/False column per observed value).
data_encoder.make_dummies(df,
                          df_features,
                          qualitative_features=qualitative_features)

In [53]:
# Check: expect Gender_* and Vehicle_Age_* dummy columns in place of the originals.
df

Unnamed: 0,Age,Previously_Insured,Vehicle_Damage,Annual_Premium,Vintage,Response,Gender_F,Gender_M,Vehicle_Age_1-2 Year,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years
0,44,0,1,40454,217,1,False,True,False,False,True
1,76,0,0,33536,183,0,False,True,True,False,False
2,47,0,1,38294,27,1,False,True,False,False,True
3,21,1,0,28619,203,0,False,True,False,True,False
4,29,1,0,27496,39,0,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...
381104,74,1,0,30170,88,0,False,True,True,False,False
381105,30,1,0,40016,131,0,False,True,False,True,False
381106,21,1,0,35118,161,0,False,True,False,True,False
381107,68,0,1,44617,74,0,True,False,False,False,True


In [54]:
# Collapse the dummy columns back into their source features (recorded output
# below: Gender and Vehicle_Age reappear as the last two columns).
data_encoder.revert_dummies(df,
                            df_features,
                            qualitative_features=qualitative_features)

In [55]:
# Check: dummy columns gone; Gender (as M/F) and Vehicle_Age restored.
df

Unnamed: 0,Age,Previously_Insured,Vehicle_Damage,Annual_Premium,Vintage,Response,Gender,Vehicle_Age
0,44,0,1,40454,217,1,M,> 2 Years
1,76,0,0,33536,183,0,M,1-2 Year
2,47,0,1,38294,27,1,M,> 2 Years
3,21,1,0,28619,203,0,M,< 1 Year
4,29,1,0,27496,39,0,F,< 1 Year
...,...,...,...,...,...,...,...,...
381104,74,1,0,30170,88,0,M,1-2 Year
381105,30,1,0,40016,131,0,M,< 1 Year
381106,21,1,0,35118,161,0,M,< 1 Year
381107,68,0,1,44617,74,0,F,> 2 Years


In [56]:
# Print the feature-type summary held by df_features after the round trips above.
df_features.display_features()

String Features: {'Gender', 'Vehicle_Age'}

------------------------------------------------------------------------------------------
Bool Features: {'Previously_Insured', 'Response', 'Vehicle_Damage'}

------------------------------------------------------------------------------------------
Numerical Features: {'Annual_Premium', 'Age', 'Vintage'}

Integer Features: {'Annual_Premium', 'Age', 'Vintage'}

------------------------------------------------------------------------------------------
Target Feature: Response



# Decode and apply value representation for feature analysis

In [57]:
# Put df into its value-represented form before running the analyses
# (recorded output below is unchanged: Gender already holds M/F).
data_encoder.apply_value_representation(df,
                                        df_features)

In [58]:
# Check: Gender should read M / F.
df

Unnamed: 0,Age,Previously_Insured,Vehicle_Damage,Annual_Premium,Vintage,Response,Gender,Vehicle_Age
0,44,0,1,40454,217,1,M,> 2 Years
1,76,0,0,33536,183,0,M,1-2 Year
2,47,0,1,38294,27,1,M,> 2 Years
3,21,1,0,28619,203,0,M,< 1 Year
4,29,1,0,27496,39,0,F,< 1 Year
...,...,...,...,...,...,...,...,...
381104,74,1,0,30170,88,0,M,1-2 Year
381105,30,1,0,40016,131,0,M,< 1 Year
381106,21,1,0,35118,161,0,M,< 1 Year
381107,68,0,1,44617,74,0,F,> 2 Years


In [59]:
# Decode df, this time without passing apply_value_representation
# (recorded output below shows no visible change).
data_encoder.decode_data(df,
                         df_features)

In [60]:
# Check the frame that will be handed to the analysis steps below.
df

Unnamed: 0,Age,Previously_Insured,Vehicle_Damage,Annual_Premium,Vintage,Response,Gender,Vehicle_Age
0,44,0,1,40454,217,1,M,> 2 Years
1,76,0,0,33536,183,0,M,1-2 Year
2,47,0,1,38294,27,1,M,> 2 Years
3,21,1,0,28619,203,0,M,< 1 Year
4,29,1,0,27496,39,0,F,< 1 Year
...,...,...,...,...,...,...,...,...
381104,74,1,0,30170,88,0,M,1-2 Year
381105,30,1,0,40016,131,0,M,< 1 Year
381106,21,1,0,35118,161,0,M,< 1 Year
381107,68,0,1,44617,74,0,F,> 2 Years


In [61]:
# Print the feature-type summary once more before it is exported to JSON.
df_features.display_features()

String Features: {'Gender', 'Vehicle_Age'}

------------------------------------------------------------------------------------------
Bool Features: {'Previously_Insured', 'Response', 'Vehicle_Damage'}

------------------------------------------------------------------------------------------
Numerical Features: {'Annual_Premium', 'Age', 'Vintage'}

Integer Features: {'Annual_Premium', 'Age', 'Vintage'}

------------------------------------------------------------------------------------------
Target Feature: Response



## Create a json file of df_features

In [62]:
# Build the "eflow Data/<dataset_name>" directory structure under the current
# working directory and keep the returned path for the JSON export below.
created_dir = create_dir_structure(os.getcwd(),
                                   f"/eflow Data/{dataset_name}")

In [63]:
# Export df_features as "df_features.json" inside created_dir.
df_features.create_json_file_representation(created_dir,
                                            "df_features.json")

In [64]:
# Debugging stop (disabled): uncommenting the division below raises
# ZeroDivisionError and halts "Run All" at this cell. The error shown in the
# recorded output is presumably left over from a run where it was enabled.
# 1/0

ZeroDivisionError: division by zero

In [65]:
# Optional (disabled): rebuild df_features from a df_features.json file under
# "eflow Data/<dataset_name>" -- presumably the file exported above -- instead
# of reusing the in-memory object.
# df_features = DataFrameTypes()
# df_features.init_on_json_file(os.getcwd() + f"/eflow Data/{dataset_name}/df_features.json")

In [66]:
# df_features.display_features()

# Feature Analysis of feature data

In [67]:
# Run feature analysis over the full dataset against the target feature, with
# printing and visual display switched off. The cells below read pickled tables
# from feature_analysis.folder_path under "Full <dataset_name>".
feature_analysis = FeatureAnalysis(df_features,
                                   project_sub_dir=inspect_data_project_dir)
feature_analysis.perform_analysis(df,
                                  dataset_name="Full " + dataset_name,
                                  target_features=[df_features.target_feature()],
                                  suppress_runtime_errors=True,
                                  display_print=False,
                                  display_visuals=False,
                                  dataframe_snapshot=True,
                                  statistical_analysis_on_aggregates=True)



# Get P-value summary on aggregation info for statistical methods

In [68]:
# Load the per-method statistics tables pickled under the "Full" analysis
# folder and, for each method, show the table followed by the set of features
# named in its first ten relationships.
stats_pickle_path = (feature_analysis.folder_path + "Full " + dataset_name
                     + "/_Extras/Statistics/Stat methods of features dataframes.pkl")

# The context manager closes the file even when unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code; only load pickles
# written by this project's own analysis run.
with open(stats_pickle_path, 'rb') as infile:
    stat_methods_dict = pickle.load(infile)

for stats_method in stat_methods_dict:
    print(stats_method)
    display(stat_methods_dict[stats_method].round(6))
    # Index labels have the form "<feature> compared to <feature>"; gather
    # every feature that appears in the first ten rows.
    all_feature_relationship = set()
    for feature_relationship in stat_methods_dict[stats_method][:10].index.to_list():
        for feature in feature_relationship.split(" compared to "):
            all_feature_relationship.add(feature)
    print(all_feature_relationship)

# The tables are only needed for this display.
del stat_methods_dict

Kolmogorov-Smirnov statistic


  and should_run_async(code)


Unnamed: 0,mean,std,var
Response compared to Previously_Insured,0.0,0.0,0.0
Response compared to Vehicle_Damage,0.0,0.0,0.0
Response compared to Age,0.0,0.0,0.0
Response compared to Vehicle_Age,0.0,0.0,0.0
Response compared to Gender,0.0,0.0,0.0
Response compared to Annual_Premium,0.0,0.0,0.0
Response compared to Vintage,0.793601,0.193027,0.03726


{'Previously_Insured', 'Response', 'Gender', 'Age', 'Vehicle_Age', 'Annual_Premium', 'Vintage', 'Vehicle_Damage'}


# Get entropy table

In [69]:
# Load and display the entropy table pickled under the "Full" analysis folder.
entropy_pickle_path = (feature_analysis.folder_path + "Full " + dataset_name
                       + "/_Extras/Statistics/Entropy Table.pkl")

# The context manager closes the file even when unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code; only load pickles
# written by this project's own analysis run.
with open(entropy_pickle_path, 'rb') as infile:
    entropy_table = pickle.load(infile)
entropy_table

  and should_run_async(code)


Unnamed: 0_level_0,Entropy
Features,Unnamed: 1_level_1
Response,0.372001
Previously_Insured,0.68965
Gender,0.689821
Vehicle_Damage,0.6931
Vehicle_Age,0.833743


In [70]:
# Drop the analysis object; a fresh FeatureAnalysis is created further down for
# the binned data.
del feature_analysis

  and should_run_async(code)


# Null Analysis of data

In [71]:
# Run the null-value analysis on the full dataset, restricted to features that
# contain nulls (null_features_only=True), with visuals displayed and printing off.
null_analysis = NullAnalysis(df_features,
                             project_sub_dir=inspect_data_project_dir,
                             notebook_mode=notebook_mode)

null_analysis.perform_analysis(df,
                               dataset_name="Full " + dataset_name,
                               null_features_only=True,
                               display_visuals=True,
                               display_print=False,
                               dataframe_snapshot=True)

Exiting perform data_analysis function call


In [72]:
# Summarise missing values per feature, then keep the names of features whose
# missing share is above 15 percent (none in the recorded run).
missing_table = missing_values_table(df)
display(missing_table)
exceeds_missing_cutoff = missing_table["% of Total Values"] > 15
nan_features = missing_table.loc[exceeds_missing_cutoff].index.to_list()
nan_features

Unnamed: 0_level_0,Missing Values,% of Total Values
Features,Unnamed: 1_level_1,Unnamed: 2_level_1


[]

In [73]:
# Feature analysis of the null data for the features collected in nan_features
# (an empty list in the recorded run), against the target feature; results are
# saved to file, with display and printing off.
null_analysis.feature_analysis_of_null_data(df,
                                            "Full " + dataset_name,
                                            target_features=[df_features.target_feature()],
                                            display_visuals=False,
                                            display_print=False,
                                            save_file=True,
                                            suppress_runtime_errors=True,
                                            aggregate_target_feature=True,
                                            extra_tables=True,
                                            nan_features=nan_features)

  and should_run_async(code)


In [74]:
# Drop the analysis object; a fresh NullAnalysis is created further down for
# the binned data.
del null_analysis

# Analyze data after binning

In [75]:
# Features df_features classifies as continuous numerical; these are the ones
# binned and analysed in the cells below.
continuous_numerical_features = df_features.continuous_numerical_features()

In [76]:
# Replace the continuous numerical columns of df with the bin labels registered
# on df_features earlier (recorded output below: Age, Annual_Premium, Vintage).
data_encoder.apply_binning(df,
                           df_features,
                           continuous_numerical_features)

In [77]:
# Check the binned frame.
# NOTE(review): every visible Annual_Premium row in the recorded output falls in
# the first bin ("2092 ⟷ 110137"), and its entropy further down is ~0.01 --
# the equal-width bins may be too coarse for this feature.
df

Unnamed: 0,Age,Previously_Insured,Vehicle_Damage,Annual_Premium,Vintage,Response,Gender,Vehicle_Age
0,34 ⟷ 46,0,1,2092 ⟷ 110137,184 ⟷ 241,1,M,> 2 Years
1,73 ⟷ 85,0,0,2092 ⟷ 110137,126 ⟷ 183,0,M,1-2 Year
2,47 ⟷ 59,0,1,2092 ⟷ 110137,9 ⟷ 67,1,M,> 2 Years
3,19 ⟷ 33,1,0,2092 ⟷ 110137,184 ⟷ 241,0,M,< 1 Year
4,19 ⟷ 33,1,0,2092 ⟷ 110137,9 ⟷ 67,0,F,< 1 Year
...,...,...,...,...,...,...,...,...
381104,73 ⟷ 85,1,0,2092 ⟷ 110137,68 ⟷ 125,0,M,1-2 Year
381105,19 ⟷ 33,1,0,2092 ⟷ 110137,126 ⟷ 183,0,M,< 1 Year
381106,19 ⟷ 33,1,0,2092 ⟷ 110137,126 ⟷ 183,0,M,< 1 Year
381107,60 ⟷ 72,0,1,2092 ⟷ 110137,68 ⟷ 125,0,F,> 2 Years


# Feature Analysis of feature data after binning

In [78]:
# Repeat the feature analysis on the binned frame, limited to the continuous
# numerical features, with no dataframe snapshot and no statistical analysis on
# aggregates. Results go under "Binned Continuous <dataset_name>".
feature_analysis = FeatureAnalysis(df_features,
                                   project_sub_dir=inspect_data_project_dir)
feature_analysis.perform_analysis(df,
                                  dataset_name= "Binned Continuous " + dataset_name,
                                  target_features=[df_features.target_feature()],
                                  suppress_runtime_errors=True,
                                  display_print=False,
                                  display_visuals=False,
                                  dataframe_snapshot=False,
                                  selected_features=continuous_numerical_features,
                                  statistical_analysis_on_aggregates=False)

  and should_run_async(code)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0.0, flags=flags

# Get entropy table

In [79]:
# Load and display the entropy table pickled under the "Binned Continuous"
# analysis folder.
entropy_pickle_path = (feature_analysis.folder_path + "Binned Continuous " + dataset_name
                       + "/_Extras/Statistics/Entropy Table.pkl")

# The context manager closes the file even when unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code; only load pickles
# written by this project's own analysis run.
with open(entropy_pickle_path, 'rb') as infile:
    entropy_table = pickle.load(infile)
entropy_table

  and should_run_async(code)


Unnamed: 0_level_0,Entropy
Features,Unnamed: 1_level_1
Annual_Premium,0.010583
Response,0.372001
Previously_Insured,0.68965
Gender,0.689821
Vehicle_Damage,0.6931
Vehicle_Age,0.833743
Age,1.329928
Vintage,1.609434


In [80]:
# Drop the analysis object; it is not used again in the remaining cells.
del feature_analysis

  and should_run_async(code)


# Null Analysis of data after binning

In [81]:
# Fresh NullAnalysis instance for the binned data (the earlier one was deleted).
null_analysis = NullAnalysis(df_features,
                             project_sub_dir=inspect_data_project_dir,
                             notebook_mode=notebook_mode)

In [82]:
# Feature analysis of the null data on the binned frame, limited to the
# continuous numerical features. nan_features is reused from the missing-values
# cell earlier (an empty list in the recorded run).
null_analysis.feature_analysis_of_null_data(df,
                                            "Binned Continuous " + dataset_name,
                                            target_features=[df_features.target_feature()],
                                            display_visuals=False,
                                            display_print=False,
                                            save_file=True,
                                            selected_features=continuous_numerical_features,
                                            suppress_runtime_errors=True,
                                            aggregate_target_feature=True,
                                            extra_tables=True,
                                            nan_features=nan_features,
                                            statistical_analysis_on_aggregates=False)

In [83]:
# Drop the analysis object; it is not used again in the remaining cells.
del null_analysis

In [84]:
# Housekeeping helper imported from eflow.utils.eflow_utils.
# NOTE(review): presumably removes pipeline-segment files that no pipeline
# references any more -- confirm what it deletes before running.
remove_unconnected_pipeline_segments()