# Import libs

In [1]:
import sys
import os
sys.path.append('..')
from eflow.foundation import DataPipeline
from eflow.foundation import DataFrameTypes
from eflow.data_analysis import FeatureAnalysis
from eflow.data_analysis import NullAnalysis
from eflow.model_analysis import ClassificationAnalysis
from eflow.data_pipeline_segments import DataTransformer
from eflow.data_pipeline_segments import TypeFixer
from eflow.utils.modeling_utils import optimize_model_grid
from eflow.utils.eflow_utils import get_type_holder_from_pipeline
from eflow.utils.math_utils import get_unbalanced_threshold
from eflow.utils.eflow_utils import remove_unconnected_pipeline_segments
from eflow.utils.sys_utils import create_dir_structure
from eflow.utils.eflow_utils import create_color_dict_for_features
from eflow.utils.pandas_utils import data_types_table, value_counts_table, suggest_removal_features 

import pandas as pd
import numpy as np
import scikitplot as skplt
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import copy
from IPython.display import clear_output
from IPython.core.getipython import get_ipython
import ipython_blocking
import nltk

In [2]:
# # Additional add ons
# !pip install pandasgui
# !pip install pivottablejs
# clear_output()

In [3]:
%matplotlib notebook
%matplotlib inline

## Juypter notebook generating cells

### Important Note: Replace if set to True will remove all the contents of whatever cell it is called in. But it can be undone with a simple CMD + Z. 🙂

## Declare Project Variables

### Interaction required

In [4]:
dataset_path = "Datasets/titanic_train.csv"
dataset_name = "Titanic Data"

# -----
peek_project_dir = f"{dataset_name}/Before Cleaning"

# -----
notebook_mode = True

# Import dataset

In [5]:
df = pd.read_csv(dataset_path)
shape_df = pd.DataFrame.from_dict({'Rows': [df.shape[0]],
                                   'Columns': [df.shape[1]]})
display(shape_df)
display(df.head(30))

Unnamed: 0,Rows,Columns
0,891,12


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


### Throwing in garbage data for testing purposes only!!!

In [6]:
df["Sex"][0] = np.nan
df["Sex"][2] = "Blarg"
df["Age"] = df["Age"].astype('object')
df["Age"][33] = "      "
df["Pclass"][50] = "2sdf,asdqw"
df["Pclass"][50] = "4th Class"
df["SibSp"][2] = np.nan
df["Embarked"][2] = 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

In [7]:
data_types_table(df)

Unnamed: 0_level_0,Data Types
Features,Unnamed: 1_level_1
SibSp,float64
Fare,float64
PassengerId,int64
Survived,int64
Parch,int64
Pclass,object
Name,object
Sex,object
Age,object
Ticket,object


## Remove/Declare any unwanted features

### Interaction required

Note: When starting a new project uncomment the function to get suggestions and then run the cell again.

In [8]:
# create_new_cell_with_removal_features(df,variable_name="removal_features",replace=True)
removal_features = ["Name","PassengerId","Ticket"]

In [9]:
df.drop(columns=removal_features,
        inplace=True)

In [10]:
data_types_table(df)

Unnamed: 0_level_0,Data Types
Features,Unnamed: 1_level_1
SibSp,float64
Fare,float64
Survived,int64
Parch,int64
Pclass,object
Sex,object
Age,object
Cabin,object
Embarked,object


## Gui tools for quick analysis dataframes

Great interface; pauses the program; comment on/off at free will.
You will need to reset kernel after use more than likely.

In [11]:
# from pandasgui import show as qt_display
# qt_display(df)
# %matplotlib inline

In [12]:
# pivot_ui(df,
#          outfile_path='Piviot_Table_JS.html')

## Skim through Value Counts

In [13]:
for feature_name in df.columns:
    print(f'******* Feature: {feature_name} *******')
    print(f'Type: {df[feature_name].dtype}')
    display(value_counts_table(df,
                               feature_name))
    print("-------" * 4 + "\n\n")

******* Feature: Survived *******
Type: int64


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
0,549,61.6162%
1,342,38.3838%


----------------------------


******* Feature: Pclass *******
Type: object


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
3,490,54.9944%
1,216,24.2424%
2,184,20.6510%
4th Class,1,0.1122%


----------------------------


******* Feature: Sex *******
Type: object


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
male,576,64.7191%
female,313,35.1685%
Blarg,1,0.1124%


----------------------------


******* Feature: Age *******
Type: object


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
24.0,30,4.2017%
22.0,27,3.7815%
18.0,26,3.6415%
30.0,25,3.5014%
28.0,25,3.5014%
...,...,...
0.42,1,0.1401%
,1,0.1401%
0.92,1,0.1401%
53.0,1,0.1401%


----------------------------


******* Feature: SibSp *******
Type: float64


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,607,68.2022%
1.0,209,23.4831%
2.0,28,3.1461%
4.0,18,2.0225%
3.0,16,1.7978%
8.0,7,0.7865%
5.0,5,0.5618%


----------------------------


******* Feature: Parch *******
Type: int64


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
0,678,76.0943%
1,118,13.2435%
2,80,8.9787%
5,5,0.5612%
3,5,0.5612%
4,4,0.4489%
6,1,0.1122%


----------------------------


******* Feature: Fare *******
Type: float64


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
8.0500,43,4.8260%
13.0000,42,4.7138%
7.8958,38,4.2649%
7.7500,34,3.8159%
26.0000,31,3.4792%
...,...,...
8.4583,1,0.1122%
9.8375,1,0.1122%
8.3625,1,0.1122%
14.1083,1,0.1122%


----------------------------


******* Feature: Cabin *******
Type: object


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
B96 B98,4,1.9608%
G6,4,1.9608%
C23 C25 C27,4,1.9608%
C22 C26,3,1.4706%
F2,3,1.4706%
...,...,...
B19,1,0.4902%
C111,1,0.4902%
B42,1,0.4902%
E46,1,0.4902%


----------------------------


******* Feature: Embarked *******
Type: object


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
S,643,72.3285%
C,168,18.8976%
Q,77,8.6614%
1,1,0.1125%


----------------------------




# Mark target feature; set to None if not needed

### Interaction required

### Supervised learning problems (Can be set to None)

In [14]:
target_feature = "Survived"

In [15]:
try:
    if target_feature:
        df[target_feature]
except KeyError:
    raise KeyError(f"The target feature \'{target_feature}\' was not found in the dataframe!"
                   + " Please select a valid feature from the dataframe")

In [16]:
if target_feature:
    print(f"Target feature '{target_feature}'")
    print("----" * 10)
    target_amount = len(df[target_feature].dropna().value_counts().index)
    value_count_df = value_counts_table(df,
                                        target_feature)
    if target_amount < 1:
        display(value_count_df)
    else:
        max_unbalanced_class_threshold, min_unbalanced_class_threshold = get_unbalanced_threshold(target_amount)
        print(f"max_unbalanced_class_threshold = {max_unbalanced_class_threshold * 100:.3f}%")
        print(f"min_unbalanced_class_threshold = {min_unbalanced_class_threshold * 100:.3f}%")
        display(value_count_df)
        index = 0
        for percentage in value_count_df["Percantage"]:
            percentage = float(percentage[:-1])/100
            if percentage >= max_unbalanced_class_threshold or percentage <= min_unbalanced_class_threshold:
                print(f"The value '{value_count_df.index.values[index]}' is causing the target feature to be unbalanced.\n" +
                      "This could cause a model to not properly generalize itself.")
                print("---" * 10 + "\n")

            index += 1

Target feature 'Survived'
----------------------------------------
max_unbalanced_class_threshold = 65.000%
min_unbalanced_class_threshold = 35.000%


Unnamed: 0_level_0,Counts,Percantage
Unique Values,Unnamed: 1_level_1,Unnamed: 2_level_1
0,549,61.6162%
1,342,38.3838%


# Load/Init DataFrameTypes object.

This object is used to store an abstracted form of what a feature 'should be' rather than what the pandas dataframe object says it is. In this case we will be specifying all features correct types.

Comment out/remove depending on how you want your design flow to be.

In [17]:
df_features = DataFrameTypes(df,
                             ignore_nulls=True,
                             fix_numeric_features=True,
                             fix_string_features=True,
                             target_feature=target_feature,
                             notebook_mode=notebook_mode)


Moving feature 'Age' to type float.

Moving feature 'Pclass' to type category.
           Bool  Numeric  Float  Integer  Categorical
Survived   True     True  False     True        False
Age       False     True   True    False        False
SibSp     False     True  False     True        False
Parch     False     True  False     True        False
Fare      False     True   True    False        False


## Make any changes to 'df_features' that automated type assertions messed up.

Ex: Sometimes df_features will think a feature is a category when it isn't. Move to proper types.

In [18]:
df_features.set_feature_to_bool(feature_name=[])
df_features.set_feature_to_integer(feature_name=[])
df_features.set_feature_to_float(feature_name=[])
df_features.set_feature_to_string(feature_name=[])
df_features.set_feature_to_datetime(feature_name=[])
df_features.set_feature_to_categorical(feature_name=[])

print("df_features types:")
df_features.display_features(display_dataframes=True,
                             notebook_mode=notebook_mode)

In [19]:
print("df_features:")
df_features.display_features()

df_features:
String Features: {'Embarked', 'Sex', 'Cabin'}

Categorical Features: {'Pclass'}

------------------------------------------------------------------------------------------
Bool Features: {'Survived'}

------------------------------------------------------------------------------------------
Numerical Features: {'Parch', 'SibSp', 'Age', 'Fare'}

Integer Features: {'Parch', 'SibSp'}

Float Features: {'Age', 'Fare'}

------------------------------------------------------------------------------------------
Target Feature: Survived



In [20]:
print("Dataframe types:")
data_types_table(df)

Dataframe types:


Unnamed: 0_level_0,Data Types
Features,Unnamed: 1_level_1
SibSp,float64
Fare,float64
Survived,int64
Parch,int64
Pclass,object
Sex,object
Age,object
Cabin,object
Embarked,object


# Any basic manipulation of features

#### What I mean by this is say you want to represent a feature slightly different than it is currently displaying.
Note: that whatever maniuplation you do here you should bring to each notebook's section of "Any basic manipulation of features"

In [21]:
display({val[0] for val in set(df["Cabin"].dropna().values)})
df["Cabin"] = [val[0] if isinstance(val,str) else val for val in df["Cabin"]]
df["Cabin"]

{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'}

0      NaN
1        C
2      NaN
3        C
4      NaN
      ... 
886    NaN
887      B
888    NaN
889      C
890    NaN
Name: Cabin, Length: 891, dtype: object

# Colors and palletes for features

### Interaction required

### Remove any unwanted values found or any unwanted features to be color coded.

In [22]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ericcacciavillani/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [23]:
# create_new_cell_with_feature_value_color_dict(df,df_features,variable_name="feature_value_color_dict",value_limit=50,replace=True)
feature_value_color_dict=dict()

feature_value_color_dict["Sex"] = dict()
feature_value_color_dict["Sex"]["male"] = "#7eaed3"
feature_value_color_dict["Sex"]["female"] = "#ffb6c1"

feature_value_color_dict["Survived"] = dict()
feature_value_color_dict["Survived"][0] = "#616369"
feature_value_color_dict["Survived"][1] = "#4dad6c"

feature_value_color_dict["Pclass"] = dict()
feature_value_color_dict["Pclass"][1] = "#d4af37"
feature_value_color_dict["Pclass"][2] = "#c0c0c0"
feature_value_color_dict["Pclass"][3] = "#cd7f32"

In [24]:
from eflow.widgets import ColorLabelingWidget

cleaning_widget = ColorLabelingWidget()
cleaning_widget.run_widget(feature_value_color_dict)

interactive(children=(Select(description='Features', layout=Layout(height='175px', width='50%'), options=('Sex…

### Reaccess feature color dictionary

In [25]:
feature_value_color_dict = cleaning_widget.get_feature_value_color_dict()

In [26]:
feature_value_color_dict

{'Sex': {'male': '#7eaed3', 'female': '#ffb6c1'},
 'Survived': {0: '#616369', 1: '#4dad6c'},
 'Pclass': {1: '#d4af37', 2: '#c0c0c0', 3: '#cd7f32'}}

In [27]:
df_features.set_feature_colors(feature_value_color_dict)

# Label categories if possible

### Interaction required

### It's considered good practice to label up your categories with proper labels for graphing/analysis

In [28]:
# create_new_cell_with_categorical_dict(df,df_features,variable_name="categorical_value_dict",replace=True)
categorical_value_dict = dict()
categorical_value_dict["Pclass"] = dict()
categorical_value_dict["Pclass"][1] = "1st Class"
categorical_value_dict["Pclass"][2] = "2nd Class"
categorical_value_dict["Pclass"][3] = "3rd Class"

In [29]:
from sklearn import preprocessing
from eflow._hidden.custom_exceptions import UnsatisfiedRequirments
label_encoder = dict()

categorical_string_features = df_features.get_categorical_features() | df_features.get_string_features()

for feature_name in categorical_string_features:
    
    label_encoder[feature_name] = dict()
    
    feature_values = [str(val) for val in df[feature_name].dropna().value_counts(sort=False).index.to_list()]
    feature_values.sort()
    
    for i in range(0,len(feature_values)):
        try:
            feature_values[i] = int(feature_values[i])
        except ValueError:
            pass
            
    default_categories = [i for i in range(0,len(feature_values))]
    if feature_name in categorical_value_dict.keys():
        for cat, label in categorical_value_dict[feature_name].items():
            if label == "":
                raise UnsatisfiedRequirments(f"Can't replace category value with empty space! Found on feature {feature_name}")
            
            if isinstance(label,int):
                raise UnsatisfiedRequirments(f"Can't replace category value with a number based value! Found on feature {feature_name}")
            
            if not isinstance(cat,int):
                raise UnsatisfiedRequirments(f"Category value must be a number based value! Found on feature {feature_name}")
            
            label_encoder[feature_name][label] = int(cat)
            
            if cat in default_categories:
                default_categories.remove(cat)
                
            if cat in feature_values:
                feature_values.remove(cat)
                
            if label in feature_values:
                feature_values.remove(label)
    
    cat_count = 0
    for val in feature_values:
        
        if isinstance(val,int):
            label_encoder[feature_name][val] = val
        else:
            label_encoder[feature_name][val] = default_categories[cat_count]
            cat_count += 1

    # Inverse dict
    categorical_encoder = dict()
    for feature_name, label_val_dict in label_encoder.items():
        categorical_encoder[feature_name] = dict()
        for label,cat in label_val_dict.items():
            categorical_encoder[feature_name][cat] = label

print(label_encoder)
print("\n\n")
print(categorical_encoder)

{'Sex': {'Blarg': 0, 'female': 1, 'male': 2}, 'Cabin': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7}, 'Pclass': {'1st Class': 1, '2nd Class': 2, '3rd Class': 3, '4th Class': 0}, 'Embarked': {1: 1, 'C': 0, 'Q': 1, 'S': 2}}



{'Sex': {0: 'Blarg', 1: 'female', 2: 'male'}, 'Cabin': {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'T'}, 'Pclass': {1: '1st Class', 2: '2nd Class', 3: '3rd Class', 0: '4th Class'}, 'Embarked': {1: 'Q', 0: 'C', 2: 'S'}}


In [30]:
raise sbjfjskadfkjsdnf

NameError: name 'sbjfjskadfkjsdnf' is not defined

# General Analysis of data

In [None]:
feature_analysis = FeatureAnalysis(df_features,
                                   project_sub_dir=peek_project_dir)
feature_analysis.perform_analysis(df,
                                  dataset_name=dataset_name)
del feature_analysis

# Analysis of null data

In [None]:
null_analysis = NullAnalysis(df_features,
                             project_sub_dir=peek_project_dir,
                             notebook_mode=notebook_mode)

null_analysis.perform_analysis(df,
                               dataset_name=dataset_name,
                               null_features_only=True)

del null_analysis

## Create a json file of df_features

In [None]:
created_dir = create_dir_structure(os.getcwd(),
                                   f"/eflow Data/{dataset_name}")

In [None]:
df_features.create_json_file_representation(created_dir,
                                            "df_features.json")

In [None]:
df_features = DataFrameTypes()
df_features.init_on_json_file(os.getcwd() + f"/eflow Data/{dataset_name}/df_features.json")

In [None]:
df_features.display_features()