Find the markdown blocks that say interaction required! The notebook should take care of the rest!

# Import libs

In [1]:
import sys
import os
sys.path.append('..')
from eflow.foundation import DataPipeline,DataFrameTypes
from eflow.model_analysis import ClassificationAnalysis
from eflow.utils.modeling_utils import optimize_model_grid
from eflow.utils.eflow_utils import get_type_holder_from_pipeline, remove_unconnected_pipeline_segments
from eflow.utils.pandas_utils import data_types_table
from eflow.utils.sys_utils import get_all_directories_from_path
from eflow.utils.sys_utils import load_pickle_object

import pandas as pd
import numpy as np
import scikitplot as skplt
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import SCORERS
import copy
import pickle
from IPython.display import clear_output

In [2]:
# # Additional add ons
# !pip install pandasgui
# !pip install pivottablejs
# clear_output()

In [3]:
%matplotlib notebook
%matplotlib inline

## Declare Project Variables

### Interaction required

In [4]:
dataset_path = "Datasets/titanic_train.csv"

# -----
dataset_name = "Titanic Data"
pipeline_name = "Titanic Pipeline"

# -----


# -----
notebook_mode = True

## Clean out segment space

In [5]:
remove_unconnected_pipeline_segments()

# Import dataset

In [6]:
df = pd.read_csv(dataset_path)
shape_df = pd.DataFrame.from_dict({'Rows': [df.shape[0]],
                                   'Columns': [df.shape[1]]})
display(shape_df)
display(df.head(30))

Unnamed: 0,Rows,Columns
0,891,12


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [7]:
data_types_table(df)

Unnamed: 0_level_0,Data Types
Features,Unnamed: 1_level_1
Age,float64
Fare,float64
PassengerId,int64
Survived,int64
Pclass,int64
SibSp,int64
Parch,int64
Name,object
Sex,object
Ticket,object


# Loading and init df_features

In [8]:
# Option: 1
# df_features = get_type_holder_from_pipeline(pipeline_name)

In [9]:
# Option: 2
df_features = DataFrameTypes()
df_features.init_on_json_file(os.getcwd() + f"/eflow Data/{dataset_name}/df_features.json")

In [10]:
df_features.display_features(display_dataframes=True,
                             notebook_mode=notebook_mode)

Unnamed: 0_level_0,Data Types
Features,Unnamed: 1_level_1
Embarked,string
Cabin,string
Sex,string
Survived,bool
Parch,integer
SibSp,integer
Fare,float
Age,float
Pclass,category


# Any extra processing before eflow DataPipeline

In [11]:
display({val[0] for val in set(df["Cabin"].dropna().values)})
df["Cabin"] = [val[0] if isinstance(val,str) else val for val in df["Cabin"]]
df["Cabin"]

{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'}

0      NaN
1        C
2      NaN
3        C
4      NaN
      ... 
886    NaN
887      B
888    NaN
889      C
890    NaN
Name: Cabin, Length: 891, dtype: object

# Setup pipeline structure

### Interaction Required

In [12]:
main_pipe = DataPipeline(pipeline_name,
                         df,
                         df_features)

The file 'root_pipeline.json' exist!
Now configuring object with proper pipeline segments...
Removing the feature: "Ticket"
Removing the feature: "PassengerId"
Removing the feature: "Name"


In [13]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.2500,,S
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.9250,,S
3,1,1,female,35.0,1,0,53.1000,C,S
4,0,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,,S
887,1,1,female,19.0,0,0,30.0000,B,S
888,0,3,female,,1,2,23.4500,,S
889,1,1,male,26.0,0,0,30.0000,C,C


In [14]:
main_pipe.perform_pipeline(df,
                           df_features)

In [15]:
df

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Embarked_Cherbourg,Embarked_Queenstown,Embarked_Southampton,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3
0,0,22.0,1,0,7.2500,False,False,True,False,True,False,False,True
1,1,38.0,1,0,71.2833,True,False,False,True,False,True,False,False
2,1,26.0,0,0,7.9250,False,False,True,True,False,False,False,True
3,1,35.0,1,0,53.1000,False,False,True,True,False,True,False,False
4,0,35.0,0,0,8.0500,False,False,True,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,0,0,13.0000,False,False,True,False,True,False,True,False
887,1,19.0,0,0,30.0000,False,False,True,True,False,True,False,False
888,0,28.0,1,2,23.4500,False,False,True,True,False,False,False,True
889,1,26.0,0,0,30.0000,True,False,False,False,True,True,False,False


# Seperate out data into train and test sets

In [16]:
X = df.drop(columns=df_features.target_feature()).values
y = df[df_features.target_feature()].values

In [17]:
feature_order = list(df.columns)

In [18]:
del df

In [19]:
from sklearn.ensemble import IsolationForest
clf = IsolationForest(n_estimators=20, warm_start=True)
clf.fit(X)  # fit 10 trees  
clf.set_params(n_estimators=20)  # add 10 more trees  
clf.fit(X)

  warn("Warm-start fitting without increasing n_estimators does not "


IsolationForest(n_estimators=20, warm_start=True)

In [28]:
clf.score_samples(X).max()

-0.3544744705913975

In [25]:
from sklearn.covariance import EllipticEnvelope
cov = EllipticEnvelope(random_state=0,assume_centered=True).fit(X)



In [37]:
clf.score_samples

<bound method IsolationForest.score_samples of IsolationForest(n_estimators=20, warm_start=True)>

In [50]:
from sklearn.svm import OneClassSVM
one_svm = OneClassSVM(gamma='auto').fit(X)
one_svm.decision_function.__code__.co_varnames

('self', 'X', 'dec')

In [61]:
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler

for model in [cov,clf,one_svm]:
    for i in [-1,1]:
        if i == 1:
            print("Inner")
        else:
            print("Outlier")

        tmp_stuff = model.decision_function(X)[model.predict(X) == i].reshape(1,-1)
        print(model)
        print(tmp_stuff)
        print()
        print("Max:{0}\nMin:{1}\n".format(tmp_stuff[0].max(),tmp_stuff[0].min()))

Outlier
EllipticEnvelope(assume_centered=True, random_state=0)
[[-3.42913241e+02 -2.12382402e+01 -3.30657042e+02 -3.94949929e+02
  -2.69468717e+01 -1.31616539e+00 -7.15979143e+01 -2.92785235e-01
  -4.21656058e+01 -6.15932141e+01 -5.77325955e+01 -3.79174752e+02
  -4.11397465e+02 -2.46856798e+01 -2.84193263e+02 -3.33255263e+00
  -1.80430272e+02 -2.05378720e+01 -2.69753077e+02 -3.15995130e+01
  -5.50404039e+01 -2.84193263e+02 -2.39270988e+01 -2.43282365e+03
  -3.28784305e+01 -7.55401605e+01 -1.82806642e+01 -1.96581362e+01
  -8.12639648e+01 -4.10129758e+02 -9.09312345e+01 -4.12233263e+02
  -1.40870514e+02 -1.07320396e+01 -2.84193263e+02 -2.33338202e+01
  -7.17432262e+01 -2.16807798e+01 -3.79216862e+02 -1.85404112e+02
  -2.44698252e+01 -3.05142831e+02 -3.13697968e+02 -7.33352860e+01
  -2.64895230e+01 -2.13096133e+01 -5.08132021e+01 -6.62612196e+02
  -1.32953426e+01 -7.17631845e+01 -7.43218726e+01 -2.89469775e+02
  -4.48897516e-01 -2.33783358e+01 -2.32097887e+01 -4.75840448e+01
  -3.15992633