Find the markdown blocks that say interaction required! The notebook should take care of the rest!

# Import libs

In [1]:
import sys
import os
sys.path.append('..')
from eflow.foundation import DataPipeline,DataFrameTypes
from eflow.model_analysis import RegressionAnalysis
from eflow.utils.modeling_utils import optimize_model_grid
from eflow.utils.eflow_utils import get_type_holder_from_pipeline, remove_unconnected_pipeline_segments
from eflow.utils.pandas_utils import data_types_table

import pandas as pd
import numpy as np
import scikitplot as skplt
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import copy
import pickle
from IPython.display import clear_output

In [2]:
# # Additional add ons
# !pip install pandasgui
# !pip install pivottablejs
# clear_output()

In [3]:
%matplotlib notebook
%matplotlib inline

## Declare Project Variables

### Interaction required

In [4]:
dataset_path = "Datasets/titanic_train.csv"

# -----
dataset_name = "Titanic Data"
pipeline_name = "Titanic Pipeline"

# -----


# -----
notebook_mode = True

## Clean out segment space

In [5]:
remove_unconnected_pipeline_segments()

# Import dataset

In [6]:
df = pd.read_csv(dataset_path)
shape_df = pd.DataFrame.from_dict({'Rows': [df.shape[0]],
                                   'Columns': [df.shape[1]]})
display(shape_df)
display(df.head(30))

Unnamed: 0,Rows,Columns
0,891,12


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [7]:
data_types_table(df)

Unnamed: 0_level_0,Data Types
Features,Unnamed: 1_level_1
Age,float64
Fare,float64
PassengerId,int64
Survived,int64
Pclass,int64
SibSp,int64
Parch,int64
Name,object
Sex,object
Ticket,object


# Loading and init df_features

In [8]:
# Option: 1
# df_features = get_type_holder_from_pipeline(pipeline_name)

In [9]:
# Option: 2
df_features = DataFrameTypes()
df_features.init_on_json_file(os.getcwd() + f"/eflow Data/{dataset_name}/df_features.json")

In [10]:
df_features.display_features(display_dataframes=True,
                             notebook_mode=notebook_mode)

Unnamed: 0_level_0,Data Types
Features,Unnamed: 1_level_1
Cabin,string
Pclass,string
Embarked,string
Sex,string
Survived,bool
SibSp,integer
Parch,integer
Age,float
Fare,float


# Any extra processing before eflow DataPipeline

# Setup pipeline structure

### Interaction Required

In [11]:
main_pipe = DataPipeline(pipeline_name,
                         df,
                         df_features)

The file 'root_pipeline.json' exist!
Now configuring object with proper pipeline segments...
Removing the feature: "PassengerId"
Removing the feature: "Ticket"
Removing the feature: "Name"


In [12]:
main_pipe.perform_pipeline(df,
                           df_features)

Ignore feature:  Fare


In [13]:
df

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_Cherbourg,Embarked_Queenstown,Embarked_Southampton
0,0,22.000000,1,0,7.2500,False,True,False,False,True,False,False,True
1,1,38.000000,1,0,71.2833,True,False,True,False,False,True,False,False
2,1,26.000000,0,0,7.9250,True,False,False,False,True,False,False,True
3,1,35.000000,1,0,53.1000,True,False,True,False,False,False,False,True
4,0,35.000000,0,0,8.0500,False,True,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.000000,0,0,13.0000,False,True,False,True,False,False,False,True
887,1,19.000000,0,0,30.0000,True,False,True,False,False,False,False,True
888,0,29.444268,1,2,23.4500,True,False,False,False,True,False,False,True
889,1,26.000000,0,0,30.0000,False,True,True,False,False,True,False,False


# Seperate out data into train and test sets

In [14]:
X = df.drop(columns="Age").values
y = df["Age"].values

In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=517,
)

In [16]:
feature_order = list(df.columns)

In [17]:
del df

# Fit Models and view results

In [18]:
# Find best parameters for model
param_grid = {
    "max_depth": list(range(1, 10)),
    'criterion': ["mse", "friedman_mse", "mae"]
}

model, best_params = optimize_model_grid(
    model=DecisionTreeRegressor(),
    X_train=X_train,
    y_train=y_train,
    param_grid=param_grid,
    scoring="r2"
)

Tuned Parameters: {'criterion': 'mse', 'max_depth': 4}
Best score on trained data was 0.195198


In [19]:
model.fit(X,y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=4,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [20]:
model_name = repr(model).split("(")[0]

In [21]:
model_analysis = RegressionAnalysis(dataset_name=dataset_name,
                                    model=model,
                                    model_name=model_name,
                                    feature_order=feature_order,
                                    target_feature="Age",
                                    pred_funcs_dict={"Predict function":model.predict},
                                    notebook_mode=notebook_mode,
                                    df_features=df_features)

In [22]:
model_analysis.perform_analysis(X=X_train,
                                y=y_train,
                                dataset_name="Train Data",
                                regression_error_analysis=True,
                                regression_correct_analysis=True,
                                display_visuals=False,
                                mse_score=.2)



------------------------------Train Data------------------------------


**********Generating graphs for when the model predicted incorrectly**********

Your model predicted everything correctly for this dataset! No correct analysis needed!
Also sorry for your model...zero correct? Dam...


In [23]:
for pred_name in model_analysis.get_predictions_names():
    for mse_score in [.2]:
        print(f"Prediction name: {pred_name} with mse_score greater than {mse_score}")
        infile = open(model_analysis.folder_path + "Train Data" + f"/{pred_name}/MSE score greater than {mse_score}/Train Data/_Extras/Statistics/Stat methods of features dataframes.pkl",'rb')
        stat_methods_dict = pickle.load(infile)
        infile.close()

        for stats_method in stat_methods_dict.keys():
            print(stats_method)
            display(stat_methods_dict[stats_method].round(6))
            all_feature_relationship = set()
            for feature_relationship in stat_methods_dict[stats_method][:10].index.to_list():
                for feature in feature_relationship.split(" compared to "):
                    all_feature_relationship.add(feature)
            print(all_feature_relationship)
            print("-----" * 12 + "\n\n")

        del stat_methods_dict

Prediction name: Predict function with mse_score greater than 0.2
Kolmogorov-Smirnov statistic


Unnamed: 0,mean,std,var
Age compared to Fare,0.212014,0.328921,0.108189
Age compared to Pclass_3,0.278175,0.374847,0.140511
Age compared to Pclass_1,0.336038,0.410861,0.168807
Age compared to SibSp,0.521157,0.448616,0.201257
Age compared to Survived,0.582191,0.340163,0.115711
Age compared to Sex_female,0.592127,0.374976,0.140607
Age compared to Sex_male,0.592127,0.374976,0.140607
Age compared to Parch,0.592253,0.492108,0.24217
Age compared to Embarked_Southampton,0.790435,0.278766,0.077711
Age compared to Embarked_Cherbourg,0.800117,0.330257,0.10907


{'Pclass_1', 'Parch', 'Fare', 'Embarked_Cherbourg', 'Survived', 'Sex_male', 'Embarked_Southampton', 'SibSp', 'Pclass_3', 'Age', 'Sex_female'}
------------------------------------------------------------




In [24]:
model_analysis.perform_analysis(X=X_test,
                                y=y_test,
                                dataset_name="Test Data",
                                regression_error_analysis=True,
                                regression_correct_analysis=True,
                                display_visuals=False,
                                mse_score=.2)



------------------------------Test Data------------------------------


**********Generating graphs for when the model predicted incorrectly**********



**********Generating graphs for when the model predicted correctly**********



In [25]:
for pred_name in model_analysis.get_predictions_names():
    for mse_score in [.2]:
        print(f"Prediction name: {pred_name} with mse_score less than {mse_score}")
        infile = open(model_analysis.folder_path + "Test Data" + f"/{pred_name}/MSE score less than {mse_score}/Test Data/_Extras/Statistics/Stat methods of features dataframes.pkl",'rb')
        stat_methods_dict = pickle.load(infile)
        infile.close()

        for stats_method in stat_methods_dict.keys():
            print(stats_method)
            display(stat_methods_dict[stats_method].round(6))
            all_feature_relationship = set()
            for feature_relationship in stat_methods_dict[stats_method][:10].index.to_list():
                for feature in feature_relationship.split(" compared to "):
                    all_feature_relationship.add(feature)
            print(all_feature_relationship)
            print("-----" * 12 + "\n\n")

        del stat_methods_dict

Prediction name: Predict function with mse_score less than 0.2
Kolmogorov-Smirnov statistic


Unnamed: 0,mean,std,var
Age compared to Fare,0.38255,0.479251,0.229682
Age compared to SibSp,0.443631,0.44533,0.198319
Age compared to Parch,0.510756,0.375254,0.140815
Age compared to Pclass_1,0.647242,0.383245,0.146877
Age compared to Pclass_3,0.680368,0.368492,0.135786
Age compared to Sex_female,0.866656,0.208564,0.043499
Age compared to Sex_male,0.866656,0.208564,0.043499
Age compared to Embarked_Southampton,0.907629,0.146574,0.021484
Age compared to Survived,1.0,0.0,0.0
Age compared to Pclass_2,1.0,0.0,0.0


{'Pclass_1', 'Sex_female', 'Fare', 'Sex_male', 'Survived', 'Embarked_Southampton', 'SibSp', 'Pclass_3', 'Pclass_2', 'Age', 'Parch'}
------------------------------------------------------------


