In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
%load_ext kedro.ipython

The kedro.ipython extension is already loaded. To reload it, use:
  %reload_ext kedro.ipython


In [12]:
# `catalog` is not defined anywhere in this notebook; it is expected to be
# injected by the kedro.ipython extension loaded above. Displaying it here
# confirms the name is available before any dataset is loaded.
catalog

[1m<[0m[1;95mkedro.io.data_catalog.DataCatalog[0m[39m object at [0m[1;36m0x000002112F77F490[0m[1m>[0m

In [13]:
# Lift pandas' column cap so wide DataFrames are rendered with every column
pd.options.display.max_columns = None

In [14]:
# Load the "preprocessed_data" dataset through the Kedro catalog, then end the
# cell with the bare name so the frame is rendered as the cell output.
df = catalog.load("preprocessed_data")
df

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,0-40,1,2,140,high,0,0,172,0,0.0,1,0
1,41-50,0,3,160,normal,0,0,156,0,1.0,2,1
2,0-40,1,2,130,high,0,1,98,0,0.0,1,0
3,41-50,0,4,138,borderline high,0,0,108,1,1.5,2,1
4,51-60,1,3,150,normal,0,0,122,0,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
739,41-50,1,1,110,high,0,0,132,0,1.2,2,1
740,61-70,1,4,144,normal,1,0,141,0,3.4,2,1
741,51-60,1,4,130,normal,0,0,115,1,1.2,2,1
742,51-60,0,2,130,borderline high,0,2,174,0,0.0,2,1


In [15]:
def feature_extraction(df, columns=None):
    """One-hot encode the categorical columns of a DataFrame.

    Every column named in ``columns`` is removed and replaced by one ``int64``
    indicator column per distinct value found in it, named
    ``<column>_<value>`` (for example ``age_0-40`` or ``ST slope_1``).

    The set of indicator columns depends on the values present in ``df``:
    a category that does not occur in the data produces no column, so two
    frames with different value sets yield different output schemas.

    Args:
        df: DataFrame that contains every column listed in ``columns``.
        columns: Labels of the columns to encode. When omitted, the default
            set is ``age``, ``cholesterol``, ``chest pain type``,
            ``resting ecg`` and ``ST slope``.

    Returns:
        The encoded DataFrame. Columns that are not encoded come first in
        their original order, followed by the indicator columns grouped in
        the order given by ``columns``. The input ``df`` is not modified.

    Raises:
        KeyError: If any label in ``columns`` is missing from ``df``.
    """
    if columns is None:
        # `sex`, `fasting blood sugar` and `exercise angina` are already 0/1
        # and the remaining columns are numeric, so only these are encoded.
        columns = [
            "age",
            "cholesterol",
            "chest pain type",
            "resting ecg",
            "ST slope",
        ]

    # A single call encodes all columns in one pass instead of rebuilding the
    # frame once per column; the resulting column order is the same as
    # chaining one get_dummies call per column in list order.
    return pd.get_dummies(df, columns=list(columns), dtype="int64")


# Encode the loaded frame and rebind `df` to the result.
# NOTE(review): because `df` is overwritten, this cell is not idempotent —
# re-running it without first re-running the load cell will likely fail, since
# the source columns (e.g. "age") no longer exist after encoding.
df = feature_extraction(df)

In [16]:
# Inspect the encoded frame returned by feature_extraction
df

Unnamed: 0,sex,resting bp s,fasting blood sugar,max heart rate,exercise angina,oldpeak,target,age_0-40,age_41-50,age_51-60,age_61-70,age_71-80,cholesterol_borderline high,cholesterol_high,cholesterol_normal,chest pain type_1,chest pain type_2,chest pain type_3,chest pain type_4,resting ecg_0,resting ecg_1,resting ecg_2,ST slope_0,ST slope_1,ST slope_2,ST slope_3
0,1,140,0,172,0,0.0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0
1,0,160,0,156,0,1.0,1,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0
2,1,130,0,98,0,0.0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0
3,0,138,0,108,1,1.5,1,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0
4,1,150,0,122,0,0.0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,1,110,0,132,0,1.2,1,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0
740,1,144,1,141,0,3.4,1,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0
741,1,130,0,115,1,1.2,1,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0
742,0,130,0,174,0,0.0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0


In [17]:
# Check data types: the one-hot indicator columns should all be int64, the
# dtype requested from get_dummies inside feature_extraction
df.dtypes


sex                              int64
resting bp s                     int64
fasting blood sugar              int64
max heart rate                   int64
exercise angina                  int64
oldpeak                        float64
target                           int64
age_0-40                         int64
age_41-50                        int64
age_51-60                        int64
age_61-70                        int64
age_71-80                        int64
cholesterol_borderline high      int64
cholesterol_high                 int64
cholesterol_normal               int64
chest pain type_1                int64
chest pain type_2                int64
chest pain type_3                int64
chest pain type_4                int64
resting ecg_0                    int64
resting ecg_1                    int64
resting ecg_2                    int64
ST slope_0                       int64
ST slope_1                       int64
ST slope

In [18]:
# List the column labels of the encoded frame
df.columns


Index(['sex', 'resting bp s', 'fasting blood sugar', 'max heart rate',
       'exercise angina', 'oldpeak', 'target', 'age_0-40', 'age_41-50',
       'age_51-60', 'age_61-70', 'age_71-80', 'cholesterol_borderline high',
       'cholesterol_high', 'cholesterol_normal', 'chest pain type_1',
       'chest pain type_2', 'chest pain type_3', 'chest pain type_4',
       'resting ecg_0', 'resting ecg_1', 'resting ecg_2', 'ST slope_0',
       'ST slope_1', 'ST slope_2', 'ST slope_3'],
      dtype='object')

In [19]:
# Load the "aligned_features" dataset through the Kedro catalog.
# NOTE(review): presumably the pipeline-produced counterpart of the frame
# encoded above, loaded for comparison — confirm against the catalog entry.
aligned_features_df = catalog.load("aligned_features")

In [20]:
# Show the data types of the catalog's aligned features
aligned_features_df.dtypes


sex                              int64
resting bp s                     int64
fasting blood sugar              int64
max heart rate                   int64
exercise angina                  int64
oldpeak                        float64
target                           int64
age_0-40                         int64
age_41-50                        int64
age_51-60                        int64
age_61-70                        int64
age_71-80                        int64
cholesterol_borderline high      int64
cholesterol_high                 int64
cholesterol_normal               int64
chest pain type_1                int64
chest pain type_2                int64
chest pain type_3                int64
chest pain type_4                int64
resting ecg_0                    int64
resting ecg_1                    int64
resting ecg_2                    int64
ST slope_0                       int64
ST slope_1                       int64
ST slope