In [2]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib as mpl
import scipy as sp
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

### Dataset data/heart.csv

Its features are:

* age : Age of the patient
* sex : Sex of the patient
* exng: exercise induced angina (1 = yes; 0 = no)
* ca: number of major vessels (0-3)
* cp : Chest Pain type chest pain type:
    * Value 1: typical angina
    * Value 2: atypical angina
    * Value 3: non-anginal pain
    * Value 4: asymptomatic
* trtbps : resting blood pressure (in mm Hg)
* chol : cholestoral in mg/dl fetched via BMI sensor
* fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
* rest_ecg : resting electrocardiographic results:
    * Value 0: normal
    * Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation depression of > 0.05 mV)
    * Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
* thalach : maximum heart rate achieved
* zip: zip code of the patient

In [4]:
heart = pd.read_csv('../data/heart/heart.csv', sep=",")

In [5]:
heart

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,zip
0,63,1.0,3.0,145.0,233.0,,0.0,150.0,0.0,2.3,0.0,0.0,1,26100
1,37,1.0,2.0,130.0,250.0,0.0,1.0,187.0,0.0,3.5,0.0,0.0,2,26026
2,41,0.0,1.0,130.0,,0.0,0.0,172.0,0.0,1.4,2.0,0.0,2,26030
3,56,1.0,1.0,120.0,236.0,0.0,1.0,178.0,0.0,0.8,2.0,0.0,2,26026
4,57,0.0,0.0,120.0,354.0,0.0,1.0,163.0,1.0,0.6,2.0,0.0,2,20133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0.0,0.0,140.0,241.0,0.0,1.0,123.0,1.0,0.2,1.0,0.0,3,26026
299,45,1.0,3.0,110.0,264.0,0.0,1.0,132.0,0.0,1.2,1.0,0.0,3,25100
300,68,1.0,0.0,144.0,193.0,1.0,1.0,141.0,0.0,3.4,1.0,2.0,3,25100
301,57,1.0,0.0,130.0,131.0,0.0,1.0,115.0,1.0,1.2,1.0,1.0,3,26026


In [6]:
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       297 non-null    float64
 2   cp        302 non-null    float64
 3   trtbps    302 non-null    float64
 4   chol      298 non-null    float64
 5   fbs       293 non-null    float64
 6   restecg   302 non-null    float64
 7   thalachh  300 non-null    float64
 8   exng      302 non-null    float64
 9   oldpeak   302 non-null    float64
 10  slp       302 non-null    float64
 11  caa       302 non-null    float64
 12  thall     303 non-null    int64  
 13  zip       303 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 33.3 KB


In [22]:
# Drop the not-assigned rows with dropna -> in particular, all rows with a number of NA-values >= thresh
# thresh is set to the max number of NA-values a row has in the dataset
# Practically, it only removes a line with 11 NA-values
heart = heart.dropna(axis=0, thresh = heart.isnull().sum(axis=1).max())

heart

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,zip
0,63,1.0,3.0,145.0,233.0,,0.0,150.0,0.0,2.3,0.0,0.0,1,26100
1,37,1.0,2.0,130.0,250.0,0.0,1.0,187.0,0.0,3.5,0.0,0.0,2,26026
2,41,0.0,1.0,130.0,,0.0,0.0,172.0,0.0,1.4,2.0,0.0,2,26030
3,56,1.0,1.0,120.0,236.0,0.0,1.0,178.0,0.0,0.8,2.0,0.0,2,26026
4,57,0.0,0.0,120.0,354.0,0.0,1.0,163.0,1.0,0.6,2.0,0.0,2,20133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0.0,0.0,140.0,241.0,0.0,1.0,123.0,1.0,0.2,1.0,0.0,3,26026
299,45,1.0,3.0,110.0,264.0,0.0,1.0,132.0,0.0,1.2,1.0,0.0,3,25100
300,68,1.0,0.0,144.0,193.0,1.0,1.0,141.0,0.0,3.4,1.0,2.0,3,25100
301,57,1.0,0.0,130.0,131.0,0.0,1.0,115.0,1.0,1.2,1.0,1.0,3,26026


In [48]:
# We then define the Column Transformer
# For every column think:
# - does it have null values? -> It needs an Imputer, in particular SimpleImputer with strategy mean, most_frequent or other...
# - Is it categorical? -> It needs an Encoder, either OrdinalEncoder or OneHotEncoder
# - Is it numeric? -> It needs Scaling, either MinMaxScaler or StandardScaler

# Note that the columns specified as 3rd value of the tuples should only appear once in the ColumnTransformer. Use Pipeline to apply multiples
# (This is only required to keep column names)

pipeline1 = Pipeline([
    ('imputing', SimpleImputer(strategy='mean')),
    ('scaling', StandardScaler())
])

heart_ct = ColumnTransformer(
    transformers=[
        ('simple_imputing_1', SimpleImputer(strategy='most_frequent'), ['sex']),
        ('simple_imputing_2', SimpleImputer(strategy='mean'), ['fbs']), # + chol and thalachh, in pipeline
        ('one_hot_encoding', OneHotEncoder(categories='auto', drop='first', handle_unknown='ignore'), ['restecg', 'cp', 'zip']),
        ('minmax_scaling', MinMaxScaler(), ['age', 'slp', 'caa', 'thall']),
        ('standard_scaling', StandardScaler(), ['trtbps', 'oldpeak']), # + chol and thalachh, in pipeline
        ('imput+scale', pipeline1, ['chol', 'thalachh'])
    ],
    verbose_feature_names_out=False,
)

In [49]:
heart_ct.fit(heart)

In [50]:
processed_heart = pd.DataFrame(heart_ct.transform(heart), columns=heart_ct.get_feature_names_out())

processed_heart

Unnamed: 0,sex,fbs,restecg_1.0,restecg_2.0,cp_1.0,cp_2.0,cp_3.0,zip_25100,zip_26026,zip_26030,zip_26100,age,slp,caa,thall,trtbps,oldpeak,chol,thalachh
0,1.00000,0.14676,0.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.70833,0.00000,0.00000,0.33333,0.76149,1.08402,-0.27192,0.00892
1,1.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.16667,0.00000,0.00000,0.66667,-0.09405,2.11893,0.05835,1.63148
2,0.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.25000,1.00000,0.00000,0.66667,-0.09405,0.30784,0.00000,0.97368
3,1.00000,0.00000,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.56250,1.00000,0.00000,0.66667,-0.66442,-0.20961,-0.21364,1.23680
4,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.58333,1.00000,0.00000,0.66667,-0.66442,-0.38209,2.07882,0.57901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.58333,0.50000,0.00000,1.00000,0.47631,-0.72706,-0.11650,-1.17512
298,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000,1.00000,1.00000,0.00000,0.00000,0.00000,0.33333,0.50000,0.00000,1.00000,-1.23478,0.13536,0.33033,-0.78044
299,1.00000,1.00000,1.00000,0.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,0.81250,0.50000,0.50000,1.00000,0.70446,2.03268,-1.04902,-0.38576
300,1.00000,0.00000,1.00000,0.00000,0.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.58333,0.50000,0.25000,1.00000,-0.09405,0.13536,-2.25353,-1.52594


In [51]:
processed_heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sex          302 non-null    float64
 1   fbs          302 non-null    float64
 2   restecg_1.0  302 non-null    float64
 3   restecg_2.0  302 non-null    float64
 4   cp_1.0       302 non-null    float64
 5   cp_2.0       302 non-null    float64
 6   cp_3.0       302 non-null    float64
 7   zip_25100    302 non-null    float64
 8   zip_26026    302 non-null    float64
 9   zip_26030    302 non-null    float64
 10  zip_26100    302 non-null    float64
 11  age          302 non-null    float64
 12  slp          302 non-null    float64
 13  caa          302 non-null    float64
 14  thall        302 non-null    float64
 15  trtbps       302 non-null    float64
 16  oldpeak      302 non-null    float64
 17  chol         302 non-null    float64
 18  thalachh     302 non-null    float64
dtypes: float