# Standardisierung

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [17]:
heartData = pd.read_csv("heart-failure-prediction/heart.csv")
heartData.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


## Umwandlung der Attribute in nummerische Werte

In [18]:
# Umwandlung der Spalte "Sex" in nummerischen Wert
heartData["Sex"] = heartData["Sex"].apply(lambda x: 0 if x == "M" else 1)

# Umwandlung der Spalte "ST_Slope" in nummerischen Wert
heartData["ST_Slope"] = heartData["ST_Slope"].apply(lambda x: 0 if x == "Flat" else (1 if x == "Up" else 2))

# Umwandlung der Spalte "ChestPainType" in nummerischen Wert
heartData["ChestPainType"] = heartData["ChestPainType"].apply(lambda x: 0 if x == "TA" else (1 if x == "ATA" else (2 if x == "NAP" else 3)))

# Umwandlung der Spalte "RestingECG" in nummerischen Wert
heartData["RestingECG"] = heartData["RestingECG"].apply(lambda x: 0 if x == "Normal" else (1 if x == "ST" else 2))

# Umwandlung der Spalte "ExerciseAngina" in nummerischen Wert
heartData["ExerciseAngina"] = heartData["ExerciseAngina"].apply(lambda x: 0 if x == "N" else 1)

heartData

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,1,140,289,0,0,172,0,0.0,1,0
1,49,1,2,160,180,0,0,156,0,1.0,0,1
2,37,0,1,130,283,0,1,98,0,0.0,1,0
3,48,1,3,138,214,0,0,108,1,1.5,0,1
4,54,0,2,150,195,0,0,122,0,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,0,0,110,264,0,0,132,0,1.2,0,1
914,68,0,3,144,193,1,0,141,0,3.4,0,1
915,57,0,3,130,131,0,0,115,1,1.2,0,1
916,57,1,1,130,236,0,2,174,0,0.0,0,1


In [19]:
# Attribute Werte-Anzahl
features = heartData.drop("HeartDisease", axis=1).copy()
{feature: len(features[feature].unique()) for feature in features.columns}

{'Age': 50,
 'Sex': 2,
 'ChestPainType': 4,
 'RestingBP': 67,
 'Cholesterol': 222,
 'FastingBS': 2,
 'RestingECG': 3,
 'MaxHR': 119,
 'ExerciseAngina': 2,
 'Oldpeak': 53,
 'ST_Slope': 3}

In [20]:
# Entfernen unangegebener Cholersterin Beobachtungen und Frauen
heartData = heartData[(heartData["Sex"] != "F") & (heartData["Cholesterol"] != 0)]

In [21]:
# Nummerische und Kategorie Attribute rausfiltern
numericFeatures = ["Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak"]
categoricalFeatures = ["ChestPainType", "RestingECG", "ST_Slope"]
numericHeartData = heartData.loc[:,numericFeatures].copy()

## Aufspalten der Kategorie-Attribute

In [22]:
#pd.get_dummies(features["ST_Slope"], prefix="ST")

categoricalZip = dict(zip(categoricalFeatures,["CP","ECG","ST"]))
for column, prefix in categoricalZip.items():
    dummies = pd.get_dummies(heartData[column], prefix=prefix)
    heartData = pd.concat([heartData, dummies], axis=1)
    heartData = heartData.drop(column, axis=1)
heartData.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,CP_0,CP_1,CP_2,CP_3,ECG_0,ECG_1,ECG_2,ST_0,ST_1,ST_2
0,40,0,140,289,0,172,0,0.0,0,False,True,False,False,True,False,False,False,True,False
1,49,1,160,180,0,156,0,1.0,1,False,False,True,False,True,False,False,True,False,False
2,37,0,130,283,0,98,0,0.0,0,False,True,False,False,False,True,False,False,True,False
3,48,1,138,214,0,108,1,1.5,1,False,False,False,True,True,False,False,True,False,False
4,54,0,150,195,0,122,0,0.0,0,False,False,True,False,True,False,False,False,True,False


## Skalierung

In [24]:
X = heartData.drop("HeartDisease", axis=1).copy()
# Skalieren von 0 bis 1
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)
X

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,CP_0,CP_1,CP_2,CP_3,ECG_0,ECG_1,ECG_2,ST_0,ST_1,ST_2
0,0.244898,0.0,0.444444,0.393822,0.0,0.774436,0.0,0.015873,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.428571,1.0,0.629630,0.183398,0.0,0.654135,0.0,0.174603,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.183673,0.0,0.351852,0.382239,0.0,0.218045,0.0,0.015873,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.408163,1.0,0.425926,0.249035,0.0,0.293233,1.0,0.253968,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.530612,0.0,0.537037,0.212355,0.0,0.398496,0.0,0.015873,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
741,0.346939,0.0,0.166667,0.345560,0.0,0.473684,0.0,0.206349,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
742,0.816327,0.0,0.481481,0.208494,1.0,0.541353,0.0,0.555556,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
743,0.591837,0.0,0.351852,0.088803,0.0,0.345865,1.0,0.206349,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
744,0.591837,1.0,0.351852,0.291506,0.0,0.789474,0.0,0.015873,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
