In [2]:
# 1. Import the required libraries

import pandas as pd
import numpy as np
import matplotlib

In [13]:
# 2. Set the columns

column_headers = [
    "Age",
    "Sex",
    "ChestPainType",
    "BloodPressure",
    "SerumCholestrol",
    "FastingBloodSugarOver120",
    "ECGAtRest",
    "HasExerciseInducedAngina",
    "MaxHeartRate",
    "STDepression",
    "STSlope",
    "FlouroscopyMajorVessels",
    "Thal",
    "Diagnosis"
]

In [14]:
# 3. Load in the first data-set (Cleveland)

df1 = pd.read_csv(
    "../data-sets/processed.cleveland.data",
    names=column_headers
)

df1.head(5)

Unnamed: 0,Age,Sex,ChestPainType,BloodPressure,SerumCholestrol,FastingBloodSugarOver120,ECGAtRest,HasExerciseInducedAngina,MaxHeartRate,STDepression,STSlope,FlouroscopyMajorVessels,Thal,Diagnosis
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [38]:
# 4. Let's set the dtypes

df1.astype({
    'Age':'int',
    'Sex':'int',
    'ChestPainType':'int',
    'FastingBloodSugarOver120':'bool',
    'ECGAtRest':'int',
    'HasExerciseInducedAngina':'bool',
    'MaxHeartRate':'int',
    'STSlope':'int',
    'Diagnosis':'bool'
})

Unnamed: 0,Age,Sex,ChestPainType,BloodPressure,SerumCholestrol,FastingBloodSugarOver120,ECGAtRest,HasExerciseInducedAngina,MaxHeartRate,STDepression,STSlope,FlouroscopyMajorVessels,Thal,Diagnosis
0,63,1,1,145.0,233.0,True,2,True,0,2.3,3,0.0,6.0,False
1,67,1,4,160.0,286.0,False,2,True,1,1.5,2,3.0,3.0,True
2,67,1,4,120.0,229.0,False,2,True,1,2.6,2,2.0,7.0,True
3,37,1,3,130.0,250.0,False,0,True,0,3.5,3,0.0,3.0,False
4,41,0,2,130.0,204.0,False,2,True,0,1.4,1,0.0,3.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,1,110.0,264.0,False,0,True,0,1.2,2,0.0,7.0,True
299,68,1,4,144.0,193.0,True,0,True,0,3.4,2,2.0,7.0,True
300,57,1,4,130.0,131.0,False,0,True,1,1.2,2,1.0,7.0,True
301,57,0,2,130.0,236.0,False,2,True,0,0.0,2,1.0,3.0,True


In [39]:
# 5. Looks like "Thal" doesn't like being assigned as an integer.
#    In other data-sets, several columns had '?' as a value, lets
#    see if it's so here.

df1.loc[df1['Thal'] == '?']

Unnamed: 0,Age,Sex,ChestPainType,BloodPressure,SerumCholestrol,FastingBloodSugarOver120,ECGAtRest,HasExerciseInducedAngina,MaxHeartRate,STDepression,STSlope,FlouroscopyMajorVessels,Thal,Diagnosis
87,53.0,0.0,3.0,128.0,216.0,0.0,2.0,115.0,0.0,0.0,1.0,0.0,?,0
266,52.0,1.0,4.0,128.0,204.0,1.0,0.0,156.0,1.0,1.0,2.0,0.0,?,2


In [42]:
# 6. Yes there is. Get rid of both of them.

df1_1 = df1.drop(axis=1, index=[87,266])

In [61]:
df1_1 = df1_1.astype({
    'Age':'int',
    'Sex':'int',
    'ChestPainType':'int',
    'FastingBloodSugarOver120':'bool',
    'ECGAtRest':'int',
    'HasExerciseInducedAngina':'bool',
    'MaxHeartRate':'int',
    'STSlope':'int',
    'Thal':'float',
    'Diagnosis':'bool'
})

In [62]:
df1_1 = df1_1.astype({
    'Age':'int',
    'Sex':'int',
    'ChestPainType':'int',
    'FastingBloodSugarOver120':'bool',
    'ECGAtRest':'int',
    'HasExerciseInducedAngina':'bool',
    'MaxHeartRate':'int',
    'STSlope':'int',
    'Thal':'int',
    'Diagnosis':'bool'
})

In [63]:
df1_1.head(5)

Unnamed: 0,Age,Sex,ChestPainType,BloodPressure,SerumCholestrol,FastingBloodSugarOver120,ECGAtRest,HasExerciseInducedAngina,MaxHeartRate,STDepression,STSlope,FlouroscopyMajorVessels,Thal,Diagnosis
0,63,1,1,145.0,233.0,True,2,True,0,2.3,3,0.0,6,False
1,67,1,4,160.0,286.0,False,2,True,1,1.5,2,3.0,3,True
2,67,1,4,120.0,229.0,False,2,True,1,2.6,2,2.0,7,True
3,37,1,3,130.0,250.0,False,0,True,0,3.5,3,0.0,3,False
4,41,0,2,130.0,204.0,False,2,True,0,1.4,1,0.0,3,False


In [64]:
df1.loc[df1['FlouroscopyMajorVessels'] == '?']

Unnamed: 0,Age,Sex,ChestPainType,BloodPressure,SerumCholestrol,FastingBloodSugarOver120,ECGAtRest,HasExerciseInducedAngina,MaxHeartRate,STDepression,STSlope,FlouroscopyMajorVessels,Thal,Diagnosis
166,52.0,1.0,3.0,138.0,223.0,0.0,0.0,169.0,0.0,0.0,1.0,?,3.0,0
192,43.0,1.0,4.0,132.0,247.0,1.0,2.0,143.0,1.0,0.1,2.0,?,7.0,1
287,58.0,1.0,2.0,125.0,220.0,0.0,0.0,144.0,0.0,0.4,2.0,?,7.0,0
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,?,3.0,0


In [65]:
df1_2 = df1_1.drop(axis=1, index=[166,192,287,302])

In [66]:
df1_2.head()

Unnamed: 0,Age,Sex,ChestPainType,BloodPressure,SerumCholestrol,FastingBloodSugarOver120,ECGAtRest,HasExerciseInducedAngina,MaxHeartRate,STDepression,STSlope,FlouroscopyMajorVessels,Thal,Diagnosis
0,63,1,1,145.0,233.0,True,2,True,0,2.3,3,0.0,6,False
1,67,1,4,160.0,286.0,False,2,True,1,1.5,2,3.0,3,True
2,67,1,4,120.0,229.0,False,2,True,1,2.6,2,2.0,7,True
3,37,1,3,130.0,250.0,False,0,True,0,3.5,3,0.0,3,False
4,41,0,2,130.0,204.0,False,2,True,0,1.4,1,0.0,3,False


In [69]:
df1_2 = df1_2.astype({'FlouroscopyMajorVessels':'float'})
df1_2 = df1_2.astype({'FlouroscopyMajorVessels':'int'})

In [70]:
df1_2.head()

Unnamed: 0,Age,Sex,ChestPainType,BloodPressure,SerumCholestrol,FastingBloodSugarOver120,ECGAtRest,HasExerciseInducedAngina,MaxHeartRate,STDepression,STSlope,FlouroscopyMajorVessels,Thal,Diagnosis
0,63,1,1,145.0,233.0,True,2,True,0,2.3,3,0,6,False
1,67,1,4,160.0,286.0,False,2,True,1,1.5,2,3,3,True
2,67,1,4,120.0,229.0,False,2,True,1,2.6,2,2,7,True
3,37,1,3,130.0,250.0,False,0,True,0,3.5,3,0,3,False
4,41,0,2,130.0,204.0,False,2,True,0,1.4,1,0,3,False


In [72]:
df1_2 = df1_2.astype({'BloodPressure':'float', 'SerumCholestrol':'float', 'STDepression':'float'})

In [73]:
df1_2.head()

Unnamed: 0,Age,Sex,ChestPainType,BloodPressure,SerumCholestrol,FastingBloodSugarOver120,ECGAtRest,HasExerciseInducedAngina,MaxHeartRate,STDepression,STSlope,FlouroscopyMajorVessels,Thal,Diagnosis
0,63,1,1,145.0,233.0,True,2,True,0,2.3,3,0,6,False
1,67,1,4,160.0,286.0,False,2,True,1,1.5,2,3,3,True
2,67,1,4,120.0,229.0,False,2,True,1,2.6,2,2,7,True
3,37,1,3,130.0,250.0,False,0,True,0,3.5,3,0,3,False
4,41,0,2,130.0,204.0,False,2,True,0,1.4,1,0,3,False


In [None]:
df_cleveland_clean