In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import os

## Exploratory Data Analysis

### Mounting drive for further analysis.

In [2]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
# NOTE(review): none of the cells visible here read from or write to
# /content/drive -- the CSV files are saved to the working directory.
# Confirm the mount is still needed.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Converting raw data to .csv files for analysis and merging

In [12]:
import urllib.request as urllib
import numpy as np

# Source files: the "processed" heart-disease tables from the UCI repository,
# one per collection site.
url_cleaveland = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
url_hungarian = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data"
url_switzerland = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.switzerland.data"

# Download each file and save the bytes unchanged under a local .csv name.
# Both the HTTP response and the output file are opened as context managers so
# the connection and the file handle are released even if the transfer fails
# part-way (previously the response objects were never closed).
for source_url, csv_name in (
    (url_cleaveland, 'heart_disease_cleaveland_processed.csv'),
    (url_hungarian, 'heart_disease_hungarian_processed.csv'),
    (url_switzerland, 'heart_disease_switzerland_processed.csv'),
):
    with urllib.urlopen(source_url) as raw_data, open(csv_name, 'wb') as file:
        file.write(raw_data.read())




### Loading Datasets 

In [14]:
# Column labels applied to every site file via `names=` (the files are read
# without taking a header row from them).
heart_labels = [
    'age',
    'sex',
    'chest_pain_type',
    'resting blood pressure',
    'cholestoral',
    'fasting_blood_sugar',
    'ekg_results',
    'max_hr',
    'exercise_angina',
    'ST_depression',
    'slope_of_st',
    'vessels',
    'thallium',
    'heart_disease',
]

# Read the three site files in the order Hungarian, Cleveland, Switzerland.
# Note: the previews further down show '?' used as a missing-value marker, so
# columns containing it are loaded with object dtype rather than a numeric one.
hf, cf, sf = (
    pd.read_csv(csv_path, names=heart_labels)
    for csv_path in (
        "heart_disease_hungarian_processed.csv",
        "heart_disease_cleaveland_processed.csv",
        "heart_disease_switzerland_processed.csv",
    )
)

### Hungarian data: top and bottom 5 rows

In [5]:
# Preview the first rows of the Hungarian frame (head() shows 5 rows by default).
# NOTE(review): the saved output carries execution count 5, which predates the
# load cell (In [14]), and is identical to the Cleveland preview. Re-run after
# loading so the preview reflects the Hungarian file.
hf.head()

Unnamed: 0,age,sex,chest_pain_type,resting blood pressure,cholestoral,fasting_blood_sugar,ekg_results,max_hr,exercise_angina,ST_depression,slope_of_st,vessels,thallium,heart_disease
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [6]:
# Preview the last rows of the Hungarian frame (tail() shows 5 rows by default).
# NOTE(review): the saved output carries execution count 6, which predates the
# load cell (In [14]), and is identical to the Cleveland preview. Re-run after
# loading so the preview reflects the Hungarian file.
hf.tail()

Unnamed: 0,age,sex,chest_pain_type,resting blood pressure,cholestoral,fasting_blood_sugar,ekg_results,max_hr,exercise_angina,ST_depression,slope_of_st,vessels,thallium,heart_disease
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,?,3.0,0


### Cleveland data: top and bottom 5 rows

In [15]:
# Preview the first rows of the Cleveland frame (head() shows 5 rows by default).
cf.head()

Unnamed: 0,age,sex,chest_pain_type,resting blood pressure,cholestoral,fasting_blood_sugar,ekg_results,max_hr,exercise_angina,ST_depression,slope_of_st,vessels,thallium,heart_disease
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [16]:
# Preview the last rows of the Cleveland frame (tail() shows 5 rows by default).
cf.tail()

Unnamed: 0,age,sex,chest_pain_type,resting blood pressure,cholestoral,fasting_blood_sugar,ekg_results,max_hr,exercise_angina,ST_depression,slope_of_st,vessels,thallium,heart_disease
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,?,3.0,0


### Switzerland data: top and bottom 5 rows

In [17]:
# Preview the first rows of the Switzerland frame (head() shows 5 rows by default).
sf.head()

Unnamed: 0,age,sex,chest_pain_type,resting blood pressure,cholestoral,fasting_blood_sugar,ekg_results,max_hr,exercise_angina,ST_depression,slope_of_st,vessels,thallium,heart_disease
0,32,1,1,95,0,?,0,127,0,.7,1,?,?,1
1,34,1,4,115,0,?,?,154,0,.2,1,?,?,1
2,35,1,4,?,0,?,0,130,1,?,?,?,7,3
3,36,1,4,110,0,?,0,125,1,1,2,?,6,1
4,38,0,4,105,0,?,0,166,0,2.8,1,?,?,2


In [18]:
# Preview the last rows of the Switzerland frame (tail() shows 5 rows by default).
sf.tail()

Unnamed: 0,age,sex,chest_pain_type,resting blood pressure,cholestoral,fasting_blood_sugar,ekg_results,max_hr,exercise_angina,ST_depression,slope_of_st,vessels,thallium,heart_disease
118,70,1,4,115,0,0,1,92,1,0.0,2,?,7,1
119,70,1,4,140,0,1,0,157,1,2.0,2,?,7,3
120,72,1,3,160,0,?,2,114,0,1.6,2,2,?,0
121,73,0,3,160,0,0,1,121,0,0.0,1,?,3,1
122,74,1,2,145,0,?,1,123,0,1.3,1,?,?,1


In [7]:
# Column labels of the Hungarian frame, i.e. the names assigned through
# `names=heart_labels` when the file was read.
hf.columns

Index(['age', 'sex', 'chest_pain_type', 'resting blood pressure',
       'cholestoral', 'fasting_blood_sugar', 'ekg_results', 'max_hr',
       'exercise_angina', 'ST_depression', 'slope_of_st', 'vessels',
       'thallium', 'heart_disease'],
      dtype='object')

In [8]:
# (rows, columns) of the Hungarian frame.
# NOTE(review): the saved output (303, 14) carries execution count 8 and
# disagrees with the 294 entries reported by hf.info() further down; it predates
# the load cell (In [14]). Re-run to refresh.
hf.shape

(303, 14)

In [9]:
# Summary statistics for the numeric columns of the Hungarian frame.
# NOTE(review): the saved output (count = 303) carries execution count 9, which
# predates the load cell (In [14]) and does not match the 294 entries reported
# by hf.info() further down. Re-run to refresh.
hf.describe()

Unnamed: 0,age,sex,chest_pain_type,resting blood pressure,cholestoral,fasting_blood_sugar,ekg_results,max_hr,exercise_angina,ST_depression,slope_of_st,heart_disease
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,4.0


### Hungarian Data

In [21]:
# Column dtypes and non-null counts for the Hungarian frame.
# Columns reported as `object` hold non-numeric text; presumably the same '?'
# missing-value marker visible in the Switzerland preview -- verify.
hf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   age                     294 non-null    int64  
 1   sex                     294 non-null    int64  
 2   chest_pain_type         294 non-null    int64  
 3   resting blood pressure  294 non-null    object 
 4   cholestoral             294 non-null    object 
 5   fasting_blood_sugar     294 non-null    object 
 6   ekg_results             294 non-null    object 
 7   max_hr                  294 non-null    object 
 8   exercise_angina         294 non-null    object 
 9   ST_depression           294 non-null    float64
 10  slope_of_st             294 non-null    object 
 11  vessels                 294 non-null    object 
 12  thallium                294 non-null    object 
 13  heart_disease           294 non-null    int64  
dtypes: float64(1), int64(4), object(9)
memory 

### Switzerland Data

In [19]:
# Column dtypes and non-null counts for the Switzerland frame.
# The '?' placeholders visible in sf.head()/sf.tail() are plain strings, so the
# affected columns load as `object` and still count as non-null here.
sf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123 entries, 0 to 122
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   age                     123 non-null    int64 
 1   sex                     123 non-null    int64 
 2   chest_pain_type         123 non-null    int64 
 3   resting blood pressure  123 non-null    object
 4   cholestoral             123 non-null    int64 
 5   fasting_blood_sugar     123 non-null    object
 6   ekg_results             123 non-null    object
 7   max_hr                  123 non-null    object
 8   exercise_angina         123 non-null    object
 9   ST_depression           123 non-null    object
 10  slope_of_st             123 non-null    object
 11  vessels                 123 non-null    object
 12  thallium                123 non-null    object
 13  heart_disease           123 non-null    int64 
dtypes: int64(5), object(9)
memory usage: 13.6+ KB


### Cleveland Data

In [20]:
# Column dtypes and non-null counts for the Cleveland frame.
# `vessels` and `thallium` are the only `object` columns; the Cleveland tail
# preview shows a '?' in `vessels`.
cf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   age                     303 non-null    float64
 1   sex                     303 non-null    float64
 2   chest_pain_type         303 non-null    float64
 3   resting blood pressure  303 non-null    float64
 4   cholestoral             303 non-null    float64
 5   fasting_blood_sugar     303 non-null    float64
 6   ekg_results             303 non-null    float64
 7   max_hr                  303 non-null    float64
 8   exercise_angina         303 non-null    float64
 9   ST_depression           303 non-null    float64
 10  slope_of_st             303 non-null    float64
 11  vessels                 303 non-null    object 
 12  thallium                303 non-null    object 
 13  heart_disease           303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory

### All columns are converted to the numeric float64 dtype so the three datasets can be merged.