In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statistics
from dateutil.relativedelta import relativedelta

In [2]:
# read the csv
vascular_dataset_df = pd.read_csv("Raw_Resources/cardio-dataset.csv")
vascular_dataset_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [3]:
# Confirm there are an equal number of values for each column
vascular_dataset_df.count()

id             70000
age            70000
gender         70000
height         70000
weight         70000
ap_hi          70000
ap_lo          70000
cholesterol    70000
gluc           70000
smoke          70000
alco           70000
active         70000
cardio         70000
dtype: int64

In [4]:
vascular_dataset_df = vascular_dataset_df.rename(columns={'id':'ID',
                                                          'age':'Age (yrs)', 'gender':'Gender',
                                                          'height':'Height (cm)',
                                                          'weight':'Weight (kg)', 'ap_hi':'Systolic BP',
                                                         'ap_lo':'Diastolic BP',
                                                          'cholesterol':'Cholesterol',
                                                         'gluc':'Glucose',
                                                         'smoke':'Smoking',
                                                         'alco':'Alcohol',
                                                         'active':'Physical Activity',
                                                         'cardio':'Cardiovascular Disease'})
vascular_dataset_df.head()

Unnamed: 0,ID,Age (yrs),Gender,Height (cm),Weight (kg),Systolic BP,Diastolic BP,Cholesterol,Glucose,Smoking,Alcohol,Physical Activity,Cardiovascular Disease
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [5]:
# Confirm the data types are correct
vascular_dataset_df.dtypes

ID                          int64
Age (yrs)                   int64
Gender                      int64
Height (cm)                 int64
Weight (kg)               float64
Systolic BP                 int64
Diastolic BP                int64
Cholesterol                 int64
Glucose                     int64
Smoking                     int64
Alcohol                     int64
Physical Activity           int64
Cardiovascular Disease      int64
dtype: object

In [6]:
vascular_dataset_df['Gender'] = vascular_dataset_df['Gender'].astype(str)
vascular_dataset_df['Cholesterol'] = vascular_dataset_df['Cholesterol'].astype(str)
vascular_dataset_df['Glucose'] = vascular_dataset_df['Glucose'].astype(str)
vascular_dataset_df['Smoking'] = vascular_dataset_df['Smoking'].astype(str)
vascular_dataset_df['Alcohol'] = vascular_dataset_df['Alcohol'].astype(str)
vascular_dataset_df['Physical Activity'] = vascular_dataset_df['Physical Activity'].astype(str)
vascular_dataset_df['Cardiovascular Disease'] = vascular_dataset_df['Cardiovascular Disease'].astype(str)

In [7]:
vascular_dataset_df.dtypes

ID                          int64
Age (yrs)                   int64
Gender                     object
Height (cm)                 int64
Weight (kg)               float64
Systolic BP                 int64
Diastolic BP                int64
Cholesterol                object
Glucose                    object
Smoking                    object
Alcohol                    object
Physical Activity          object
Cardiovascular Disease     object
dtype: object

In [8]:
# change age to years
vascular_dataset_df['Age (yrs)'] = round(vascular_dataset_df['Age (yrs)'] / 365,0)
vascular_dataset_df['Gender'] = vascular_dataset_df['Gender'].replace({'1':'male', '2':'female'})
vascular_dataset_df['Cholesterol'] = vascular_dataset_df['Cholesterol'].replace({'1':'normal','2':'above normal','3':'well above normal'})
vascular_dataset_df['Glucose'] = vascular_dataset_df['Glucose'].replace({'1':'normal','2':'above normal','3':'well above normal'})
vascular_dataset_df['Smoking'] = vascular_dataset_df['Smoking'].replace({'0':'no', '1':'yes'})
vascular_dataset_df['Alcohol'] = vascular_dataset_df['Alcohol'].replace({'0':'no', '1':'yes'})
vascular_dataset_df['Physical Activity'] = vascular_dataset_df['Physical Activity'].replace({'0':'no', '1':'yes'})
vascular_dataset_df['Cardiovascular Disease'] = vascular_dataset_df['Cardiovascular Disease'].replace({'0':'no', '1':'yes'})


vascular_dataset_df.head()

Unnamed: 0,ID,Age (yrs),Gender,Height (cm),Weight (kg),Systolic BP,Diastolic BP,Cholesterol,Glucose,Smoking,Alcohol,Physical Activity,Cardiovascular Disease
0,0,50.0,female,168,62.0,110,80,normal,normal,no,no,yes,no
1,1,55.0,male,156,85.0,140,90,well above normal,normal,no,no,yes,yes
2,2,52.0,male,165,64.0,130,70,well above normal,normal,no,no,no,yes
3,3,48.0,female,169,82.0,150,100,normal,normal,no,no,yes,yes
4,4,48.0,male,156,56.0,100,60,normal,normal,no,no,no,no


In [9]:
vascular_dataset_df.to_csv('Clean_Resources/cardiovascular.csv')