# Project - 4 Heart Disease Prediction

In [1]:
# Import dependencies
from pathlib import Path
import pandas as pd

In [2]:
# Read the cleaned CVD dataset from the Resources folder and preview the first rows
data = Path("Resources") / "CVD_cleaned.csv"
df = pd.read_csv(data)
df.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


# Data Preprocessing

In [3]:
# Count the distinct (non-null) values in every column; columns with only a
# handful of values are the categorical ones that get encoded below
df.nunique(axis=0, dropna=True)

General_Health                     5
Checkup                            5
Exercise                           2
Heart_Disease                      2
Skin_Cancer                        2
Other_Cancer                       2
Depression                         2
Diabetes                           4
Arthritis                          2
Sex                                2
Age_Category                      13
Height_(cm)                       99
Weight_(kg)                      525
BMI                             3654
Smoking_History                    2
Alcohol_Consumption               31
Fruit_Consumption                 77
Green_Vegetables_Consumption      75
FriedPotato_Consumption           69
dtype: int64

In [4]:
# Encode the categorical answer columns as integers.
# Plain Yes/No columns share a single 1/0 mapping; 'Diabetes' and 'Sex' get their own.
yes_no = {'Yes': 1, 'No': 0}
yes_no_columns = ['Exercise', 'Heart_Disease', 'Skin_Cancer', 'Other_Cancer',
                  'Depression', 'Arthritis', 'Smoking_History']

# Nested dict of the form {column: {old_value: new_value}}, as accepted by DataFrame.replace
column_codes = {column: yes_no for column in yes_no_columns}

# 'Diabetes' has four answers: the two qualified ones receive the extra codes 2 and 3
column_codes['Diabetes'] = {**yes_no,
                            'No, pre-diabetes or borderline diabetes': 2,
                            'Yes, but female told only during pregnancy' : 3}
column_codes['Sex'] = {'Female' :1, 'Male' : 0}

df = df.replace(column_codes)

In [5]:
# Ordinal-encode 'General_Health': the position in this list is the code,
# from 0 ('Poor') up to 4 ('Excellent')
health_levels = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
category_mapping = {level: rank for rank, level in enumerate(health_levels)}

# Swap each label in the 'General_Health' column for its integer code
df['General_Health'] = df['General_Health'].replace(category_mapping)

In [6]:
# Ordinal-encode 'Checkup': the position in this list is the code,
# from 0 ('Never') up to 4 ('Within the past year')
checkup_recency = ['Never', '5 or more years ago', 'Within the past 5 years',
                   'Within the past 2 years', 'Within the past year']
category1_mapping = {answer: rank for rank, answer in enumerate(checkup_recency)}

# Swap each label in the 'Checkup' column for its integer code
df['Checkup'] = df['Checkup'].replace(category1_mapping)

In [7]:
# Collapse the 'Age_Category' brackets into coarse age groups
# (the integer codes for those groups are assigned further down in this cell)
def age_category(age):
    """Return the coarse age-group label for a single 'Age_Category' bracket.

    Parameters
    ----------
    age : str
        A bracket label such as '18-24', '45-49' or '80+'.

    Returns
    -------
    str or None
        'Young', 'Adult', 'Mid-Aged', 'Senior-Adult' or 'Elderly'.
        None when `age` is not one of the brackets listed below.
    """
    groups = (
        ('Young', ('18-24',)),
        ('Adult', ('25-29', '30-34', '35-39')),
        ('Mid-Aged', ('40-44', '45-49', '50-54')),
        ('Senior-Adult', ('55-59', '60-64', '65-69')),
        ('Elderly', ('70-74', '75-79', '80+')),
    )
    for label, brackets in groups:
        if age in brackets:
            return label
    # Unknown bracket: no group applies
    return None
# Label every row with its coarse age group
df['Age_Range'] = df['Age_Category'].apply(age_category)

# Integer code per age group: the position in the list is the code (0 = 'Young' ... 4 = 'Elderly')
age_mapping = {
    label: code
    for code, label in enumerate(['Young', 'Adult', 'Mid-Aged', 'Senior-Adult', 'Elderly'])
}

# Labels missing from age_mapping would become NaN here
df['Age_Group'] = df['Age_Range'].map(age_mapping)

df.tail(25)

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,...,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,Age_Range,Age_Group
308829,4,1,1,0,0,0,0,0,0,0,...,183.0,99.79,29.84,0,0.0,8.0,12.0,4.0,Mid-Aged,2
308830,3,2,1,0,0,0,0,0,0,1,...,168.0,70.31,25.02,1,8.0,60.0,2.0,2.0,Senior-Adult,3
308831,3,4,0,0,0,0,0,0,0,1,...,165.0,68.04,24.96,0,1.0,8.0,2.0,20.0,Adult,1
308832,1,4,0,0,0,0,0,1,1,1,...,160.0,65.77,25.69,0,0.0,60.0,30.0,0.0,Elderly,4
308833,3,4,1,0,0,0,0,3,0,1,...,163.0,72.57,27.46,0,0.0,12.0,16.0,3.0,Mid-Aged,2
308834,3,4,1,0,1,0,0,0,0,0,...,180.0,90.72,27.89,1,6.0,30.0,30.0,0.0,Mid-Aged,2
308835,4,4,1,0,0,0,0,0,0,0,...,178.0,72.57,22.96,0,3.0,10.0,20.0,5.0,Mid-Aged,2
308836,1,4,1,0,0,0,0,1,1,1,...,165.0,89.81,32.95,0,0.0,8.0,12.0,4.0,Elderly,4
308837,4,4,1,0,0,1,0,0,0,1,...,163.0,72.57,27.46,0,0.0,8.0,0.0,1.0,Mid-Aged,2
308838,4,4,1,0,0,0,0,0,0,1,...,163.0,55.79,21.11,0,0.0,30.0,30.0,3.0,Adult,1


In [8]:
# 'Age_Group' was derived above, so drop its source column 'Age_Category' and the
# intermediate 'Age_Range'. 'Height_(cm)' and 'Weight_(kg)' are dropped as well while
# 'BMI' is kept (presumably because BMI already summarises them - confirm).
redundant_columns = ['Age_Category', 'Age_Range', 'Height_(cm)', 'Weight_(kg)']
df = df.drop(columns=redundant_columns)

df.tail(25)

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,Age_Group
308829,4,1,1,0,0,0,0,0,0,0,29.84,0,0.0,8.0,12.0,4.0,2
308830,3,2,1,0,0,0,0,0,0,1,25.02,1,8.0,60.0,2.0,2.0,3
308831,3,4,0,0,0,0,0,0,0,1,24.96,0,1.0,8.0,2.0,20.0,1
308832,1,4,0,0,0,0,0,1,1,1,25.69,0,0.0,60.0,30.0,0.0,4
308833,3,4,1,0,0,0,0,3,0,1,27.46,0,0.0,12.0,16.0,3.0,2
308834,3,4,1,0,1,0,0,0,0,0,27.89,1,6.0,30.0,30.0,0.0,2
308835,4,4,1,0,0,0,0,0,0,0,22.96,0,3.0,10.0,20.0,5.0,2
308836,1,4,1,0,0,0,0,1,1,1,32.95,0,0.0,8.0,12.0,4.0,4
308837,4,4,1,0,0,1,0,0,0,1,27.46,0,0.0,8.0,0.0,1.0,2
308838,4,4,1,0,0,0,0,0,0,1,21.11,0,0.0,30.0,30.0,3.0,1


In [10]:
# Write the encoded frame to CSV; index=False keeps the row index out of the file
df.to_csv(Path('Resources') / 'heart_cleaned.csv', index=False)