In [1]:
# Import the dependencies
import os
import pandas as pd
from pathlib import Path

In [2]:
# Load the three raw CSV datasets from ./Resources, one DataFrame per file
heart_attack, heart_disease, fast_food = (
    pd.read_csv(Path(csv_path))
    for csv_path in (
        './Resources/heart.csv',
        './Resources/hd_by_state.csv',
        './Resources/FastFoodRestaurants.csv',
    )
)

In [3]:
# Inspect the dtype pandas inferred for each heart_attack column
heart_attack.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

In [4]:
# List the original heart_attack column names before renaming them
heart_attack.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [5]:
# Split the run-together heart_attack column names into underscore-separated words.
# Age, Sex, Cholesterol and ST_Slope are not in the mapping, so they keep their names.
heart_attack.rename(
    columns={
        'ChestPainType': 'Chest_Pain_Type',
        'RestingBP': 'Resting_BP',
        'FastingBS': 'Fasting_BS',
        'RestingECG': 'Resting_ECG',
        'MaxHR': 'Max_HR',
        'ExerciseAngina': 'Exercise_Angina',
        'Oldpeak': 'Old_Peak',
        'HeartDisease': 'Heart_Disease',
    },
    inplace=True,
)

In [6]:
# Confirm the heart_attack columns now carry the underscore-separated names
heart_attack.columns

Index(['Age', 'Sex', 'Chest_Pain_Type', 'Resting_BP', 'Cholesterol',
       'Fasting_BS', 'Resting_ECG', 'Max_HR', 'Exercise_Angina', 'Old_Peak',
       'ST_Slope', 'Heart_Disease'],
      dtype='object')

In [7]:
# Preview the first rows of heart_attack to sanity-check the values under the renamed columns
heart_attack.head()

Unnamed: 0,Age,Sex,Chest_Pain_Type,Resting_BP,Cholesterol,Fasting_BS,Resting_ECG,Max_HR,Exercise_Angina,Old_Peak,ST_Slope,Heart_Disease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [8]:
# Inspect the dtype pandas inferred for each heart_disease column
# (a count column showing up as object means it was read as text)
heart_disease.dtypes

YEAR        int64
STATE      object
RATE      float64
DEATHS     object
URL        object
dtype: object

In [9]:
# List the original heart_disease column names before renaming them
heart_disease.columns

Index(['YEAR', 'STATE', 'RATE', 'DEATHS', 'URL'], dtype='object')

In [10]:
# Give the heart_disease columns descriptive, capitalised names.
# URL is not in the mapping, so it keeps its name.
heart_disease.rename(
    columns={
        'YEAR': 'Year',
        'STATE': 'US_State',
        'RATE': 'Rate_of_Heart_Disease',
        'DEATHS': 'Deaths',
    },
    inplace=True,
)

In [11]:
# Check Heart Disease Column Names have been updated
heart_disease.columns

Index(['Year', 'US_State', 'Rate_of_Heart_Disease', 'Deaths', 'URL'], dtype='object')

In [12]:
# Remove the URL column from heart_disease; it is not needed for this analysis
heart_disease.drop(columns=['URL'], inplace=True)

In [13]:
# Deaths loads as object (text) dtype, so strip the comma separators and then
# convert the column to a numeric dtype. Without the conversion the values stay
# strings, and sums, sorts and comparisons would operate on text instead of counts.
# pd.to_numeric raises if any remaining value is not a valid number.
heart_disease['Deaths'] = pd.to_numeric(
    heart_disease['Deaths'].replace(",", "", regex=True)
)

In [14]:
# Display heart_disease to confirm the URL column is gone and Deaths no longer contains commas
heart_disease

Unnamed: 0,Year,US_State,Rate_of_Heart_Disease,Deaths
0,2020,AL,237.5,14739
1,2020,AK,139.8,915
2,2020,AZ,144.8,14196
3,2020,AR,222.5,8621
4,2020,CA,144.0,66538
...,...,...,...,...
395,2005,VA,203.0,14192
396,2005,WA,180.5,10985
397,2005,WV,253.6,5538
398,2005,WI,190.6,11842


In [15]:
# Inspect the dtype pandas inferred for each fast_food column
fast_food.dtypes

index           int64
address        object
city           object
country        object
keys           object
latitude      float64
longitude     float64
name           object
postalCode     object
province       object
websites       object
dtype: object

In [16]:
# List the original fast_food column names before renaming them
fast_food.columns

Index(['index', 'address', 'city', 'country', 'keys', 'latitude', 'longitude',
       'name', 'postalCode', 'province', 'websites'],
      dtype='object')

In [17]:
# Give the fast_food columns descriptive, capitalised names.
# country, keys and websites are not in the mapping, so they keep their names.
fast_food.rename(
    columns={
        'index': 'Restaurant_Number',
        'address': 'Address',
        'city': 'City',
        'latitude': 'Latitude',
        'longitude': 'Longitude',
        'name': 'Name_of_Restaurant',
        'postalCode': 'Zip_Code',
        'province': 'US_State',
    },
    inplace=True,
)

In [18]:
# Confirm the fast_food columns now carry the new names
fast_food.columns

Index(['Restaurant_Number', 'Address', 'City', 'country', 'keys', 'Latitude',
       'Longitude', 'Name_of_Restaurant', 'Zip_Code', 'US_State', 'websites'],
      dtype='object')

In [19]:
# Count the distinct values in every fast_food column. A column with a single
# distinct value (check 'country' in particular) adds no information and can be
# dropped along with the other unneeded columns.
fast_food.nunique()

Restaurant_Number     10000
Address                9934
City                   2775
country                   1
keys                  10000
Latitude               9935
Longitude              9956
Name_of_Restaurant      548
Zip_Code               5289
US_State                 52
websites               3821
dtype: int64

In [20]:
# Remove the fast_food columns that are not needed for this analysis:
#   - 'websites' and 'keys' hold information we do not use
#   - 'country' has only one distinct value (US), so it adds nothing
fast_food.drop(columns=['websites', 'keys', 'country'], inplace=True)

In [21]:
# Preview the first rows of fast_food to confirm the renamed and remaining columns
fast_food.head()

Unnamed: 0,Restaurant_Number,Address,City,Latitude,Longitude,Name_of_Restaurant,Zip_Code,US_State
0,0,324 Main St,Massena,44.9213,-74.89021,McDonald's,13662,NY
1,1,530 Clinton Ave,Washington Court House,39.53255,-83.44526,Wendy's,43160,OH
2,2,408 Market Square Dr,Maysville,38.62736,-83.79141,Frisch's Big Boy,41056,KY
3,3,6098 State Highway 37,Massena,44.95008,-74.84553,McDonald's,13662,NY
4,4,139 Columbus Rd,Athens,39.35155,-82.09728,OMG! Rotisserie,45701,OH


In [22]:
# Save the cleaned fast_food data; the DataFrame index is left out of the file
fast_food.to_csv(
    path_or_buf=Path('./Resources/clean_fast_food.csv'),
    index=False,
)

In [23]:
# Save the cleaned heart_disease data. index=False matches the fast_food export
# and keeps the default row index from being written as an extra unnamed first
# column, which would reappear as 'Unnamed: 0' when the file is read back.
heart_disease.to_csv(Path('./Resources/clean_heart_disease.csv'), index=False)

In [24]:
# Save the cleaned heart_attack data. index=False matches the fast_food export
# and keeps the default row index from being written as an extra unnamed first
# column, which would reappear as 'Unnamed: 0' when the file is read back.
heart_attack.to_csv(Path('./Resources/clean_heart_attack.csv'), index=False)