In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [15]:
# Load the raw indicator table; the path is relative, so the CSV is expected
# in the notebook's working directory.
raw_csv_name = 'heart_disease_health_indicators_BRFSS2015.csv'
data_original = pd.read_csv(raw_csv_name)

In [16]:
# Show the raw DataFrame to inspect its columns and value coding before any recoding.
data_original

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,0.0,1.0,1.0,1.0,45.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,5.0,0.0,1.0,5.0,6.0,7.0
253676,0.0,1.0,1.0,1.0,18.0,0.0,0.0,2.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,2.0,4.0
253677,0.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,5.0,2.0
253678,0.0,1.0,0.0,1.0,23.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,7.0,5.0,1.0


### Data Value Clarifications

In [17]:
# Work on a copy so the raw table stays untouched, and collapse the Diabetes
# codes to a binary flag: original 0 and 1 both become 0, original 2 becomes 1.
diabetes_recode = {1: 0, 2: 1}
data_modified = data_original.copy()
data_modified['Diabetes'] = data_modified['Diabetes'].replace(diabetes_recode)

In [18]:
# Change age ordinal values (1-13) to age-range labels.
# The codes are read from data_original rather than from data_modified so that
# re-running this cell is safe: mapping the already-labelled column a second
# time would match none of the keys and turn every Age value into NaN.
age_map = {
    1: '18-24',
    2: '25-29',
    3: '30-34',
    4: '35-39',
    5: '40-44',
    6: '45-49',
    7: '50-54',
    8: '55-59',
    9: '60-64',
    10: '65-69',
    11: '70-74',
    12: '75-79',
    13: '80+',
}
data_modified['Age'] = data_original['Age'].map(age_map)

In [19]:
# Change education ordinal values (1-6) to descriptions.
# The codes are read from data_original rather than from data_modified so that
# re-running this cell is safe: mapping the already-labelled column a second
# time would match none of the keys and turn every Education value into NaN.
# NOTE(review): 'ony' in the first label looks like a typo for 'only'. It is
# left unchanged here because the label is written to the processed CSV and
# other code may match on the exact string; fix both sides together.
education_map = {
    1: 'No school or ony kindergarten',
    2: 'Elementary',
    3: 'Some high school',
    4: 'High school graduate',
    5: 'Some college or technical school',
    6: 'College graduate',
}
data_modified['Education'] = data_original['Education'].map(education_map)

In [20]:
# Change income ordinal values (1-8) to income-range labels.
# The codes are read from data_original rather than from data_modified so that
# re-running this cell is safe: mapping the already-labelled column a second
# time would match none of the keys and turn every Income value into NaN.
income_map = {
    1: '$0-$10,000',
    2: '$10,000-$15,000',
    3: '$15,000-$20,000',
    4: '$20,000-$25,000',
    5: '$25,000-$35,000',
    6: '$35,000-$50,000',
    7: '$50,000-$75,000',
    8: '$75,000+',
}
data_modified['Income'] = data_original['Income'].map(income_map)

In [21]:
# Build the combined outcome label: one digit per condition, concatenated in
# the order HeartDiseaseorAttack, Stroke, Diabetes (for example '101').
response_parts = ['HeartDiseaseorAttack', 'Stroke', 'Diabetes']
response_digits = data_modified[response_parts].astype(int).astype(str)
data_modified['Response'] = (
    response_digits['HeartDiseaseorAttack']
    + response_digits['Stroke']
    + response_digits['Diabetes']
)

In [22]:
# Change sex codes 0, 1 to the labels Female, Male.
# The codes are read from data_original rather than from data_modified so that
# re-running this cell is safe: mapping the already-labelled column a second
# time would match none of the keys and turn every Sex value into NaN.
sex_map = {
    0: 'Female',
    1: 'Male',
}
data_modified['Sex'] = data_original['Sex'].map(sex_map)

In [23]:
# The three individual outcome columns are already encoded in 'Response', so
# remove them and write the processed table to disk without the row index.
individual_outcome_columns = ['HeartDiseaseorAttack', 'Stroke', 'Diabetes']
data_response = data_modified.drop(individual_outcome_columns, axis='columns')
data_response.to_csv('STA160_Midterm_Data_Processed.csv', index=False)

Response Variables
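* Response: 3-character string of binary digits in the order HeartDiseaseorAttack | Stroke | Diabetes (e.g. '101' = heart disease/attack: yes, stroke: no, diabetes: yes)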
* HeartDiseaseorAttack: 0 -> No, 1 -> Yes
* Stroke: 0 -> No, 1 -> Yes
* Diabetes: 0 -> No, 1 -> Yes (recoded from the raw column: original codes 0 and 1 both become 0, original code 2 becomes 1)

Explanatory Variables
* HighBP: 0 -> No, 1 -> Yes
* HighChol: 0 -> No, 1 -> Yes
* CholCheck: 0 -> Chol not checked in past year, 1 -> Chol checked in past year
* BMI: body mass index
* Smoker: 0 -> No, 1 -> Yes
* PhysActivity: 0 -> No, 1 -> Yes
* Fruits: 0 -> No fruits, 1 -> At least 1 piece of fruit each day
* Veggies: 0 -> No veggies, 1 -> At least 1 piece of veg each day
* HvyAlcoholConsump: 0 -> No, 1 -> Yes
* AnyHealthcare: 0 -> No, 1 -> Yes
* NoDocbcCost: could not see doc bc of cost in past year: 0 -> No, 1 -> Yes
* GenHlth: 1 -> Excellent, ... , 5 -> Poor
* MentHlth: 0-30 -> Number of bad mental health days
* PhysHlth: 0-30 -> Number of bad physical health days
* DiffWalk: 0 -> No, 1 -> Yes
* Sex: Female, Male
* Age: 13 age ranges
* Education: 6 levels of education
* Income: 8 income ranges