# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the heart-disease survey CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer="heart_2020.csv")

# Data Description

## Data Checking

In [None]:
# List the column labels of the loaded DataFrame (the target `HeartDisease` plus the predictors)
df.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [None]:
# Display the index dtype, each column's dtype and non-null count, and the memory usage.
# Comparing the non-null counts against the number of rows reveals missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  int64  
 6   MentalHealth      319795 non-null  int64  
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  int64  
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

Since the non-null count of every column equals the number of rows, there are no null values in the dataset.

In [None]:
# Preview the first five records to see what each variable looks like
df.head(5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3,30,No,Female,55-59,White,Yes,Yes,Very good,5,Yes,No,Yes
1,No,20.34,No,No,Yes,0,0,No,Female,80 or older,White,No,Yes,Very good,7,No,No,No
2,No,26.58,Yes,No,No,20,30,No,Male,65-69,White,Yes,Yes,Fair,8,Yes,No,No
3,No,24.21,No,No,No,0,0,No,Female,75-79,White,No,No,Good,6,No,No,Yes
4,No,23.71,No,No,No,28,0,Yes,Female,40-44,White,No,Yes,Very good,8,No,No,No


In [None]:
#illustrate the HeartDisease class balance by using a pie chart
heart_counts = df["HeartDisease"].value_counts()
heart_counts.plot(kind="pie").set_title("HeartDisease")
plt.show()

#check the proportion of "Yes" and "No" in HeartDisease
#the count is looked up by its label: an integer key on a string-labelled Series
#relies on a deprecated positional fallback and on "Yes" being the second entry
Yes_Value = heart_counts.get("Yes", 0) / len(df) * 100
print(f"\nThe proportion of \"Yes\" Value is {Yes_Value:.2f}% and the proportion of \"No\" value is {100 - Yes_Value:.2f}%")

Only 8.56% of the records in the dataset belong to people who have heart disease, which shows that the dataset is imbalanced.

## Data Visualization

In [None]:
def pie_chart(variable):
    """Draw side-by-side pie charts of ``variable`` for each HeartDisease group.

    The left pie covers the rows where ``HeartDisease == "No"`` and the right
    pie the rows where ``HeartDisease == "Yes"``.

    Parameters
    ----------
    variable : str
        Name of the column of the global ``df`` whose distribution is plotted.
    """
    fig, axes = plt.subplots(1, 2, figsize=(20, 8), dpi=80)

    # One shared category order for both pies, so that a given wedge position
    # and colour always stand for the same category.
    categories = df[variable].value_counts().index

    for ax, answer in zip(axes, ("No", "Yes")):
        # Labels are taken from the same index as the counts; pairing
        # value_counts() with unique() would mix two different orderings
        # and attach the wrong label to a wedge.
        counts = (
            df.loc[df.HeartDisease == answer, variable]
            .value_counts()
            .reindex(categories, fill_value=0)
        )
        ax.pie(counts, labels=counts.index, autopct='%1.1f%%')
        ax.set_title(f'Heart Disease[{answer}]', fontsize=20)

    plt.legend(title=variable, fontsize=10, title_fontsize=10)
    plt.show()

**Binary columns**

`['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Diabetic', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer', 'Sex']`

In [None]:
# Yes/No-style columns (plus Sex): one pair of pie charts per column
bi_var = [
    'Smoking',
    'AlcoholDrinking',
    'Stroke',
    'DiffWalking',
    'Diabetic',
    'PhysicalActivity',
    'Asthma',
    'KidneyDisease',
    'SkinCancer',
    'Sex',
]
for var in bi_var:
    pie_chart(var)

**Categorical Columns**

`["Race", "GenHealth", "AgeCategory"]`

In [None]:
# Multi-level categorical columns: one pair of pie charts per column
ca_var = [
    "Race",
    "GenHealth",
    "AgeCategory",
]
for var in ca_var:
    pie_chart(var)

**Numerical Columns**

`["BMI", "SleepTime", "PhysicalHealth", "MentalHealth"]`

In [None]:
# Numerical columns: one filled density plot per column, split by HeartDisease
nu_var = ["BMI", "SleepTime", "PhysicalHealth", "MentalHealth"]
for var in nu_var:
    fig = plt.figure(figsize=(20, 10))
    # Columns are referenced by name and resolved against `data`
    sns.kdeplot(data=df, x=var, hue="HeartDisease", fill=True)
    plt.show()

**Correlation Matrix**

In [None]:
# Heatmap of the pairwise correlations between the numerical columns
fig = plt.figure(figsize=(20, 10))
# Correlation is only defined for numeric data; selecting the numeric columns
# explicitly keeps this working on pandas versions where corr() no longer
# silently drops text columns and raises on them instead.
corrMatrix = df.select_dtypes(include="number").corr()
sns.heatmap(corrMatrix, annot=True)
plt.show()

The correlation matrix shows how each numerical variable changes in relation to the other numerical variables.

# Data Preprocessing

In [None]:
#Remove duplicates
#drop_duplicates() returns a new DataFrame, so the result must be assigned back;
#comparing the row counts before and after shows how many rows were duplicated
rows_before = len(df)
df = df.drop_duplicates().reset_index(drop=True)
print(f"Removed {rows_before - len(df)} duplicated rows; shape is now {df.shape}")


#make categorical variables with "Yes/No" to numerical "1/0"
columns = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Diabetic',
           'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']

#the two extra keys fold the additional Diabetic answers into the 1/0 encoding
yes_no_to_int = {'Yes': 1, 'No': 0, 'Yes (during pregnancy)': 1, 'No, borderline diabetes': 0}

#encode all listed columns in one pass; the explicit cast guarantees integer
#columns and fails loudly if any answer is left unmapped
df[columns] = df[columns].replace(yes_no_to_int).astype("int64")

#Create dummy variables for Sex, AgeCategory, Race, and GenHealth
#dtype=int keeps the indicator columns numeric (0/1) on every pandas version
dummy_Sex = pd.get_dummies(df["Sex"], prefix="Sex", dtype=int)
dummy_Race = pd.get_dummies(df["Race"], prefix="Race", dtype=int)
dummy_GenHealth = pd.get_dummies(df["GenHealth"], prefix="GenHealth", dtype=int)
dummy_AgeCategory = pd.get_dummies(df["AgeCategory"], prefix="AgeCategory", dtype=int)

In [None]:
#load dataset
X = df[['BMI', 'Stroke','PhysicalHealth', 'MentalHealth', "Asthma",
        'DiffWalking', 'Diabetic', 'PhysicalActivity', 'SleepTime',
        'KidneyDisease', 'SkinCancer']]
       
y = df['HeartDisease']

#Add dummies into X
X = X.join(dummy_Sex)
X = X.join(dummy_AgeCategory) 
X = X.join(dummy_Race)
X = X.join(dummy_GenHealth)

#show the result 
X.info()

#split the data
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=0)

#standardize our independent variables
scaler = StandardScaler()
X_std = scaler.fit_transform(Xtrain)
Xtest_std = scaler.fit_transform(Xtest)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 37 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   BMI                                  319795 non-null  float64
 1   Stroke                               319795 non-null  int64  
 2   PhysicalHealth                       319795 non-null  int64  
 3   MentalHealth                         319795 non-null  int64  
 4   Asthma                               319795 non-null  int64  
 5   DiffWalking                          319795 non-null  int64  
 6   Diabetic                             319795 non-null  int64  
 7   PhysicalActivity                     319795 non-null  int64  
 8   SleepTime                            319795 non-null  int64  
 9   KidneyDisease                        319795 non-null  int64  
 10  SkinCancer                           319795 non-null  int64  
 11  Sex_Female   

# Logistic Regression

accuracy

In [None]:
# Train the classifier on the standardized training features
lr = LogisticRegression(random_state=0).fit(X_std, ytrain)

# Report the mean accuracy on the standardized test features
test_accuracy = lr.score(Xtest_std, ytest)
print(test_accuracy)

0.9156964321079019


**Independent Variables Coefficient**

In [None]:
# Print every feature name next to its fitted coefficient (8 decimal places),
# padded to fixed-width columns
for feature_name, coefficient in zip(X.columns, lr.coef_[0]):
    print('{0:40} {1:100}'.format(feature_name, '%.8f' % coefficient))

BMI                                      0.05401812                                                                                          
Stroke                                   0.20249526                                                                                          
PhysicalHealth                           0.02526520                                                                                          
MentalHealth                             0.04583937                                                                                          
Asthma                                   0.09736122                                                                                          
DiffWalking                              0.07724821                                                                                          
Diabetic                                 0.15085887                                                                                          
Physic

**Binary & Numerical Variables**

In [None]:
def addlabels(x, y):
    """Annotate the current plot with the values of ``y``.

    For every position ``i`` in ``x`` the value ``y[i]`` is written as a
    centred text label at coordinates ``(i, y[i])`` inside a red box.

    Parameters
    ----------
    x : sequence
        Bar labels; only its length is used.
    y : sequence
        Bar heights, used both as the label text and as its vertical position.
    """
    for i in range(len(x)):
        # The Text property is spelled `bbox` (lower case); the capitalised
        # spelling is not accepted by current matplotlib releases.
        plt.text(i, y[i], y[i], ha='center',
                 bbox=dict(facecolor='red', alpha=.8))

# Coefficients of the first 11 feature columns, rounded to 3 decimals
data = {
    name: round(coef, 3)
    for name, coef in zip(X.columns[:11], lr.coef_[0][:11])
}
courses = list(data)
values = list(data.values())

fig = plt.figure(figsize=(20, 15))

# bar chart with one labelled bar per feature
plt.bar(courses, values, width=0.4)
addlabels(courses, values)
plt.ylabel("Coefficient Value")
plt.title("Coefficient of Binary & Numerical Variables")
plt.show()

**Sex**

In [None]:
# Coefficients of feature columns 11 and 12 (the Sex dummies), rounded to 3 decimals
data = {
    name: round(coef, 3)
    for name, coef in zip(X.columns[11:13], lr.coef_[0][11:13])
}
courses = list(data)
values = list(data.values())

fig = plt.figure(figsize=(5, 5))

# bar chart with one labelled bar per feature
plt.bar(courses, values, width=0.4)
addlabels(courses, values)
plt.ylabel("Coefficient Value")
plt.title("Coefficient of Sex")
plt.show()

In [None]:
# Coefficients of feature columns 13 through 25 (the AgeCategory dummies),
# rounded to 3 decimals
data = {
    name: round(coef, 3)
    for name, coef in zip(X.columns[13:26], lr.coef_[0][13:26])
}
courses = list(data)
values = list(data.values())

fig = plt.figure(figsize=(30, 15))

# bar chart with one labelled bar per feature
plt.bar(courses, values, width=0.4)
addlabels(courses, values)
plt.ylabel("Coefficient Value")
plt.title("Coefficient of AgeCategory")
plt.show()

In [None]:
# Coefficients of the Race dummies, rounded to 3 decimals.
# The columns are selected by their "Race_" prefix instead of hard-coded
# positions: the Race dummies start right after the 11 + 2 + 13 preceding
# columns (index 26), so a range starting at 27 leaves the first one out.
data = {
    name: round(coef, 3)
    for name, coef in zip(X.columns, lr.coef_[0])
    if name.startswith("Race_")
}
courses = list(data.keys())
values = list(data.values())

fig = plt.figure(figsize=(20, 15))

# creating the bar plot
plt.bar(courses, values, width=0.4)
addlabels(courses, values)
plt.ylabel("Coefficient Value")
plt.title("Coefficient of Race")
plt.show()

In [None]:
# Coefficients of the GenHealth dummies, rounded to 3 decimals.
# The columns are selected by their "GenHealth_" prefix instead of hard-coded
# positions: with 37 feature columns the five GenHealth dummies occupy
# indices 32-36, so a range starting at 33 leaves the first one out.
data = {
    name: round(coef, 3)
    for name, coef in zip(X.columns, lr.coef_[0])
    if name.startswith("GenHealth_")
}
courses = list(data.keys())
values = list(data.values())

fig = plt.figure(figsize=(20, 15))

# creating the bar plot
plt.bar(courses, values, width=0.4)
addlabels(courses, values)
plt.ylabel("Coefficient Value")
plt.title("Coefficient of GenHealth")
plt.show()

# Testing

In [None]:
# Hypothetical respondents, one row each, laid out in the column order of X:
# the 11 numeric/binary features first, followed by the Sex, AgeCategory, Race
# and GenHealth dummy groups (37 values per row).
Asian_Male = [[25.7, 0, 23, 25,
                  0, 0, 0, 1, 8, 0, 0,
                  0, 1,
                  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                  0, 1, 0, 0, 0, 0,
                  0, 0, 0, 0, 1]]

AmericanIndian_Male = [[25.7, 0, 23, 25,
                  0, 0, 0, 1, 8, 0, 0,
                  0, 1,
                  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                  1, 0, 0, 0, 0, 0,
                  0, 0, 0, 0, 1]]

White_Male = [[33.1, 0, 15, 28,
                 0, 1, 1, 1, 9, 0, 0,
                 0, 1,
                 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
                 0, 0, 0, 0, 0, 1,
                 0, 0, 0, 1, 0]]

White_Female = [[33.1, 0, 15, 28,
                          0, 1, 1, 1, 9, 0, 0,
                          1, 0,
                          0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
                          0, 0, 0, 0, 0, 1,
                          0, 0, 0, 1, 0]]

White_Female_GenHealth = [[33.1, 0, 15, 28,
                 0, 1, 1, 1, 9, 0, 0,
                 1, 0,
                 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
                 0, 0, 0, 0, 0, 1,
                 0, 0, 1, 0, 0]]


def heart_disease_risk(profile):
    """Return the predicted probability of heart disease for ``profile``, in percent.

    ``profile`` is a single raw (unscaled) row wrapped in a list. The model was
    fitted on standardized features, so the row is passed through the fitted
    ``scaler`` before prediction; feeding raw values such as a BMI of 25.7
    directly to the model would be interpreted as extreme standardized scores.
    """
    # Naming the columns lets the scaler match them to the features it was fitted on
    profile_frame = pd.DataFrame(profile, columns=X.columns)
    # [0, 1] selects the single row's probability of the positive class as a scalar
    return lr.predict_proba(scaler.transform(profile_frame))[0, 1] * 100


print(f"The Asian man has {heart_disease_risk(Asian_Male):.2f}% chance to get heart disease")
print(f"The American Indian man has {heart_disease_risk(AmericanIndian_Male):.2f}% chance to get heart disease")
print(f"The white man has {heart_disease_risk(White_Male):.2f}% chance to get heart disease")
print(f"The white woman Phoenix has {heart_disease_risk(White_Female):.2f}% chance to get heart disease")
print(f"The white woman Phoenix with good GenHealth has {heart_disease_risk(White_Female_GenHealth):.2f}% chance to get heart disease")

The Asian man has 31.55% chance to get heart disease
The American Indian man has 33.50% chance to get heart disease
The white man has 65.51% chance to get heart disease
The white woman Phoenix has 56.47% chance to get heart disease
The white woman Phoenix with fair GenHealth has 54.64% chance to get heart disease
