**Import Libraries**

In [17]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import train_test_split.
from sklearn.model_selection import train_test_split

# Import Logistic Regression model.
from sklearn.linear_model import LogisticRegression

# Import metrics.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, RocCurveDisplay, roc_auc_score, recall_score, precision_score, f1_score
from sklearn import metrics

**Data Reading**

In [18]:
# Mount Google Drive at /content/drive so files stored there can be read by
# path. This import is only available inside the Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
# Location of the dataset CSV inside the mounted Drive folder.
DATA_PATH = '/content/drive/MyDrive/Capstone Project/heart_2022_with_nans.csv'

# Read the whole CSV into the working DataFrame used by the following cells.
df = pd.read_csv(DATA_PATH)

In [20]:
# Preview the first rows to sanity-check that the file was parsed as expected.
df.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,,No,...,,,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,,No,...,1.6,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,,No,...,1.57,63.5,25.61,No,No,No,No,,No,Yes
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,,No,...,1.65,63.5,23.3,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,,No,...,1.57,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [21]:
# Dataset size as (number of rows, number of columns).
df.shape

(445132, 40)

In [22]:
# Per-column overview: name, non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445132 entries, 0 to 445131
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      445132 non-null  object 
 1   Sex                        445132 non-null  object 
 2   GeneralHealth              443934 non-null  object 
 3   PhysicalHealthDays         434205 non-null  float64
 4   MentalHealthDays           436065 non-null  float64
 5   LastCheckupTime            436824 non-null  object 
 6   PhysicalActivities         444039 non-null  object 
 7   SleepHours                 439679 non-null  float64
 8   RemovedTeeth               433772 non-null  object 
 9   HadHeartAttack             442067 non-null  object 
 10  HadAngina                  440727 non-null  object 
 11  HadStroke                  443575 non-null  object 
 12  HadAsthma                  443359 non-null  object 
 13  HadSkinCancer              44

In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms,BMI
count,434205.0,436065.0,439679.0,416480.0,403054.0,396326.0
mean,4.347919,4.382649,7.022983,1.702691,83.07447,28.529842
std,8.688912,8.387475,1.502425,0.107177,21.448173,6.554889
min,0.0,0.0,1.0,0.91,22.68,12.02
25%,0.0,0.0,6.0,1.63,68.04,24.13
50%,0.0,0.0,7.0,1.7,80.74,27.44
75%,3.0,5.0,8.0,1.78,95.25,31.75
max,30.0,30.0,24.0,2.41,292.57,99.64


**Data Cleaning**

In [24]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
df.duplicated().sum()

157

In [25]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# The result is re-assigned instead of using inplace=True: the outcome is the
# same, but the data lineage stays explicit and the call remains chainable.
df = df.drop_duplicates(keep='first')

# Confirm how many rows remain after de-duplication.
df.shape

(444975, 40)

In [26]:
# Count the missing entries in every column.
df.isna().sum()

State                            0
Sex                              0
GeneralHealth                 1193
PhysicalHealthDays           10922
MentalHealthDays              9062
LastCheckupTime               8301
PhysicalActivities            1088
SleepHours                    5448
RemovedTeeth                 11355
HadHeartAttack                3060
HadAngina                     4400
HadStroke                     1552
HadAsthma                     1768
HadSkinCancer                 3138
HadCOPD                       2214
HadDepressiveDisorder         2807
HadKidneyDisease              1921
HadArthritis                  2628
HadDiabetes                   1082
DeafOrHardOfHearing          20502
BlindOrVisionDifficulty      21419
DifficultyConcentrating      24095
DifficultyWalking            23867
DifficultyDressingBathing    23770
DifficultyErrands            25511
SmokerStatus                 35316
ECigaretteUsage              35514
ChestScan                    55900
RaceEthnicityCategor

In [27]:
# Columns to discard because they are treated as unnecessary for this analysis.
columns = [
    'State',
    'DeafOrHardOfHearing',
    'LastCheckupTime',
    'RemovedTeeth',
    'BlindOrVisionDifficulty',
    'DifficultyConcentrating',
    'DifficultyWalking',
    'DifficultyDressingBathing',
    'DifficultyErrands',
    'ECigaretteUsage',
    'ChestScan',
    'HIVTesting',
    'FluVaxLast12',
    'TetanusLast10Tdap',
]

# Re-assign instead of mutating in place, and skip labels that are already
# absent, so re-running this cell does not raise a KeyError.
# NOTE: errors='ignore' also silently skips a misspelled label, so keep the
# list above in sync with the actual column names.
df = df.drop(columns=columns, errors='ignore')

# Preview the frame to confirm the listed columns are gone.
df.head()

Unnamed: 0,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,PhysicalActivities,SleepHours,HadHeartAttack,HadAngina,HadStroke,HadAsthma,...,SmokerStatus,RaceEthnicityCategory,AgeCategory,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,PneumoVaxEver,HighRiskLastYear,CovidPos
0,Female,Very good,0.0,0.0,No,8.0,No,No,No,No,...,Never smoked,"White only, Non-Hispanic",Age 80 or older,,,,No,No,No,No
1,Female,Excellent,0.0,0.0,No,6.0,No,No,No,No,...,Never smoked,"White only, Non-Hispanic",Age 80 or older,1.6,68.04,26.57,No,No,No,No
2,Female,Very good,2.0,3.0,Yes,5.0,No,No,No,No,...,Never smoked,"White only, Non-Hispanic",Age 55 to 59,1.57,63.5,25.61,No,No,No,Yes
3,Female,Excellent,0.0,0.0,Yes,7.0,No,No,No,Yes,...,Current smoker - now smokes some days,"White only, Non-Hispanic",,1.65,63.5,23.3,No,Yes,No,No
4,Female,Fair,2.0,0.0,Yes,9.0,No,No,No,No,...,Never smoked,"White only, Non-Hispanic",Age 40 to 44,1.57,53.98,21.77,Yes,Yes,No,No
