In [18]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
# Read the dataset from a CSV file located relative to the current working
# directory, then preview the first rows to check the columns and raw values.
df = pd.read_csv('CVD_cleaned.csv')
df.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


In [20]:
# encoding
# Converts every categorical column of `df` into a numeric representation:
#   * binary / nominal columns -> integer codes via LabelEncoder
#   * General_Health, Checkup  -> explicit ordinal codes (order chosen below)
#   * Diabetes                 -> one-hot indicator columns
label_encoder = LabelEncoder()

columns_to_encode = ["Exercise", "Heart_Disease",
                     "Skin_Cancer", "Other_Cancer", "Depression",
                     "Arthritis", "Sex", "Age_Category", "Smoking_History"]

# Keep one fitted encoder per column. Refitting a single shared encoder would
# leave only the last column's classes available, so the other columns could
# not be decoded back to their labels with inverse_transform.
label_encoders = {}
for column in columns_to_encode:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder


def _map_ordinal(series, mapping):
    """Map `series` through `mapping`, refusing values the mapping lacks.

    Plain ``Series.map`` turns every unmapped value into NaN without warning
    (for example when this cell is re-run on an already encoded column).
    Values that were already missing in `series` are passed through as NaN.

    Raises:
        ValueError: if a non-null value of `series` has no entry in `mapping`.
    """
    mapped = series.map(mapping)
    unexpected = series[mapped.isna() & series.notna()].unique()
    if len(unexpected) > 0:
        raise ValueError(
            f"Column {series.name!r} has values missing from the encoding: "
            f"{list(unexpected)}"
        )
    return mapped


encoding_health = {'Poor': 0, 'Fair': 1, 'Good': 2, 'Very Good': 3, 'Excellent': 4}
df['General_Health'] = _map_ordinal(df['General_Health'], encoding_health)

encoding_checkup = {'Never': 0, '5 or more years ago': 1, 'Within the past 5 years': 2, 'Within the past 2 years': 3, 'Within the past year': 4}
df['Checkup'] = _map_ordinal(df['Checkup'], encoding_checkup)


diabetes_data = df[['Diabetes']]

# oneHotEncoder
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(diabetes_data)
# Build the indicator frame on df's own index: concat(axis=1) aligns rows by
# index label, so a fresh RangeIndex would misalign the rows (and introduce
# NaNs) whenever df does not carry the default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_data.toarray(),
    columns=encoder.get_feature_names_out(['Diabetes']),
    index=df.index,
)
df_encoded = pd.concat([df.drop(columns=['Diabetes']), encoded_df], axis=1)

df = df_encoded

df.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Arthritis,Sex,Age_Category,...,BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,Diabetes_No,"Diabetes_No, pre-diabetes or borderline diabetes",Diabetes_Yes,"Diabetes_Yes, but female told only during pregnancy"
0,0,3,0,0,0,0,0,1,0,10,...,14.54,1,0.0,30.0,16.0,12.0,1.0,0.0,0.0,0.0
1,3,4,0,1,0,0,0,0,0,10,...,28.29,0,0.0,30.0,0.0,4.0,0.0,0.0,1.0,0.0
2,3,4,1,0,0,0,0,0,0,8,...,33.47,0,4.0,12.0,3.0,16.0,0.0,0.0,1.0,0.0
3,0,4,1,1,0,0,0,0,1,11,...,28.73,0,0.0,30.0,30.0,8.0,0.0,0.0,1.0,0.0
4,2,4,0,0,0,0,0,0,1,12,...,24.37,1,0.0,8.0,4.0,0.0,1.0,0.0,0.0,0.0


In [21]:
# split
# Hold out 10% of the encoded rows as the test set; the fixed random_state
# makes the partition repeatable between runs.
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=1,
)

In [22]:
# Print the column dtypes and non-null counts of the training split.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 277968 entries, 87866 to 128037
Data columns (total 22 columns):
 #   Column                                               Non-Null Count   Dtype  
---  ------                                               --------------   -----  
 0   General_Health                                       277968 non-null  int64  
 1   Checkup                                              277968 non-null  int64  
 2   Exercise                                             277968 non-null  int32  
 3   Heart_Disease                                        277968 non-null  int32  
 4   Skin_Cancer                                          277968 non-null  int32  
 5   Other_Cancer                                         277968 non-null  int32  
 6   Depression                                           277968 non-null  int32  
 7   Arthritis                                            277968 non-null  int32  
 8   Sex                                                  27

In [23]:
# Print the column dtypes and non-null counts of the held-out test split.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30886 entries, 137 to 72411
Data columns (total 22 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   General_Health                                       30886 non-null  int64  
 1   Checkup                                              30886 non-null  int64  
 2   Exercise                                             30886 non-null  int32  
 3   Heart_Disease                                        30886 non-null  int32  
 4   Skin_Cancer                                          30886 non-null  int32  
 5   Other_Cancer                                         30886 non-null  int32  
 6   Depression                                           30886 non-null  int32  
 7   Arthritis                                            30886 non-null  int32  
 8   Sex                                                  30886 non-null  

In [24]:
# Balance the training data by undersampling: draw the same number of rows
# from each Heart_Disease class, where that number is the size of the smaller
# class. Only df_train is resampled here.
df_heart_disease_0 = df_train.loc[df_train["Heart_Disease"].eq(0)]
df_heart_disease_1 = df_train.loc[df_train["Heart_Disease"].eq(1)]

# min: num of 0s, num of 1s
samples_per_category = min(map(len, (df_heart_disease_0, df_heart_disease_1)))

sample_heart_disease_0, sample_heart_disease_1 = (
    class_rows.sample(n=samples_per_category, random_state=1)
    for class_rows in (df_heart_disease_0, df_heart_disease_1)
)

# Stack both class samples, then shuffle the rows so the classes are interleaved.
balanced_sample = pd.concat(
    [sample_heart_disease_0, sample_heart_disease_1]
).sample(frac=1, random_state=1)

df5050 = balanced_sample

df5050.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45024 entries, 79050 to 11981
Data columns (total 22 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   General_Health                                       45024 non-null  int64  
 1   Checkup                                              45024 non-null  int64  
 2   Exercise                                             45024 non-null  int32  
 3   Heart_Disease                                        45024 non-null  int32  
 4   Skin_Cancer                                          45024 non-null  int32  
 5   Other_Cancer                                         45024 non-null  int32  
 6   Depression                                           45024 non-null  int32  
 7   Arthritis                                            45024 non-null  int32  
 8   Sex                                                  45024 non-null

In [25]:

# Class distribution of the target in the balanced training sample and in the
# test split, printed one after the other.
train_counts, test_counts = (
    frame['Heart_Disease'].value_counts() for frame in (df5050, df_test)
)

print("df5050:", train_counts, sep="\n")
print("\ndf_test:", test_counts, sep="\n")

df5050:
Heart_Disease
0    22512
1    22512
Name: count, dtype: int64

df_test:
Heart_Disease
0    28427
1     2459
Name: count, dtype: int64


In [26]:
# Separate the Heart_Disease target from the predictors, for both the
# balanced training sample and the test split.
y_train = df5050['Heart_Disease']
X_train = df5050.drop(columns=['Heart_Disease'])
y_test = df_test['Heart_Disease']
X_test = df_test.drop(columns=['Heart_Disease'])

# Fit a logistic regression on the balanced sample (fit() returns the estimator).
model = LogisticRegression(max_iter=2000, random_state=0).fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# confusion matrix: rows are the true classes, columns the predicted classes
cm = confusion_matrix(y_test, y_pred)
TN, FP, FN, TP = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]

# Sensitivity = share of true 1s predicted as 1;
# specificity = share of true 0s predicted as 0.
sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)

print("Sensitivity:", sensitivity)
print("Specificity:", specificity)

Sensitivity: 0.8003253355022367
Specificity: 0.7269145530657474
