In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [36]:
# Load the cleaned CVD dataset straight from the project's GitHub repository.
url = "https://raw.githubusercontent.com/Chasindu/Cardiovascular-Diseases-Risk-Prediction/main/CVD_cleaned.csv"
df = pd.read_csv(url, sep=',')

# backup_dataset
# `.copy()` gives the backup its own data; a plain `df_backup = df` would only
# bind a second name to the very same DataFrame object.
df_backup = df.copy()

In [37]:
# reset
# Work on a fresh copy of the backup: with a plain `df = df_backup` both names
# would refer to one object, and any later in-place operation on `df` would
# modify the backup as well, making a second reset impossible.
df = df_backup.copy()

In [38]:
# df = df_backup.sample(n=200, random_state=42)
#dfAge = df.pop('Age_Category')

In [39]:
# Remove duplicated rows.
# Rebinding `df` to the de-duplicated result (instead of `inplace=True`) leaves
# any other name that still points at the same DataFrame, such as `df_backup`,
# untouched.
df = df.drop_duplicates()

In [40]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Split the columns by dtype: numeric columns go through the numeric pipeline,
# every other column through the categorical one.
num_cols = df.select_dtypes(include='number').columns.to_list()
cat_cols = df.select_dtypes(exclude='number').columns.to_list()

# No column is held out here: every non-numeric column of `df` (the
# Heart_Disease target included) is one-hot encoded together with the features.

# Numeric columns: fill missing values with the column mean, then standardise.
num_pipeline = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode into float64 indicator columns.
cat_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(dtype=np.float64))

# Route each column group to its pipeline.
# sparse_threshold=0 makes the transformer always return a dense array; without
# it the result may come back as a scipy sparse matrix (when the overall density
# is below the default threshold), which cannot be wrapped directly in a
# DataFrame together with the feature names.
preprocessing = ColumnTransformer(
    [('num', num_pipeline, num_cols),
     ('cat', cat_pipeline, cat_cols)],
    remainder='passthrough',
    sparse_threshold=0,
)

preprocessing

In [41]:
# Fit the transformer on the current frame and transform it in the same step.
# NOTE(review): this fit happens on the full dataset, before the train/test
# split performed in a later cell, so imputation/scaling statistics also see
# the rows that end up in the test set - confirm this is acceptable.
df_prepared = preprocessing.fit_transform(df)

# The transformer returns a bare array, so the generated feature names are
# re-attached as column headers. For each Yes/No variable the redundant "_No"
# indicator is dropped right away, leaving only the "_Yes" column.
feature_names = preprocessing.get_feature_names_out()
df_prepared = pd.DataFrame(data=df_prepared, columns=feature_names).drop(
    columns=[
        'cat__Exercise_No',
        'cat__Smoking_History_No',
        'cat__Skin_Cancer_No',
        'cat__Other_Cancer_No',
        'cat__Depression_No',
        'cat__Arthritis_No',
        'cat__Heart_Disease_No',
    ]
)

# `df` and `df_prepared` are two names for the same encoded frame from here on.
df = df_prepared
df.head()

Unnamed: 0,num__Height_(cm),num__Weight_(kg),num__BMI,num__Alcohol_Consumption,num__Fruit_Consumption,num__Green_Vegetables_Consumption,num__FriedPotato_Consumption,cat__General_Health_Excellent,cat__General_Health_Fair,cat__General_Health_Good,...,cat__Age_Category_40-44,cat__Age_Category_45-49,cat__Age_Category_50-54,cat__Age_Category_55-59,cat__Age_Category_60-64,cat__Age_Category_65-69,cat__Age_Category_70-74,cat__Age_Category_75-79,cat__Age_Category_80+,cat__Smoking_History_Yes
0,-1.93417,-2.386099,-2.159627,-0.621621,0.006661,0.059656,0.664362,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,-0.526833,-0.303608,-0.051636,-0.621621,0.006661,-1.012235,-0.267624,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.714478,0.227673,0.742501,-0.133842,-0.716876,-0.811255,1.130355,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.880503,0.461456,0.015819,-0.621621,0.006661,0.997561,0.198369,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.912549,0.227673,-0.652605,-0.621621,-0.877663,-0.744262,-0.733617,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [43]:
# Column summary of the encoded frame. `df_prepared` was bound to the same
# object as `df` in the previous cell, so both names show the same columns.
df_prepared.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308774 entries, 0 to 308773
Data columns (total 43 columns):
 #   Column                                                    Non-Null Count   Dtype  
---  ------                                                    --------------   -----  
 0   num__Height_(cm)                                          308774 non-null  float64
 1   num__Weight_(kg)                                          308774 non-null  float64
 2   num__BMI                                                  308774 non-null  float64
 3   num__Alcohol_Consumption                                  308774 non-null  float64
 4   num__Fruit_Consumption                                    308774 non-null  float64
 5   num__Green_Vegetables_Consumption                         308774 non-null  float64
 6   num__FriedPotato_Consumption                              308774 non-null  float64
 7   cat__General_Health_Excellent                             308774 non-null  float64
 8   cat_

In [44]:
# from sklearn.preprocessing import LabelEncoder

# encode = LabelEncoder()

# df.remainder__Age_Category = encode.fit_transform(df_prepared.remainder__Age_Category)

In [45]:
# Strip the transformer prefix from every column name by keeping only the text
# after the first "__" (names without "__" are left unchanged).
df.columns = df.columns.map(lambda label: label.split('__', 1)[-1])
df.head()

Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,General_Health_Excellent,General_Health_Fair,General_Health_Good,...,Age_Category_40-44,Age_Category_45-49,Age_Category_50-54,Age_Category_55-59,Age_Category_60-64,Age_Category_65-69,Age_Category_70-74,Age_Category_75-79,Age_Category_80+,Smoking_History_Yes
0,-1.93417,-2.386099,-2.159627,-0.621621,0.006661,0.059656,0.664362,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,-0.526833,-0.303608,-0.051636,-0.621621,0.006661,-1.012235,-0.267624,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.714478,0.227673,0.742501,-0.133842,-0.716876,-0.811255,1.130355,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.880503,0.461456,0.015819,-0.621621,0.006661,0.997561,0.198369,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.912549,0.227673,-0.652605,-0.621621,-0.877663,-0.744262,-0.733617,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [46]:
# Column summary after the transformer prefixes were stripped from the names.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308774 entries, 0 to 308773
Data columns (total 43 columns):
 #   Column                                               Non-Null Count   Dtype  
---  ------                                               --------------   -----  
 0   Height_(cm)                                          308774 non-null  float64
 1   Weight_(kg)                                          308774 non-null  float64
 2   BMI                                                  308774 non-null  float64
 3   Alcohol_Consumption                                  308774 non-null  float64
 4   Fruit_Consumption                                    308774 non-null  float64
 5   Green_Vegetables_Consumption                         308774 non-null  float64
 6   FriedPotato_Consumption                              308774 non-null  float64
 7   General_Health_Excellent                             308774 non-null  float64
 8   General_Health_Fair                                  3

In [47]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
df_train, df_test = train_test_split(
    df,
    random_state=1,
    test_size=0.1,
)

In [48]:
# Column summary of the training split.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 277896 entries, 107627 to 128037
Data columns (total 43 columns):
 #   Column                                               Non-Null Count   Dtype  
---  ------                                               --------------   -----  
 0   Height_(cm)                                          277896 non-null  float64
 1   Weight_(kg)                                          277896 non-null  float64
 2   BMI                                                  277896 non-null  float64
 3   Alcohol_Consumption                                  277896 non-null  float64
 4   Fruit_Consumption                                    277896 non-null  float64
 5   Green_Vegetables_Consumption                         277896 non-null  float64
 6   FriedPotato_Consumption                              277896 non-null  float64
 7   General_Health_Excellent                             277896 non-null  float64
 8   General_Health_Fair                                  2

In [49]:
#df.rename(columns={'Heart_Disease_Yes': 'Heart_Disease'}, inplace=True)

# Column summary of the held-out test split.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30878 entries, 145860 to 86954
Data columns (total 43 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   Height_(cm)                                          30878 non-null  float64
 1   Weight_(kg)                                          30878 non-null  float64
 2   BMI                                                  30878 non-null  float64
 3   Alcohol_Consumption                                  30878 non-null  float64
 4   Fruit_Consumption                                    30878 non-null  float64
 5   Green_Vegetables_Consumption                         30878 non-null  float64
 6   FriedPotato_Consumption                              30878 non-null  float64
 7   General_Health_Excellent                             30878 non-null  float64
 8   General_Health_Fair                                  30878 non-nul

In [50]:
# Undersample the training split to a 50/50 class balance on Heart_Disease_Yes.

# Partition the training rows by target value.
df_heart_disease_0 = df_train.loc[df_train["Heart_Disease_Yes"].eq(0)]
df_heart_disease_1 = df_train.loc[df_train["Heart_Disease_Yes"].eq(1)]

# The smaller class decides how many rows are drawn from each class.
samples_per_category = min(df_heart_disease_0.shape[0], df_heart_disease_1.shape[0])

# Draw the same number of rows from both partitions, each with the same seed.
sample_heart_disease_0, sample_heart_disease_1 = (
    partition.sample(n=samples_per_category, random_state=1)
    for partition in (df_heart_disease_0, df_heart_disease_1)
)

# Stack the two samples and shuffle the rows so the classes are interleaved.
balanced_sample = pd.concat(
    [sample_heart_disease_0, sample_heart_disease_1]
).sample(frac=1, random_state=1)

df5050 = balanced_sample

df5050.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44886 entries, 197793 to 90428
Data columns (total 43 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   Height_(cm)                                          44886 non-null  float64
 1   Weight_(kg)                                          44886 non-null  float64
 2   BMI                                                  44886 non-null  float64
 3   Alcohol_Consumption                                  44886 non-null  float64
 4   Fruit_Consumption                                    44886 non-null  float64
 5   Green_Vegetables_Consumption                         44886 non-null  float64
 6   FriedPotato_Consumption                              44886 non-null  float64
 7   General_Health_Excellent                             44886 non-null  float64
 8   General_Health_Fair                                  44886 non-nul

In [51]:

# Class distribution of the balanced training sample versus the untouched test split.
train_counts = df5050['Heart_Disease_Yes'].value_counts()
test_counts = df_test['Heart_Disease_Yes'].value_counts()

# Each call prints the heading and the counts on separate lines.
print("df5050:", train_counts, sep="\n")
print("\ndf_test:", test_counts, sep="\n")

df5050:
Heart_Disease_Yes
1.0    22443
0.0    22443
Name: count, dtype: int64

df_test:
Heart_Disease_Yes
0.0    28350
1.0     2528
Name: count, dtype: int64


In [52]:
# Separate the features from the target: train on the balanced sample,
# evaluate on the test split.
X_train, y_train = df5050.drop(columns=['Heart_Disease_Yes']), df5050['Heart_Disease_Yes']
X_test, y_test = df_test.drop(columns=['Heart_Disease_Yes']), df_test['Heart_Disease_Yes']

# Fit a logistic regression; max_iter is raised to 2000 iterations and the
# seed is fixed for reproducibility.
model = LogisticRegression(max_iter=2000, random_state=1)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# confusion matrix
# labels=[0, 1] pins the row/column order (row = true class, column = predicted
# class) and guarantees a 2x2 result even if one class is missing from the
# data, so the unpacking below always maps to TN, FP, FN, TP.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
TN, FP, FN, TP = cm.ravel()

# Sensitivity = recall on the positive class; specificity = recall on the
# negative class.
sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)

print("Sensitivity:", sensitivity)
print("Specificity:", specificity)

Sensitivity: 0.78125
Specificity: 0.7341798941798942
