In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.losses import BinaryCrossentropy
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

In [2]:
data= pd.read_csv(r"C:\Study Hub\Data Science_DEPI\Graduation_Project\Hypertension_project\Data Exploration&Cleaning\CleanedFinal.xls")
data.head()

Unnamed: 0,Country,Age,BMI,Cholesterol,Systolic_BP,Diastolic_BP,Smoking_Status,Alcohol_Intake,Physical_Activity_Level,Family_History,...,Sleep_Duration,Heart_Rate,LDL,HDL,Triglycerides,Glucose,Gender,Education_Level,Employment_Status,Hypertension
0,UK,58,29.5,230,160,79,Never,27.9,Low,Yes,...,6.1,80,100,75,72,179,Female,Primary,Unemployed,High
1,Spain,34,36.2,201,120,84,Never,27.5,High,Yes,...,9.8,56,77,47,90,113,Male,Secondary,Unemployed,High
2,Indonesia,73,18.2,173,156,60,Current,1.8,High,Yes,...,5.2,75,162,56,81,101,Male,Primary,Employed,Low
3,Canada,60,20.3,183,122,94,Never,11.6,Moderate,Yes,...,7.5,71,164,93,94,199,Female,Secondary,Retired,High
4,France,73,21.8,296,91,97,Never,29.1,Moderate,Yes,...,5.0,52,108,74,226,157,Female,Primary,Employed,High


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174982 entries, 0 to 174981
Data columns (total 23 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Country                  174982 non-null  object 
 1   Age                      174982 non-null  int64  
 2   BMI                      174982 non-null  float64
 3   Cholesterol              174982 non-null  int64  
 4   Systolic_BP              174982 non-null  int64  
 5   Diastolic_BP             174982 non-null  int64  
 6   Smoking_Status           174982 non-null  object 
 7   Alcohol_Intake           174982 non-null  float64
 8   Physical_Activity_Level  174982 non-null  object 
 9   Family_History           174982 non-null  object 
 10  Diabetes                 174982 non-null  object 
 11  Stress_Level             174982 non-null  int64  
 12  Salt_Intake              174982 non-null  float64
 13  Sleep_Duration           174982 non-null  float64
 14  Hear

In [4]:
data.shape

(174982, 23)

## Data Preprocessing

In [5]:
data.drop(columns= 'Country', inplace= True)

In [6]:
data.shape

(174982, 22)

In [7]:
## Encoding Categorical Variables
label_colu= ['Education_Level','Employment_Status', 'Smoking_Status', 'Physical_Activity_Level']
label = LabelEncoder()

In [8]:
for col in label_colu:
    data[col]= label.fit_transform(data[col])

In [9]:
one_hot_col= ['Hypertension','Gender','Diabetes','Family_History']
data= pd.get_dummies(data, columns= one_hot_col )

In [10]:
data.head()

Unnamed: 0,Age,BMI,Cholesterol,Systolic_BP,Diastolic_BP,Smoking_Status,Alcohol_Intake,Physical_Activity_Level,Stress_Level,Salt_Intake,...,Education_Level,Employment_Status,Hypertension_High,Hypertension_Low,Gender_Female,Gender_Male,Diabetes_No,Diabetes_Yes,Family_History_No,Family_History_Yes
0,58,29.5,230,160,79,2,27.9,1,9,14.7,...,0,2,True,False,True,False,False,True,False,True
1,34,36.2,201,120,84,2,27.5,0,6,10.8,...,1,2,True,False,False,True,False,True,False,True
2,73,18.2,173,156,60,0,1.8,0,5,6.5,...,0,0,False,True,False,True,False,True,False,True
3,60,20.3,183,122,94,2,11.6,2,6,4.0,...,1,1,True,False,True,False,False,True,False,True
4,73,21.8,296,91,97,2,29.1,2,6,8.4,...,0,0,True,False,True,False,True,False,False,True


In [11]:
bool_cols = data.select_dtypes(include='bool').columns
data[bool_cols] = data[bool_cols].astype(int)

In [12]:
data['Hypertension'] = data['Hypertension_High'].astype(int)

In [13]:
data = data.drop(["Hypertension_High", "Hypertension_Low"], axis=1)

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174982 entries, 0 to 174981
Data columns (total 25 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Age                      174982 non-null  int64  
 1   BMI                      174982 non-null  float64
 2   Cholesterol              174982 non-null  int64  
 3   Systolic_BP              174982 non-null  int64  
 4   Diastolic_BP             174982 non-null  int64  
 5   Smoking_Status           174982 non-null  int64  
 6   Alcohol_Intake           174982 non-null  float64
 7   Physical_Activity_Level  174982 non-null  int64  
 8   Stress_Level             174982 non-null  int64  
 9   Salt_Intake              174982 non-null  float64
 10  Sleep_Duration           174982 non-null  float64
 11  Heart_Rate               174982 non-null  int64  
 12  LDL                      174982 non-null  int64  
 13  HDL                      174982 non-null  int64  
 14  Trig

In [15]:
## Standardization
cols_to_scale= ['Age', 'BMI', 'Cholesterol', 'Systolic_BP', 'Diastolic_BP', 'Alcohol_Intake', 'Stress_Level', 'Sleep_Duration',
               'Salt_Intake', 'Heart_Rate', 'LDL', 'HDL', 'Triglycerides', 'Glucose']
scalar= StandardScaler()

In [16]:
data[cols_to_scale]= scalar.fit_transform(data[cols_to_scale])

In [17]:
data.head()

Unnamed: 0,Age,BMI,Cholesterol,Systolic_BP,Diastolic_BP,Smoking_Status,Alcohol_Intake,Physical_Activity_Level,Stress_Level,Salt_Intake,...,Glucose,Education_Level,Employment_Status,Gender_Female,Gender_Male,Diabetes_No,Diabetes_Yes,Family_History_No,Family_History_Yes,Hypertension
0,0.21681,0.277983,0.125934,0.979845,-0.60369,2,1.491914,1,1.543936,1.657904,...,1.186135,0,2,1,0,0,1,0,1,1
1,-0.937797,1.206753,-0.542863,-0.557482,-0.314852,2,1.445685,0,0.382196,0.617921,...,-0.572609,1,2,0,1,0,1,0,1,1
2,0.93844,-1.288449,-1.188598,0.826112,-1.701274,0,-1.524521,0,-0.00505,-0.528726,...,-0.892381,0,0,0,1,0,1,0,1,0
3,0.313027,-0.997342,-0.957978,-0.480616,0.262824,2,-0.391913,2,0.382196,-1.195382,...,1.719088,1,1,1,0,0,1,0,1,1
4,0.93844,-0.789409,1.648022,-1.672044,0.436127,2,1.6306,2,0.382196,-0.022068,...,0.599887,0,0,1,0,1,0,0,1,1


In [18]:
# Logistic model

In [19]:
x= data.drop(columns='Hypertension', axis= 1)
y= data['Hypertension']

In [20]:
x_train, x_test, y_train, y_test= train_test_split(x,y, test_size= 0.2, random_state= 42)

In [21]:
x_train.columns

Index(['Age', 'BMI', 'Cholesterol', 'Systolic_BP', 'Diastolic_BP',
       'Smoking_Status', 'Alcohol_Intake', 'Physical_Activity_Level',
       'Stress_Level', 'Salt_Intake', 'Sleep_Duration', 'Heart_Rate', 'LDL',
       'HDL', 'Triglycerides', 'Glucose', 'Education_Level',
       'Employment_Status', 'Gender_Female', 'Gender_Male', 'Diabetes_No',
       'Diabetes_Yes', 'Family_History_No', 'Family_History_Yes'],
      dtype='object')

In [22]:
model= LogisticRegression()

In [23]:
model.fit(x_train, y_train)

In [24]:
y_pred= model.predict(x_test)
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [25]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      9889
           1       0.72      1.00      0.84     25108

    accuracy                           0.72     34997
   macro avg       0.36      0.50      0.42     34997
weighted avg       0.51      0.72      0.60     34997



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
data['Hypertension'].value_counts()

Hypertension
1    125781
0     49201
Name: count, dtype: int64

In [27]:
from sklearn.utils import resample

In [28]:
class_1 = data[data['Hypertension'] == 1]
class_0 = data[data['Hypertension'] == 0]

In [29]:
class_1_down = resample(
    class_1,
    replace=False,            
    n_samples=len(class_0),  
    random_state=42
)

In [30]:
data_balanced = pd.concat([class_0, class_1_down]).sample(frac=1, random_state=42)

In [31]:
print(data_balanced['Hypertension'].value_counts())

Hypertension
0    49201
1    49201
Name: count, dtype: int64


In [32]:
## Split data
X= data_balanced.drop(columns= 'Hypertension', axis= 1)
y= data_balanced['Hypertension']

In [33]:
x_train, x_test, y_train, y_test= train_test_split(X,y, test_size= 0.2, random_state= 42,stratify=y)

In [34]:
model= LogisticRegression()

In [35]:
model.fit(x_train, y_train)

In [36]:
y_pred = model.predict(x_test)

In [37]:
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.49      0.49      0.49      9841
           1       0.49      0.49      0.49      9840

    accuracy                           0.49     19681
   macro avg       0.49      0.49      0.49     19681
weighted avg       0.49      0.49      0.49     19681



In [38]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(x_train, y_train)

In [39]:
y_pred = model.predict(x_test)

In [40]:
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.50      0.53      0.52      9841
           1       0.50      0.47      0.48      9840

    accuracy                           0.50     19681
   macro avg       0.50      0.50      0.50     19681
weighted avg       0.50      0.50      0.50     19681



In [41]:
## Neural Network

In [42]:
X = data.drop('Hypertension', axis=1)
y = data['Hypertension']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [43]:
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

[WinError 2] The system cannot find the file specified
  File "C:\Users\boody\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\boody\anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\boody\anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [44]:
y_train_res = np.array(y_train_res).reshape(-1, 1)
X_train_res = np.array(X_train_res, dtype=np.float32)

In [45]:
model= Sequential([ keras.layers.Dense(units= 256, activation= 'relu'),
                  keras.layers.Dense(units= 128, activation= 'relu'),
                  keras.layers.Dense(units= 64, activation= 'relu'),
                  keras.layers.Dense(units=32, activation= 'relu'),
                  keras.layers.Dense(units= 1, activation='linear')])

In [46]:
model.compile(optimizer= 'adam', loss= BinaryCrossentropy(from_logits= True), metrics= ['accuracy'])

In [47]:
model.fit(X_train_res, y_train_res, epochs= 50,validation_split=0.2 )

Epoch 1/50
[1m5032/5032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 5ms/step - accuracy: 0.5940 - loss: 0.6329 - val_accuracy: 0.4445 - val_loss: 0.6535
Epoch 2/50
[1m5032/5032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 5ms/step - accuracy: 0.6406 - loss: 0.6075 - val_accuracy: 0.5414 - val_loss: 0.6260
Epoch 3/50
[1m5032/5032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 5ms/step - accuracy: 0.6393 - loss: 0.6057 - val_accuracy: 0.6446 - val_loss: 0.5968
Epoch 4/50
[1m5032/5032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 5ms/step - accuracy: 0.6381 - loss: 0.6043 - val_accuracy: 0.5686 - val_loss: 0.6218
Epoch 5/50
[1m5032/5032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 5ms/step - accuracy: 0.6452 - loss: 0.6027 - val_accuracy: 0.6926 - val_loss: 0.5788
Epoch 6/50
[1m5032/5032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 5ms/step - accuracy: 0.6454 - loss: 0.6015 - val_accuracy: 0.7209 - val_loss: 0.5702
Epoch 7/50

<keras.src.callbacks.history.History at 0x1d898914d70>