In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer



# Load the preprocessed dataset
data = pd.read_csv("dataset/Hypertension_data.csv")

# Separate features and target variable
X = data.drop(columns=["prevalentHyp"])
y = data["prevalentHyp"]

In [2]:
data.dtypes

education          int64
age                int64
BMI              float64
currentSmoker      int64
prevalentHyp       int64
heartRate          int64
dtype: object

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007 entries, 0 to 1006
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   education      1007 non-null   int64  
 1   age            1007 non-null   int64  
 2   BMI            1007 non-null   float64
 3   currentSmoker  1007 non-null   int64  
 4   prevalentHyp   1007 non-null   int64  
 5   heartRate      1007 non-null   int64  
dtypes: float64(1), int64(5)
memory usage: 47.3 KB


In [4]:
data.shape

(1007, 6)

In [5]:
data.describe(include="all")

Unnamed: 0,education,age,BMI,currentSmoker,prevalentHyp,heartRate
count,1007.0,1007.0,1007.0,1007.0,1007.0,1007.0
mean,1.992056,49.471698,25.628818,1.497517,0.305859,76.056604
std,1.010844,8.771732,4.41317,0.500242,0.460999,12.002724
min,1.0,33.0,3.0,1.0,0.0,48.0
25%,1.0,42.0,22.885,1.0,0.0,68.0
50%,2.0,49.0,25.25,1.0,0.0,75.0
75%,3.0,57.0,27.98,2.0,1.0,83.5
max,4.0,68.0,45.8,2.0,1.0,140.0


In [8]:
# Handling Missing Values
imputer = SimpleImputer(strategy='most_frequent')  
data[['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo']] = imputer.fit_transform(data[['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo']])


# For categorical columns
categorical_cols = ['rbc', 'pc', 'pcc', 'ba', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
for col in categorical_cols:
    data[col].fillna('missing', inplace=True)  

# Convert object columns to numeric
binary_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
label_encoder = LabelEncoder()
for col in binary_cols:
    data[col] = label_encoder.fit_transform(data[col])

# Convert remaining object columns to numeric
data['pcv'] = pd.to_numeric(data['pcv'], errors='coerce')
data['wc'] = pd.to_numeric(data['wc'], errors='coerce')
data['rc'] = pd.to_numeric(data['rc'], errors='coerce')

# Feature Scaling
scaler = StandardScaler()
numerical_cols = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Drop Unnecessary Columns
data.drop(['id'], axis=1, inplace=True)

print(data.head())

KeyError: "['bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo'] not in index"

In [28]:
# Convert 'rc' column to numeric
data['rc'] = pd.to_numeric(data['rc'], errors='coerce')

# Handling Missing Values for Numerical Columns
numerical_imputer = SimpleImputer(strategy='median')  
data[['pcv', 'wc', 'rc']] = numerical_imputer.fit_transform(data[['pcv', 'wc', 'rc']])

# Display the preprocessed data
print(data.head())


        age        bp        sg        al        su  rbc  pc  pcc  ba  \
0 -0.216167  0.254214  0.421486  0.076249 -0.380269    1   2    1   1   
1 -2.627830 -1.972476  0.421486  2.363728 -0.380269    1   2    1   1   
2  0.607327  0.254214 -1.421074  0.838742  2.507853    2   2    1   1   
3 -0.216167 -0.488016 -2.342354  2.363728 -0.380269    2   0    2   1   
4 -0.039704  0.254214 -1.421074  0.838742 -0.380269    2   2    1   1   

        bgr  ...       pcv        wc        rc  htn  dm  cad  appet  pe  ane  \
0 -0.283841  ...  0.569881 -0.206202  0.481295    2   5    2      0   1    1   
1 -0.572370  ... -0.098536 -0.818559  0.090447    1   4    2      0   1    1   
2  3.676881  ... -0.878356 -0.308261  0.090447    1   5    2      2   1    2   
3 -0.336301  ... -0.766953 -0.580420 -0.788961    2   4    2      2   2    2   
4 -0.480565  ... -0.432744 -0.376301 -0.104977    1   4    2      0   1    1   

   classification  
0             ckd  
1             ckd  
2             ckd  


In [7]:
# # Drop specified columns from features
# X = data.drop(columns=['Diabetes_012', 'Education', 'Income', 'DiffWalk'])

# # Separate the target variable
# y = data['Diabetes_012']

# # Now you can proceed with preprocessing the data, splitting it into training and testing sets, and further analysis.


In [2]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Predicting on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluating the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7079207920792079


In [3]:
# Train Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Predicting on the test set using Decision Tree
dt_y_pred = dt_classifier.predict(X_test)

# Evaluate Random Forest Classifier
print("Random Forest Classifier:")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Evaluate Decision Tree Classifier
print("\nDecision Tree Classifier:")
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print("Accuracy:", dt_accuracy)
print("Classification Report:\n", classification_report(y_test, dt_y_pred))

Random Forest Classifier:
Accuracy: 0.7079207920792079
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.86      0.80       140
           1       0.53      0.37      0.44        62

    accuracy                           0.71       202
   macro avg       0.64      0.61      0.62       202
weighted avg       0.69      0.71      0.69       202


Decision Tree Classifier:
Accuracy: 0.6039603960396039
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.66      0.70       140
           1       0.38      0.48      0.43        62

    accuracy                           0.60       202
   macro avg       0.56      0.57      0.56       202
weighted avg       0.63      0.60      0.61       202



In [4]:
from sklearn.neighbors import KNeighborsClassifier

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a K-Nearest Neighbors Classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

# Predictions for KNN Classifier
knn_y_pred = knn_classifier.predict(X_test)

# Evaluate KNN Classifier
print("K-Nearest Neighbors Classifier:")
print("Accuracy:", accuracy_score(y_test, knn_y_pred))
print("Classification Report:\n", classification_report(y_test, knn_y_pred))


K-Nearest Neighbors Classifier:
Accuracy: 0.6584158415841584
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.80      0.76       140
           1       0.43      0.34      0.38        62

    accuracy                           0.66       202
   macro avg       0.58      0.57      0.57       202
weighted avg       0.64      0.66      0.65       202



In [5]:
from sklearn.model_selection import GridSearchCV

# Define parameter grids for Random Forest and Decision Tree classifiers
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt_param_grid = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Instantiate Random Forest and Decision Tree classifiers
rf_classifier = RandomForestClassifier(random_state=42)
dt_classifier = DecisionTreeClassifier(random_state=42)

# Instantiate GridSearchCV for Random Forest and Decision Tree classifiers
rf_grid_search = GridSearchCV(rf_classifier, rf_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
dt_grid_search = GridSearchCV(dt_classifier, dt_param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the models
rf_grid_search.fit(X_train, y_train)
dt_grid_search.fit(X_train, y_train)

# Best parameters for Random Forest
print("Best parameters for Random Forest:", rf_grid_search.best_params_)

# Best parameters for Decision Tree
print("Best parameters for Decision Tree:", dt_grid_search.best_params_)

# Predicting on the test set using best estimators from grid search
rf_y_pred = rf_grid_search.predict(X_test)
dt_y_pred = dt_grid_search.predict(X_test)

# Evaluate Random Forest Classifier
print("\nRandom Forest Classifier:")
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print("Accuracy:", rf_accuracy)
print("Classification Report:\n", classification_report(y_test, rf_y_pred))

# Evaluate Decision Tree Classifier
print("\nDecision Tree Classifier:")
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print("Accuracy:", dt_accuracy)
print("Classification Report:\n", classification_report(y_test, dt_y_pred))


Best parameters for Random Forest: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Best parameters for Decision Tree: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}

Random Forest Classifier:
Accuracy: 0.7227722772277227
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.88      0.81       140
           1       0.57      0.37      0.45        62

    accuracy                           0.72       202
   macro avg       0.67      0.62      0.63       202
weighted avg       0.70      0.72      0.70       202


Decision Tree Classifier:
Accuracy: 0.6584158415841584
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.74      0.75       140
           1       0.45      0.48      0.47        62

    accuracy                           0.66       202
   macro avg       0.61      0.61      0.61       202
weighted avg       

In [6]:
from sklearn.preprocessing import LabelEncoder

# Encode the target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Instantiate XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42)

# Define parameter grid for XGBoost classifier
xgb_param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

# Instantiate GridSearchCV for XGBoost classifier
xgb_grid_search = GridSearchCV(xgb_classifier, xgb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
xgb_grid_search.fit(X_train, y_train_encoded)

# Best parameters for XGBoost
print("Best parameters for XGBoost:", xgb_grid_search.best_params_)

# Predicting on the test set using best estimator from grid search
xgb_y_pred = xgb_grid_search.predict(X_test)

# Evaluate XGBoost Classifier
print("\nXGBoost Classifier:")
xgb_accuracy = accuracy_score(y_test_encoded, xgb_y_pred)
print("Accuracy:", xgb_accuracy)
print("Classification Report:\n", classification_report(y_test_encoded, xgb_y_pred))


Best parameters for XGBoost: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'reg_alpha': 0.1, 'reg_lambda': 1.5, 'subsample': 0.6}

XGBoost Classifier:
Accuracy: 0.7128712871287128
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.89      0.81       140
           1       0.56      0.32      0.41        62

    accuracy                           0.71       202
   macro avg       0.65      0.60      0.61       202
weighted avg       0.69      0.71      0.69       202



In [7]:
# Initialize LightGBM Classifier
lgb_classifier = LGBMClassifier(random_state=42)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for LightGBM
param_grid_lgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1]
}

# Initialize LightGBM Classifier
best_lgb_classifier = LGBMClassifier(num_leaves=50, max_depth=7, learning_rate=0.05, n_estimators=300, random_state=42)
best_lgb_classifier.fit(X_train_scaled, y_train)

# Perform GridSearchCV for LightGBM
grid_search_lgb = GridSearchCV(lgb_classifier, param_grid_lgb, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_lgb.fit(X_train_scaled, y_train)

# Get the best parameters for LightGBM
best_params_lgb = grid_search_lgb.best_params_
# print("Best Parameters for LightGBM:", best_params_lgb)

# Train the LightGBM classifier with the best parameters
best_lgb_classifier = LGBMClassifier(**best_params_lgb, random_state=42)
best_lgb_classifier.fit(X_train_scaled, y_train)

# Predictions for the best LightGBM Classifier
best_lgb_y_pred = best_lgb_classifier.predict(X_test_scaled)

# Evaluate the best LightGBM Classifier
print("Best LightGBM Classifier:")
print("Accuracy:", accuracy_score(y_test, best_lgb_y_pred))
print("Classification Report:\n", classification_report(y_test, best_lgb_y_pred))


[LightGBM] [Info] Number of positive: 246, number of negative: 559
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000700 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 344
[LightGBM] [Info] Number of data points in the train set: 805, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.305590 -> initscore=-0.820818
[LightGBM] [Info] Start training from score -0.820818
[LightGBM] [Info] Number of positive: 246, number of negative: 559
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000108 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 344
[LightGBM] [Info] Number of data points in the train set: 805, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.305590 -> initscore=-0.820818
[LightGBM] [Info] 