In [1]:
import os
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
from sklearn import model_selection as sk_model_selection
from sklearn import preprocessing as sk_preprocessing
from sklearn import linear_model as sk_linear_model
from sklearn import svm as sk_svm
from sklearn import tree as sk_tree
from sklearn import ensemble as sk_ensemble
from sklearn import neighbors as sk_neighbors
from sklearn import metrics as sk_metrics

In [4]:
import lightgbm as lgbm

In [29]:
def set_seed(seed):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG with
    ``seed``, and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    NOTE(review): Python reads ``PYTHONHASHSEED`` at interpreter start-up, so
    setting it here presumably only influences child processes, not the
    running kernel -- confirm if hash-order determinism matters.
    """
    random.seed(seed)
    os.environ.update(PYTHONHASHSEED=f"{seed}")
    np.random.seed(seed)


# Single seed reused as `random_state` by the estimators defined further down.
SEED = 10
set_seed(seed=SEED)

In [31]:
def print_metrics(y_true, y_pred):
    """Compute, print and return accuracy, F1, precision and recall.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'`` mapped to the corresponding scikit-learn scores.

    The printed labels always say "(test set)", regardless of which split the
    labels actually come from.
    """
    # (result key, output template, scorer) -- order fixes both the print
    # order and the key order of the returned dict.
    report_rows = (
        ('accuracy', 'Accuracy (test set)\t| {:.4f}', sk_metrics.accuracy_score),
        ('f1', 'F1 (test set)\t\t| {:.4f}', sk_metrics.f1_score),
        ('precision', 'Precision (test set)\t| {:.4f}', sk_metrics.precision_score),
        ('recall', 'Recall (test set)\t| {:.4f}', sk_metrics.recall_score),
    )

    # Compute every score first so nothing is printed if a scorer fails.
    scores = {key: scorer(y_true, y_pred) for key, _, scorer in report_rows}

    for key, template, _ in report_rows:
        print(template.format(scores[key]))

    return scores

In [32]:
def fit_model_with_grid_search(model, parameters, scoring='f1', verbose=1, X=None, y=None):
    """Tune ``model`` with an exhaustive grid search and return the fitted search.

    Args:
        model: Unfitted scikit-learn compatible estimator.
        parameters: Parameter grid passed to ``GridSearchCV``.
        scoring: Scoring specification passed to ``GridSearchCV``.
        verbose: When truthy, print the best parameters and the best mean
            cross-validated score.
        X: Training features. When ``None``, the notebook-level ``X_train``
            is used (looked up at call time).
        y: Training labels. When ``None``, the notebook-level ``y_train``
            is used (looked up at call time).

    Returns:
        The fitted ``GridSearchCV`` object.

    Raises:
        NameError: If ``X``/``y`` are omitted and ``X_train``/``y_train`` have
            not been defined yet.
    """
    # Explicit data wins; otherwise keep the original behaviour of reading the
    # notebook globals so existing calls continue to work unchanged.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    search = sk_model_selection.GridSearchCV(
        model,
        parameters,
        scoring=scoring,
    )
    search.fit(X, y)

    if verbose:
        # Report the metric that was actually optimised instead of always
        # claiming F1; the default keeps the original "F1" wording.
        metric_label = 'F1' if scoring == 'f1' else str(scoring)
        print(f'best_params_: {search.best_params_}')
        print(f'Mean cross-validated {metric_label} score of the best_estimator: {search.best_score_:.4f}')

    return search


In [33]:
# Accumulates one metrics dict per model name; turned into a table at the end.
dict_results = dict()

# Data Loading and Splitting

In [9]:
# Location of the dataset CSV. Set the HEART_FAILURE_CSV environment variable
# to point at your own copy; the fallback is the original author's local path,
# kept so that existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    'HEART_FAILURE_CSV',
    r'C:\Users\Atharva Pathak\Desktop\Heart\heart_failure_clinical_records_dataset.csv',
)

df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head(5)

(299, 13)


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [34]:
# Build the feature matrix and target from the loaded frame so this cell does
# not depend on variables defined outside the notebook: every column except
# DEATH_EVENT is a feature, DEATH_EVENT is the label.
TARGET_COLUMN = 'DEATH_EVENT'
X_data = df.drop(columns=[TARGET_COLUMN])
y_data = df[TARGET_COLUMN]

# 80/20 shuffled split, stratified on the target.
# NOTE(review): random_state here is 42 rather than SEED -- kept as-is so the
# recorded results stay comparable.
X_train, X_test, y_train, y_test = sk_model_selection.train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y_data
)
print(f'X_train shape: {X_train.shape} y_train.shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape} y_test.shape: {y_test.shape}')

X_train shape: (239, 12) y_train.shape: (239,)
X_test shape: (60, 12) y_test.shape: (60,)


# Data Scaling

In [35]:
# Fit the scaler on the training split only, then apply the same transform to
# both splits. X_train / X_test are overwritten, so re-running this cell
# without re-running the split scales already-scaled data.
scaler = sk_preprocessing.StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Logistic Regression

In [36]:
# Logistic regression with balanced class weights; the grid search tunes only C.
parameters = dict(C=[0.01, 0.1, 1])

model_logistic_regression = fit_model_with_grid_search(
    model=sk_linear_model.LogisticRegression(
        class_weight='balanced',
        random_state=SEED,
    ),
    parameters=parameters,
    scoring='f1',
)

# Score the refit best estimator on the held-out split.
y_test_pred = model_logistic_regression.predict(X_test)

print()
dict_results['Logistic Regression'] = print_metrics(y_true=y_test, y_pred=y_test_pred)


best_params_: {'C': 0.1}
Mean cross-validated F1 score of the best_estimator: 0.7351

Accuracy (test set)	| 0.8000
F1 (test set)		| 0.6667
Precision (test set)	| 0.7059
Recall (test set)	| 0.6316




# Support Vector Machine

In [38]:
# Support vector classifier with balanced class weights; the grid covers
# C, kernel type and gamma.
parameters = dict(
    C=[0.01, 0.1, 1],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto'],
)

model_svc = fit_model_with_grid_search(
    model=sk_svm.SVC(
        class_weight='balanced',
        random_state=SEED,
    ),
    parameters=parameters,
    scoring='f1',
)

# Score the refit best estimator on the held-out split.
y_test_pred = model_svc.predict(X_test)

print()
dict_results['SVC'] = print_metrics(y_true=y_test, y_pred=y_test_pred)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


best_params_: {'C': 1, 'gamma': 'scale', 'kernel': 'sigmoid'}
Mean cross-validated F1 score of the best_estimator: 0.7427

Accuracy (test set)	| 0.7500
F1 (test set)		| 0.6154
Precision (test set)	| 0.6000
Recall (test set)	| 0.6316


# Decision Tree 

In [39]:
# Single decision tree with balanced class weights; the grid tunes tree depth
# and minimum leaf size.
parameters = dict(
    max_depth=[1, 2, 3, 5, 20, None],
    min_samples_leaf=[1, 5, 10, 20],
)

model_decision_tree = fit_model_with_grid_search(
    model=sk_tree.DecisionTreeClassifier(
        class_weight='balanced',
        random_state=SEED,
    ),
    parameters=parameters,
    scoring='f1',
)

# Score the refit best estimator on the held-out split.
y_test_pred = model_decision_tree.predict(X_test)

print()
dict_results['Decision Tree'] = print_metrics(y_true=y_test, y_pred=y_test_pred)

best_params_: {'max_depth': 2, 'min_samples_leaf': 20}
Mean cross-validated F1 score of the best_estimator: 0.7407

Accuracy (test set)	| 0.7833
F1 (test set)		| 0.6977
Precision (test set)	| 0.6250
Recall (test set)	| 0.7895




# Random Forest

In [40]:
# Random forest with balanced class weights; the grid tunes ensemble size,
# tree depth and minimum leaf size.
parameters = dict(
    n_estimators=[5, 10, 15, 20],
    max_depth=[1, 2, 3, 5, 10, None],
    min_samples_leaf=[1, 5, 10, 20],
)

model_random_forest = fit_model_with_grid_search(
    model=sk_ensemble.RandomForestClassifier(
        class_weight='balanced',
        random_state=SEED,
    ),
    parameters=parameters,
    scoring='f1',
)

# Score the refit best estimator on the held-out split.
y_test_pred = model_random_forest.predict(X_test)

print()
dict_results['Random Forest'] = print_metrics(y_true=y_test, y_pred=y_test_pred)



best_params_: {'max_depth': 5, 'min_samples_leaf': 5, 'n_estimators': 15}
Mean cross-validated F1 score of the best_estimator: 0.8047

Accuracy (test set)	| 0.8500
F1 (test set)		| 0.7097
Precision (test set)	| 0.9167
Recall (test set)	| 0.5789




# K-Nearest Neighbour

In [41]:
# k-nearest neighbours with default settings; the grid tunes k (1..10) and the
# neighbour weighting scheme.
parameters = dict(
    n_neighbors=list(range(1, 11)),
    weights=['uniform', 'distance'],
)

model_k_neighbors = fit_model_with_grid_search(
    model=sk_neighbors.KNeighborsClassifier(),
    parameters=parameters,
    scoring='f1',
)

# Score the refit best estimator on the held-out split.
y_test_pred = model_k_neighbors.predict(X_test)

print()
dict_results['K-Neighbors'] = print_metrics(y_true=y_test, y_pred=y_test_pred)


best_params_: {'n_neighbors': 5, 'weights': 'uniform'}
Mean cross-validated F1 score of the best_estimator: 0.5368

Accuracy (test set)	| 0.7000
F1 (test set)		| 0.3077
Precision (test set)	| 0.5714
Recall (test set)	| 0.2105




# LightGBM

In [42]:
# LightGBM classifier with balanced class weights; the grid tunes tree size,
# learning rate, number of boosting rounds and column subsampling, with the
# L1/L2 regularisation terms held at 1.
parameters = dict(
    num_leaves=[7, 15, 31],
    learning_rate=[0.001, 0.01, 0.1],
    n_estimators=[100, 200, 300],
    reg_alpha=[1],
    reg_lambda=[1],
    colsample_bytree=[0.5, 0.75, 1.],
)

model_lgbm = fit_model_with_grid_search(
    model=lgbm.LGBMClassifier(
        class_weight='balanced',
        random_state=SEED,
    ),
    parameters=parameters,
    scoring='f1',
)

# Score the refit best estimator on the held-out split.
y_test_pred = model_lgbm.predict(X_test)

print()
dict_results['LightGBM'] = print_metrics(y_true=y_test, y_pred=y_test_pred)



best_params_: {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'n_estimators': 200, 'num_leaves': 7, 'reg_alpha': 1, 'reg_lambda': 1}
Mean cross-validated F1 score of the best_estimator: 0.7748

Accuracy (test set)	| 0.8333
F1 (test set)		| 0.7059
Precision (test set)	| 0.8000
Recall (test set)	| 0.6316




# Final Comparison

In [43]:
# One row per model, one column per metric.
pd.DataFrame.from_dict(dict_results, orient='index')

Unnamed: 0,accuracy,f1,precision,recall
Logistic Regression,0.8,0.666667,0.705882,0.631579
SVC,0.75,0.615385,0.6,0.631579
Decision Tree,0.783333,0.697674,0.625,0.789474
Random Forest,0.85,0.709677,0.916667,0.578947
K-Neighbors,0.7,0.307692,0.571429,0.210526
LightGBM,0.833333,0.705882,0.8,0.631579
