## Ensemble

In [5]:
# Libraries
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from xgboost import XGBClassifier
import pickle
from pickle import load

root_path = "../../Data/GoogleDrive/"

In [6]:
# Load data
X_train = pd.read_parquet(root_path + "X_train.parquet")
X_test = pd.read_parquet(root_path + "X_test.parquet")
y_train = pd.read_parquet(root_path + "y_train.parquet")
y_test = pd.read_parquet(root_path + "y_test.parquet")

In [3]:
# Load models

# Root directory
root = '../../Data/GoogleDrive/'

# File names
log_name = 'logit_model_fixed.pkl'
knn_name = 'knn_model_fixed.pkl'
rf_name = 'rf_model_fixed.pkl'
xgb_name = 'xgboost_model_fixed.pkl'

with open(root + log_name, 'rb') as f:
    logit_model = pickle.load(f)

with open(root + knn_name, 'rb') as f:
    knn_model = pickle.load(f)

with open(root + rf_name, 'rb') as f:
    rf_model = pickle.load(f)

with open(root + xgb_name, 'rb') as f:
    xgb_model = pickle.load(f)

In [4]:
# Model
ensemble = VotingClassifier(
    estimators=[('logit', logit_model), 
                ('knn', knn_model),
                ('rf', rf_model),
                ('xgb', xgb_model)], 
                voting='soft')

dyld[63940]: Assertion failed: (this->magic == kMagic), function matchesPath, file Loader.cpp, line 154.

exception calling callback for <Future at 0x1745ea610 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "/opt/anaconda3/envs/ucla/lib/python3.11/site-packages/joblib/externals/loky/_base.py", line 26, in _invoke_callbacks
    callback(self)
  File "/opt/anaconda3/envs/ucla/lib/python3.11/site-packages/joblib/parallel.py", line 385, in __call__
    self.parallel.dispatch_next()
  File "/opt/anaconda3/envs/ucla/lib/python3.11/site-packages/joblib/parallel.py", line 834, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/ucla/lib/python3.11/site-packages/joblib/parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "/opt/anaconda3/envs/ucla/lib/python3.11/site-packages/joblib/parallel.py", line 819, in _dispatch

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGABRT(-6)}

In [5]:
print(ensemble)

VotingClassifier(estimators=[('logit',
                              GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=69, shuffle=True),
                                           estimator=Pipeline(steps=[('scaler',
                                                                      StandardScaler()),
                                                                     ('clf',
                                                                      LogisticRegression(max_iter=1000,
                                                                                         random_state=69))]),
                                           n_jobs=6,
                                           param_grid={'clf__C': array([0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007]),
                                                       'clf__class_weight': [None,
                                                                             'balanced'],
                                              

In [6]:
# Save ensemble model
ensemble_name = 'ensemble_model.pkl'
with open(root + ensemble_name, 'wb') as f:
    pickle.dump(ensemble, f)

In [8]:
# Load ensemble model
ensemble_name = 'ensemble_model.pkl'
with open(root_path + ensemble_name, 'rb') as f:
    ensemble_model = pickle.load(f)

In [11]:
# y_train = y_train.values.ravel()

ensemble_model.fit(X_train, y_train)
ensemble_model.score(X_test, y_test)
print('Ensemble model accuracy: ', ensemble_model.score(X_test, y_test))

# Predictions
y_pred = ensemble_model.predict(X_test)



Ensemble model accuracy:  0.86916645553585


In [12]:
# Save predictions
y_pred = pd.DataFrame(y_pred)
y_pred.to_parquet(root_path + 'ensemble_y_pred.parquet')

# Save fit model
with open(root_path + 'ensemble_model_fit.pkl', 'wb') as f:
    pickle.dump(ensemble_model, f)

In [16]:
# create confusion matrix
from sklearn.metrics import confusion_matrix

ensemble_cf = confusion_matrix(y_test, y_pred)
ensemble_cf = pd.DataFrame(ensemble_cf, 
                         columns = ['predicted_no_heart_disease', 
                                    'predicted_heart_disease'],
                         index = ['no_heart_disease',
                                  'has_heart_disease'])

In [14]:
# Evaluate

# Accuracy
ens_accuracy = accuracy_score(y_test, y_pred)

# Precision
ens_precision = precision_score(y_test, y_pred)

# Recall
ens_recall = recall_score(y_test, y_pred)

# F1
ens_f1 = f1_score(y_test, y_pred)

# AUC ROC
ens_roc = roc_auc_score(y_test, y_pred)

In [15]:
# Print
print('Accuracy: ', ens_accuracy)
print('Precision: ', ens_precision)
print('Recall: ', ens_recall)
print('F1: ', ens_f1)
print('AUC ROC: ', ens_roc)

Accuracy:  0.86916645553585
Precision:  0.4496545370796868
Recall:  0.41304899720741306
F1:  0.430575158786168
AUC ROC:  0.6721350353418308
