In [86]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, classification_report
import pickle

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNN
import xgboost as XGB

import os


In [87]:
## Download the ECG heartbeat datasets through the Kaggle API so that they do not depend on GitHub and its file-size limits.
# The files are stored under `download_path`; the rest of the code reads them through `path_to_datasets`.
# NOTE: if this approach is kept, the file paths in all other notebooks have to be adjusted to use the downloaded datasets.
# RUN THIS CELL ONLY ONCE FOR ALL NOTEBOOKS!

from kaggle.api.kaggle_api_extended import KaggleApi

# Configure and authenticate the Kaggle API client.
# TODO: this could be configured so that an authentication prompt is shown.
api = KaggleApi()
api.authenticate()

# Kaggle coordinates of the ECG heartbeat dataset (owner and dataset name).
dataset_owner = "shayanfazeli"
dataset_name = "heartbeat"
dataset_slug = dataset_owner + "/" + dataset_name

# Download location. According to the original author the data folder is listed in .gitignore,
# so the large files are not pushed to the repository.
download_path = "../data/KAGGLE_datasets"
dataset_folder = os.path.join(download_path, dataset_name)

# File names the unzipped dataset is expected to contain.
expected_files = ["mitbih_test.csv", "mitbih_train.csv", "ptbdb_abnormal.csv", "ptbdb_normal.csv"]

if not os.path.exists(dataset_folder):
    # Case 1: the dataset folder does not exist yet --> download and unzip everything into it.
    api.dataset_download_files(dataset_slug, path=dataset_folder, unzip=True)
    print("Datasets are downloaded and unzipped.")
else:
    # Case 2: the folder exists, but individual dataset files might be missing.
    missing_files = [
        file_name
        for file_name in expected_files
        if not os.path.exists(os.path.join(dataset_folder, file_name))
    ]

    if missing_files:
        # At least one file is missing: download ALL files again and overwrite the folder content.
        api.dataset_download_files(dataset_slug, path=dataset_folder, unzip=True, force=True)
        print("Missing data was downloaded and unzipped. All Datasets are now available.")
    else:
        print("All Datasets are already available.")

# Path to the dataset folder that is used in the rest of the code.
path_to_datasets = download_path + "/" + dataset_name 

All Datasets are already available.


In [88]:
# Limit the floating-point precision that numpy uses when printing arrays to 4 digits.
np.set_printoptions(precision=4)

In [89]:
# Read the MIT-BIH train and test CSV files from the folder referenced by `path_to_datasets`.
# If the files are already available somewhere else locally, only these two paths need to change.
train_csv = f"{path_to_datasets}/mitbih_train.csv"
test_csv = f"{path_to_datasets}/mitbih_test.csv"
df_train = pd.read_csv(train_csv, header=None)
df_test = pd.read_csv(test_csv, header=None)

# Column 187 is the target; every other column is kept as a feature.
train, train_target = df_train.drop(columns=187), df_train[187]
test, test_target = df_test.drop(columns=187), df_test[187]

In [90]:
# Switches for the user to define which sampling method is used and which models are run.
class Config:
    """User-editable switches that select how the training data is resampled.

    The resampling cell of this notebook checks `oversample` first and only
    looks at `undersample` when `oversample` is False. That cell also
    overwrites `sample_name`, which is embedded in the file names of the
    saved models and classification reports.
    """
    oversample: bool = False  # True -> training data is resampled with SMOTE (sample name "MITBIH_B_SMOTE")
    undersample: bool = False  # True -> training data is resampled with the random undersampler (sample name "MITBIH_C_RUS")
    sample_name: str = "UNDEFINED_SAMPLE"  # placeholder until the resampling cell sets the actual sample name

# Switches for the baseline models: each flag is checked by the cell of the
# corresponding model, which trains, evaluates and saves the model when the flag is True.
Train_SVM =  False # trains the SVM model without grid search
Train_KNN = False # trains the KNN model without grid search
Train_DTC = False # trains the Decision Tree model without grid search
Train_RF = False # trains the Random Forest model without grid search
Train_XGB = False # trains the XGBoost model without grid search


In [91]:
# Resamplers used by the resampling step. Both draw random samples, so a fixed seed is
# passed to make the resampled training set (and every model trained on it) reproducible.
RESAMPLING_SEED = 42
oversampler = SMOTE(random_state=RESAMPLING_SEED)
undersampler = RandomUnderSampler(random_state=RESAMPLING_SEED)

In [92]:
# Based on the user settings, the training data is (re)built and the sample name
# (which ends up in the file names of saved models and reports) is set.
# `train` and `train_target` are derived from df_train in EVERY branch, so re-running
# this cell after changing the Config switches can never leave previously resampled
# data in memory under the name of a different sample.
mitbih_features, mitbih_labels = df_train.iloc[:, :-1], df_train.iloc[:, -1]

if Config.oversample and Config.undersample:
    # Both switches are set: the oversample branch below is evaluated first and wins.
    print("Warning: both oversample and undersample are set; oversampling takes precedence.")

if Config.oversample:
    train, train_target = oversampler.fit_resample(mitbih_features, mitbih_labels)
    Config.sample_name = "MITBIH_B_SMOTE"
elif Config.undersample:
    train, train_target = undersampler.fit_resample(mitbih_features, mitbih_labels)
    Config.sample_name = "MITBIH_C_RUS"
else:
    print("Using the original mitbih dataset")
    train, train_target = mitbih_features, mitbih_labels
    Config.sample_name = "MITBIH_A_Original"

print("Sample Name:", Config.sample_name)

Using the original mitbih dataset
Sample Name: MITBIH_A_Original


In [93]:
#Function to save models and classification report directly after running.
def save_model_and_report(model, report, model_filename, report_filename, model_folder="../models/ML_Models", report_folder="../reports/figures/ML_Models", oversize_folder="../data/models_too_big_for_git", max_model_bytes=98 * 1024 * 1024):
    """Pickle a model and write its classification report to disk.

    The model is first pickled into `model_folder`. If the resulting file is
    larger than `max_model_bytes` it is moved to `oversize_folder`, so that
    files exceeding the GitHub size restriction do not stay in the folder that
    is pushed. The report is written as a text file into `report_folder`.
    All target folders are created when they do not exist yet.

    Parameters
    ----------
    model : object
        Any picklable object (e.g. a fitted estimator).
    report : str
        Text of the classification report.
    model_filename : str
        File name of the pickled model.
    report_filename : str
        File name of the report text file.
    model_folder : str
        Folder the model is pickled into.
    report_folder : str
        Folder the report is written into.
    oversize_folder : str
        Folder that receives models larger than `max_model_bytes`.
    max_model_bytes : int
        Size limit in bytes; the default of 98 MB is the GitHub-motivated
        limit of the original code.

    Returns
    -------
    None

    Raises
    ------
    OSError
        If a folder cannot be created or a file cannot be written or moved.
    """
    # Create the target folders up front; an empty string means "current directory".
    for folder in (model_folder, report_folder):
        if folder:
            os.makedirs(folder, exist_ok=True)

    # Save the model
    model_savepath = os.path.join(model_folder, model_filename)
    with open(model_savepath, "wb") as f:
        pickle.dump(model, f)
    print(f"The model was saved as {model_filename} in folder {model_folder}.")

    # Move the file away if it exceeds the size limit (restriction for GitHub only!)
    if os.path.getsize(model_savepath) > max_model_bytes:
        print("Model file size is too big. Changing save path...")
        os.makedirs(oversize_folder, exist_ok=True)
        # os.replace overwrites a model of the same name from an earlier run.
        os.replace(model_savepath, os.path.join(oversize_folder, model_filename))
        print(f"Model moved to {oversize_folder} due to its size.")

    # Save the classification report
    report_savepath = os.path.join(report_folder, report_filename)
    with open(report_savepath, "w", encoding="utf-8") as f:
        f.write(report)
    print(f"The classification report was saved as {report_filename} in folder {report_folder}.")



In [94]:
# Display the shape of the training features as a quick sanity check.
train.shape

(87554, 187)

# **SVM**


In [95]:
# Baseline SVM (no grid search): create and fit the model, predict the test set,
# build the classification report and save both the model and the report.
if Train_SVM:
    model = SVC(cache_size=500)  # cache_size: size of the kernel cache (in MB per the scikit-learn docs)
    model.fit(train, train_target)
    predictions = model.predict(test)
    report = classification_report(test_target, predictions, digits=4)
    print(report)
    # The file names contain the sample name, so results of different samples do not overwrite each other.
    model_filename = f"SVM_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"SVM_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("SVM Model without gridsearch is not trained")

SVM Model withoud gridsearch is not trained


# **KNN**


In [96]:
# Baseline KNN (no grid search): fit, predict the test set, report and save.
if Train_KNN:
    model = KNN(n_jobs=-1)  # n_jobs=-1: use all available processors for the neighbor search
    model.fit(train, train_target)
    predictions = model.predict(test)
    report = classification_report(test_target, predictions, digits=4)
    print(report)
    # The file names contain the sample name, so results of different samples do not overwrite each other.
    model_filename = f"KNN_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"KNN_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("KNN Model without gridsearch is not trained.")

KNN Model without gridsearch is not trained.


# **Decision Tree**


In [97]:
# Baseline Decision Tree (no grid search): fit, predict the test set, report and save.
if Train_DTC:
    # A fixed random_state makes the fitted tree, and therefore the saved report, reproducible across runs.
    model = DTC(random_state=42)
    model.fit(train, train_target)
    predictions = model.predict(test)
    report = classification_report(test_target, predictions, digits=4)
    print(report)
    # The file names contain the sample name, so results of different samples do not overwrite each other.
    model_filename = f"DTC_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"DTC_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("Decision Tree Model without gridsearch is not trained.")

Decision Tree Model without gridsearch is not trained.


# **Random Forest**


In [98]:
# Baseline Random Forest (no grid search): fit, predict the test set, report and save.
if Train_RF:
    # A fixed random_state makes the fitted forest, and therefore the saved report, reproducible across runs.
    model = RFC(n_jobs=-1, random_state=42)
    model.fit(train, train_target)
    predictions = model.predict(test)
    report = classification_report(test_target, predictions, digits=4)
    print(report)
    # The file names contain the sample name, so results of different samples do not overwrite each other.
    model_filename = f"RFC_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"RFC_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("Random Forest Model without gridsearch is not trained.")

Random Forest Model without gridsearch is not trained.


# **XGBoost**


In [99]:
# Baseline XGBoost (no grid search): fit, predict the test set, report and save.
if Train_XGB:
    # The objective is stated as multi-class instead of 'binary:logistic'.
    # NOTE(review): this assumes the MITBIH target has more than two classes - confirm.
    # Per the xgboost docs, XGBClassifier switches to a multi-class objective itself in
    # that case, so naming it explicitly only makes the code say what actually runs.
    model = XGB.XGBClassifier(objective='multi:softprob')
    model.fit(train, train_target)
    predictions = model.predict(test)
    report = classification_report(test_target, predictions, digits=4)
    print(report)
    # The file names contain the sample name, so results of different samples do not overwrite each other.
    model_filename = f"XGB_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"XGB_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("XGB Model without gridsearch is not trained.")

XBG Model without gridsearch is not trained.


# Gridsearch Section

The following section presents the grid searches that each team member performed, together with the specific code that was used. Where possible,
the code was rewritten to be more compact and to fit the overall style of this final notebook, and the file-saving lines were added. Other than that, the original code is reused.

If files from the grid searches were pushed to git, keep the following in mind:
- Dataset C is the smallest and was used for debugging the code, but it was not used in the reports.
- Dataset B is far too big to be run on a single computer. If needed, the provided code can be run on Kaggle with some changes to the file paths. Since the grid search on Dataset B was also not included in the report, we do not provide the files. The code can generate the grid search results for Dataset B if needed.
- The grid search on Dataset A was redone and pushed to GitHub.

In [100]:
#necessary imports
from sklearn.model_selection import GridSearchCV
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC

In [108]:
# Configuration switches for the grid searches: each flag is checked by the grid search
# cell of the corresponding model. The parameter grids are defined inside those cells.
Gridsearch_SVM = False # (Done by Hakan?)
Gridsearch_KNN = False # (Done by Alex)
Gridsearch_DTC = False # (Done by Alex)
Gridsearch_RFC = False # (Done by Simon)
Gridsearch_XGB = False # (Done by Hakan)

# Print the name of the dataset variant that is currently used for the grid searches.
print("Current Dataset that is used for the gridsearch:", Config.sample_name)

Current Dataset that is used for the gridsearch: MITBIH_A_Original


## Gridsearch SVM

In [109]:
# Grid search for the SVM: 5-fold cross-validation over C and the kernel, followed by an
# evaluation of the refitted best model on the test set and saving of model and report.
if Gridsearch_SVM:
    svm_model = SVC()
    params = {'C': [1, 10, 100], 'kernel': [ 'linear','rbf']}
    grid_search = GridSearchCV(estimator=svm_model, param_grid=params, verbose=3, cv=5, n_jobs=-1)
    grid_search.fit(train, train_target)
    print("Best parameters:", grid_search.best_params_)

    # Predict the test set once and reuse the predictions for every metric below.
    y_pred = grid_search.predict(test)
    print(pd.crosstab(test_target, y_pred, rownames=['Class Actual'], colnames=['Class Predict']))
    report = classification_report(test_target, y_pred)
    print(report)

    # Same value as grid_search.score(test, test_target) (accuracy), but without predicting the test set a second time.
    print("Accuracy Score on Test Set: ", accuracy_score(test_target, y_pred))
    model_filename = f"SVM_Optimized_Model_with_Gridsearch_{Config.sample_name}.pkl"
    report_filename = f"SVM_Optimized_Model_with_Gridsearch_{Config.sample_name}_classification_report.txt"
    # Saving the best model and the associated classification report
    save_model_and_report(grid_search.best_estimator_, report, model_filename, report_filename)

else:
    print("No Gridsearch for SVM is performed.")


No Gridsearch for SVM is performed.


## Gridsearch KNN

In [110]:
# Grid search for KNN: 5-fold cross-validation (scored by accuracy), followed by an
# evaluation of the refitted best model on the test set and saving of model and report.
if Gridsearch_KNN:
    knn = neighbors.KNeighborsClassifier(n_jobs=-1)
    # Further candidates noted by the original author: metrics 'minkowski', 'chebyshev', 'euclidean'
    # and n_neighbors from range(1, 7).
    param_knn = {'metric': ['manhattan'],'n_neighbors': [1]}
    grid = GridSearchCV(estimator=knn, param_grid=param_knn, verbose=3, cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(train, train_target)
    print(pd.DataFrame(grid.cv_results_)[['params', 'mean_test_score', 'std_test_score']])

    print('Best Metric:', grid.best_estimator_.get_params()['metric'])
    print('Best K:', grid.best_estimator_.get_params()['n_neighbors'])

    # Predict the test set once and reuse the predictions for every metric below.
    y_pred = grid.predict(test)
    print(pd.crosstab(test_target, y_pred, rownames=['Class Actual'], colnames=['Class Predict']))
    report = classification_report(test_target, y_pred)
    print(report)

    # Same value as grid.score(test, test_target) (accuracy), but without predicting the test set a second time.
    print("Accuracy Score on Test Set: ", accuracy_score(test_target, y_pred))
    model_filename = f"KNN_Optimized_Model_with_Gridsearch_{Config.sample_name}.pkl"
    report_filename = f"KNN_Optimized_Model_with_Gridsearch_{Config.sample_name}_classification_report.txt"
    # Saving the best model and the associated classification report
    save_model_and_report(grid.best_estimator_, report, model_filename, report_filename)
else:
    print("No Gridsearch for KNN is performed.")

No Gridsearch for KNN is performed.


## Gridsearch DTC

In [111]:
# Grid search for the Decision Tree: 5-fold cross-validation (scored by accuracy), followed by
# an evaluation of the refitted best model on the test set and saving of model and report.
if Gridsearch_DTC:
    # A fixed random_state makes the fitted trees, and therefore the saved report, reproducible across runs.
    dt = DecisionTreeClassifier(random_state=42)
    # Further candidates noted by the original author: criterion 'gini' and max_depth 2, 4, 6, 8, 10.
    param_dt = {'criterion': ['entropy'],   'max_depth': [12]}
    grid = GridSearchCV(estimator=dt, param_grid=param_dt, verbose=3, cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(train, train_target)
    # Inside an if-block a bare expression is not displayed by the notebook, so the
    # cross-validation summary has to be printed explicitly.
    print(pd.DataFrame(grid.cv_results_)[['params', 'mean_test_score', 'std_test_score']])

    print('Best Metric:', grid.best_estimator_.get_params()['criterion'])
    print('Best Depth:', grid.best_estimator_.get_params()['max_depth'])

    # Predict the test set once and reuse the predictions for every metric below.
    y_pred = grid.predict(test)
    print(pd.crosstab(test_target, y_pred, rownames=['Class Actual'], colnames=['Class Predict']))
    report = classification_report(test_target, y_pred)
    print(report)

    # Same value as grid.score(test, test_target) (accuracy), but without predicting the test set a second time.
    print("Accuracy Score on Test Set: ", accuracy_score(test_target, y_pred))
    model_filename = f"DTC_Optimized_Model_with_Gridsearch_{Config.sample_name}.pkl"
    report_filename = f"DTC_Optimized_Model_with_Gridsearch_{Config.sample_name}_classification_report.txt"
    # Saving the best model and the associated classification report
    save_model_and_report(grid.best_estimator_, report, model_filename, report_filename)
else:
    print("No Gridsearch for DTC is performed.")

No Gridsearch for DTC is performed.


## Gridsearch RFC

In [112]:
# Grid search for the Random Forest: 5-fold cross-validation (scored by accuracy), followed by
# an evaluation of the refitted best model on the test set and saving of model and report.
if Gridsearch_RFC:
    # Further candidates noted by the original author are listed behind each entry.
    param_grid = {
        'n_estimators': [200],  # also: 50, 100
        'criterion': ['entropy'],  # also: 'gini'
        'max_depth': [None],  # also: 10, 20
        'min_samples_split': [2],  # also: 5
        'min_samples_leaf': [1],  # also: 2, 4
        'max_features': ['sqrt']  # also: 'log2'
    }
    # A fixed random_state makes the fitted forests, and therefore the saved report, reproducible across runs.
    rfc_grid = RandomForestClassifier(n_jobs=-1, random_state=42)
    grid_search = GridSearchCV(estimator=rfc_grid, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=3)
    grid_search.fit(train, train_target)
    print("Best parameters:", grid_search.best_params_)

    # Predict the test set once and reuse the predictions for every metric below.
    y_pred = grid_search.predict(test)
    print(pd.crosstab(test_target, y_pred, rownames=['Class Actual'], colnames=['Class Predict']))
    report = classification_report(test_target, y_pred)
    print(report)

    # Same value as grid_search.score(test, test_target) (accuracy), but without predicting the test set a second time.
    print("Accuracy Score on Test Set: ", accuracy_score(test_target, y_pred))
    model_filename = f"RFC_Optimized_Model_with_Gridsearch_{Config.sample_name}.pkl"
    report_filename = f"RFC_Optimized_Model_with_Gridsearch_{Config.sample_name}_classification_report.txt"
    # Saving the best model and the associated classification report
    save_model_and_report(grid_search.best_estimator_, report, model_filename, report_filename)

else:
    print("No Gridsearch for RFC is performed.")



No Gridsearch for RFC is performed.


## Gridsearch XGB


In [113]:
# Grid search for XGBoost: 5-fold cross-validation, followed by an evaluation of the
# refitted best model on the test set and saving of model and report.
if Gridsearch_XGB:
    # The objective is stated as multi-class instead of 'binary:logistic'.
    # NOTE(review): this assumes the MITBIH target has more than two classes - confirm.
    # Per the xgboost docs, XGBClassifier switches to a multi-class objective itself in
    # that case, so naming it explicitly only makes the code say what actually runs.
    # max_depth, learning_rate and n_estimators are set through the parameter grid below.
    xgb_model = xgb.XGBClassifier(
        objective='multi:softprob',
        n_jobs=-1,
    )
    # Further candidates noted by the original author: n_estimators 1500, 2000, 2500 and learning_rate 0.1, 0.2, 0.5.
    params = {'max_depth':[50], 'n_estimators':[2000], 'learning_rate': [0.2]}
    grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, verbose=3, cv=5, n_jobs=-1)
    grid_search.fit(train, train_target)
    print("Best parameters:", grid_search.best_params_)

    # Predict the test set once and reuse the predictions for every metric below.
    y_pred = grid_search.predict(test)
    print(pd.crosstab(test_target, y_pred, rownames=['Class Actual'], colnames=['Class Predict']))
    report = classification_report(test_target, y_pred)
    print(report)

    # Same value as grid_search.score(test, test_target) (accuracy), but without predicting the test set a second time.
    print("Accuracy Score on Test Set: ", accuracy_score(test_target, y_pred))
    model_filename = f"XGB_Optimized_Model_with_Gridsearch_{Config.sample_name}.pkl"
    report_filename = f"XGB_Optimized_Model_with_Gridsearch_{Config.sample_name}_classification_report.txt"
    # Saving the best model and the associated classification report
    save_model_and_report(grid_search.best_estimator_, report, model_filename, report_filename)

else:
    print("No Gridsearch for XGB is performed.")


No Gridsearch for XBG is performed.
