In [22]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, classification_report
import pickle

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNN
import xgboost as XGB

import os


In [23]:
## Download the ECG heartbeat datasets through the Kaggle API, independent of GitHub and its file-size limits.
# The files are stored in a folder that is not pushed to the repository (see download_path below).
# If this approach is kept, the file paths in the other notebooks have to point to path_to_datasets as well.
# The download only has to succeed once per machine; afterwards this cell merely verifies that the files exist
# and does not contact Kaggle at all, so no Kaggle credentials are needed when the data is already present.

# Metadata of the ECG heartbeat dataset on Kaggle (owner name and dataset slug).
dataset_owner = "shayanfazeli"
dataset_name = "heartbeat"

# Download location. The data folder is listed in .gitignore, so the big files are never pushed.
download_path = "../data/KAGGLE_datasets"
dataset_folder = os.path.join(download_path, dataset_name)

# Hardcoded names of the files that are expected after unzipping the Kaggle archive.
required_files = ["mitbih_test.csv", "mitbih_train.csv", "ptbdb_abnormal.csv", "ptbdb_normal.csv"]


def _download_heartbeat_dataset(force=False):
    """Authenticate against Kaggle, then download and unzip the dataset into ``dataset_folder``.

    Parameters
    ----------
    force : bool, default False
        Passed through to ``KaggleApi.dataset_download_files``; set it to True to
        re-download even though (part of) the dataset folder already exists.
    """
    # Imported lazily so that neither the import nor the authentication runs when nothing
    # has to be downloaded (the kaggle package may already look for credentials at import time).
    from kaggle.api.kaggle_api_extended import KaggleApi

    api = KaggleApi()
    api.authenticate()
    api.dataset_download_files(dataset_owner + "/" + dataset_name, path=dataset_folder, unzip=True, force=force)


if not os.path.exists(dataset_folder):
    # Case 1: The dataset folder does not exist yet --> download everything into it.
    _download_heartbeat_dataset()
    print("Datasets are downloaded and unzipped.")
else:
    # Case 2: The folder exists, but individual files might be missing.
    missing_files = [
        file_name
        for file_name in required_files
        if not os.path.exists(os.path.join(dataset_folder, file_name))
    ]

    if missing_files:
        # At least one file is missing: download ALL files again and overwrite the old folder content.
        _download_heartbeat_dataset(force=True)
        print("Missing data was downloaded and unzipped. All Datasets are now available.")
    else:
        print("All Datasets are already available.")

# Variable that points to the datasets and is used in the rest of the notebook.
path_to_datasets = download_path + "/" + dataset_name 

All Datasets are already available.


In [24]:
# Limit the floating-point output of NumPy to 4 digits of precision.
# This only affects how arrays are printed, not the stored values.
np.set_printoptions(precision=4)

In [25]:
# Read both PTBDB class files from the dataset folder. The files carry no header row.
# If the data is already available somewhere else locally, point path_to_datasets to that location.
ptbdb_normal, ptbdb_abnormal = (
    pd.read_csv(f"{path_to_datasets}/{csv_name}", header=None)
    for csv_name in ("ptbdb_normal.csv", "ptbdb_abnormal.csv")
)

# PTBDB is delivered as one "normal" and one "abnormal" file: stack them row-wise with a fresh index ...
ptbdb_combined = pd.concat([ptbdb_normal, ptbdb_abnormal], axis=0, ignore_index=True)

# ... and shuffle all rows with a fixed seed so that the order is reproducible.
ptbdb_combined_shuffled = ptbdb_combined.sample(random_state=42, frac=1)

# Features are all columns except the last one; the last column is used as the target.
X, y = ptbdb_combined_shuffled.iloc[:, :-1], ptbdb_combined_shuffled.iloc[:, -1]

# Hold out 20 % of the rows as test set (fixed seed again).
train, test, train_target, test_target = train_test_split(X, y, random_state=42, test_size=0.20)

In [26]:
# Switches for the user: define which sampling method is used and which models are run.
class Config:
    """Resampling settings that are read (and partly written) by the resampling cell.

    At most one strategy is applied: the resampling cell checks ``oversample`` first
    and only looks at ``undersample`` when ``oversample`` is False. With both set to
    False the original training data is used.
    """
    oversample = False #refers to PTBDB B_SMOTE
    undersample = False #Refers to undersampling with random undersampler
    # Placeholder; the resampling cell overwrites it with the name of the chosen
    # sample, which then becomes part of the model and report filenames.
    sample_name = "UNDEFINED_SAMPLE"

# One switch per baseline model; a model cell only trains when its switch is True.
Train_SVM =  True #trains the SVM Model without Gridsearch
Train_KNN = True #trains the KNN Model without Gridsearch
Train_DTC = True #trains the DTC Model without Gridsearch
Train_RF = True #trains the RF Model without Gridsearch
Train_XGB = True #trains the XGB Model without Gridsearch

In [27]:
# Resamplers for the optional oversampling (SMOTE) and undersampling (random undersampler).
# Both draw random samples, so the seed is fixed (42, like the shuffle and the train/test split)
# to make resampled runs reproducible.
oversampler = SMOTE(random_state=42)
undersampler = RandomUnderSampler(random_state=42)

In [28]:
# Based on the user settings, the training split is resampled and the sample name
# (which ends up in the output filenames) is set. Only the training data is resampled;
# the test split is left untouched.
# NOTE: this cell overwrites train / train_target. Re-run the train/test split cell before
# re-running this one, otherwise already resampled data is resampled again.
if Config.oversample:
    train, train_target = oversampler.fit_resample(train, train_target)
    Config.sample_name = "PTBDB_B_SMOTE"
elif Config.undersample:
    train, train_target = undersampler.fit_resample(train, train_target)
    Config.sample_name = "PTBDB_C_RUS"
else:
    print("Using the original PTBDB dataset")
    Config.sample_name = "PTBDB_A_Original"

print("Sample Name:", Config.sample_name)

Using the original PTBDB dataset
Sample Name: PTBDB_A_Original


In [29]:
#Function to save models and classification report directly after running.
def save_model_and_report(model, report, model_filename, report_filename, model_folder="../models/ML_Models", report_folder="../reports/figures/ML_Models", max_size_mb=98, oversize_folder="../data/models_too_big_for_git"):
    """Pickle a fitted model and write its classification report to disk.

    The model is first written to ``model_folder``. If the resulting file is larger
    than ``max_size_mb`` it is moved to ``oversize_folder`` instead, so that it is not
    stored in the folder that is pushed to GitHub. All target folders are created
    when they do not exist yet.

    Parameters
    ----------
    model : object
        Any picklable object (here: the fitted estimator).
    report : str
        Text of the classification report.
    model_filename : str
        File name (not path) of the pickle file.
    report_filename : str
        File name (not path) of the report text file.
    model_folder : str
        Folder the model is written to.
    report_folder : str
        Folder the report is written to.
    max_size_mb : int or float, default 98
        Size limit in MB (1 MB = 1024 * 1024 bytes) above which the model file is
        moved to ``oversize_folder``. The default reflects the GitHub file size
        restriction the original code was written for.
    oversize_folder : str
        Folder that receives model files exceeding ``max_size_mb``.

    Returns
    -------
    None
    """
    # Save the model (create the folder first so that a fresh checkout does not fail)
    os.makedirs(model_folder, exist_ok=True)
    model_savepath = os.path.join(model_folder, model_filename)
    with open(model_savepath, "wb") as f:
        pickle.dump(model, f)
    print(f"The model was saved as {model_filename} in folder {model_folder}.")

    # Check if the model file exceeds the size limit (restriction for github only!)
    if os.path.getsize(model_savepath) > max_size_mb * 1024 * 1024:
        print("Model file size is too big. Changing save path...")
        os.makedirs(oversize_folder, exist_ok=True)  # Create the directory if it doesn't exist
        new_model_savepath = os.path.join(oversize_folder, model_filename)
        # Move the model to the new location; an existing file of the same name is overwritten.
        os.replace(model_savepath, new_model_savepath)
        print(f"Model moved to {oversize_folder} due to its size.")

    # Save the classification report
    os.makedirs(report_folder, exist_ok=True)
    report_savepath = os.path.join(report_folder, report_filename)
    with open(report_savepath, "w") as f:
        f.write(report)
    print(f"The classification report was saved as {report_filename} in folder {report_folder}.")

In [30]:
# Sanity check: show the dimensions of the training features that the models below are fitted on.
train.shape

(11641, 187)

# **SVM**

<font color='red'>PUT SOME NOTES / EXPLANATORY WORDS AS TEAM HERE IF NEEDED</font>

In [17]:
# Baseline SVM without grid search: create and fit the model, then build the
# classification report from the predictions on the test split.
if Train_SVM:
    # cache_size sets the kernel cache of the SVC (in MB).
    model = SVC(cache_size=500)
    model.fit(train, train_target)
    predictions = model.predict(test)
    report = classification_report(test_target, predictions, digits=4)
    print(report)
    # Save model and report; the sample name in the filenames keeps runs on
    # different resampling variants apart.
    model_filename = f"SVM_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"SVM_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("SVM Model without gridsearch is not trained")

              precision    recall  f1-score   support

         0.0     0.8743    0.8037    0.8375       805
         1.0     0.9272    0.9558    0.9413      2106

    accuracy                         0.9138      2911
   macro avg     0.9008    0.8798    0.8894      2911
weighted avg     0.9126    0.9138    0.9126      2911

The model was saved as SVM_Basemodel_no_gridsearch_PTBDB_A_Original.pkl in folder ../models/ML_Models.
The classification report was saved as SVM_Basemodel_no_gridsearch_PTBDB_A_Original_classification_report.txt in folder ../reports/figures/ML_Models.


# **KNN**

<font color='red'>PUT SOME NOTES / EXPLANATORY WORDS AS TEAM HERE IF NEEDED</font>

In [18]:
# Baseline KNN without grid search: fit on the training split, evaluate on the
# test split, then save model and report.
if Train_KNN:
    # n_jobs=-1 uses all available CPU cores for the neighbor search.
    model = KNN(n_jobs=-1)
    model.fit(train, train_target)
    predictions = model.predict(test)
    report = classification_report(test_target, predictions, digits=4)
    print(report)
    # Save model and report; the sample name in the filenames keeps runs on
    # different resampling variants apart.
    model_filename = f"KNN_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"KNN_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("KNN Model without gridsearch is not trained.")

              precision    recall  f1-score   support

         0.0     0.8550    0.9081    0.8807       805
         1.0     0.9640    0.9411    0.9524      2106

    accuracy                         0.9320      2911
   macro avg     0.9095    0.9246    0.9166      2911
weighted avg     0.9339    0.9320    0.9326      2911

The model was saved as KNN_Basemodel_no_gridsearch_PTBDB_A_Original.pkl in folder ../models/ML_Models.
The classification report was saved as KNN_Basemodel_no_gridsearch_PTBDB_A_Original_classification_report.txt in folder ../reports/figures/ML_Models.


# **Decision Tree**

<font color='red'>PUT SOME NOTES / EXPLANATORY WORDS AS TEAM HERE IF NEEDED</font>

In [16]:
# Baseline Decision Tree without grid search: fit on the training split, evaluate
# on the test split, then save model and report.
if Train_DTC:
    # Fixed seed (42, like the shuffle and the train/test split): tree building is
    # randomized, so without it the scores change from run to run.
    model = DTC(random_state=42)
    model.fit(train, train_target)
    predictions = model.predict(test)
    report = classification_report(test_target, predictions, digits=4)
    print(report)
    # Save model and report; the sample name in the filenames keeps runs on
    # different resampling variants apart.
    model_filename = f"DTC_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"DTC_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("Decision Tree Model without gridsearch is not trained.")

              precision    recall  f1-score   support

         0.0     0.8694    0.8683    0.8689       805
         1.0     0.9497    0.9501    0.9499      2106

    accuracy                         0.9275      2911
   macro avg     0.9095    0.9092    0.9094      2911
weighted avg     0.9275    0.9275    0.9275      2911

The model was saved as DTC_Basemodel_no_gridsearch_PTBDB_A_Original.pkl in folder ../models/ML_Models.
The classification report was saved as DTC_Basemodel_no_gridsearch_PTBDB_A_Original_classification_report.txt in folder ../reports/figures/ML_Models.


# **Random Forest**

<font color='red'>PUT SOME NOTES / EXPLANATORY WORDS AS TEAM HERE IF NEEDED</font>

In [19]:
# Baseline Random Forest without grid search: fit on the training split, evaluate
# on the test split, then save model and report.
if Train_RF:
    # n_jobs=-1 trains the trees on all available CPU cores. The seed is fixed
    # (42, like the shuffle and the train/test split) because the forest is built
    # from random bootstrap samples and random feature subsets.
    model = RFC(n_jobs=-1, random_state=42)
    model.fit(train, train_target)
    predictions = model.predict(test)
    report = classification_report(test_target, predictions, digits=4)
    print(report)
    # Save model and report; the sample name in the filenames keeps runs on
    # different resampling variants apart.
    model_filename = f"RFC_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"RFC_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("Random Forest Model without gridsearch is not trained.")

              precision    recall  f1-score   support

         0.0     0.9750    0.9217    0.9476       805
         1.0     0.9707    0.9910    0.9807      2106

    accuracy                         0.9718      2911
   macro avg     0.9729    0.9564    0.9642      2911
weighted avg     0.9719    0.9718    0.9716      2911

The model was saved as RFC_Basemodel_no_gridsearch_PTBDB_A_Original.pkl in folder ../models/ML_Models.
The classification report was saved as RFC_Basemodel_no_gridsearch_PTBDB_A_Original_classification_report.txt in folder ../reports/figures/ML_Models.


# **XGBoost**

<font color='red'>PUT SOME NOTES / EXPLANATORY WORDS AS TEAM HERE IF NEEDED</font>

In [31]:
# Baseline XGBoost without grid search: fit on the training split, evaluate on
# the test split, then save model and report.
if Train_XGB:
    # PTBDB has two classes (0.0 / 1.0), hence the binary logistic objective.
    model = XGB.XGBClassifier(objective='binary:logistic')
    model.fit(train, train_target)
    predictions = model.predict(test)
    report = classification_report(test_target, predictions, digits=4)
    print(report)
    # Save model and report; the sample name in the filenames keeps runs on
    # different resampling variants apart.
    model_filename = f"XGB_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"XGB_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("XGB Model without gridsearch is not trained.")

              precision    recall  f1-score   support

         0.0     0.9710    0.9553    0.9631       805
         1.0     0.9830    0.9891    0.9860      2106

    accuracy                         0.9797      2911
   macro avg     0.9770    0.9722    0.9745      2911
weighted avg     0.9797    0.9797    0.9797      2911

The model was saved as XGB_Basemodel_no_gridsearch_PTBDB_A_Original.pkl in folder ../models/ML_Models.
The classification report was saved as XGB_Basemodel_no_gridsearch_PTBDB_A_Original_classification_report.txt in folder ../reports/figures/ML_Models.
