In [41]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, classification_report
import pickle

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNN
import xgboost as XGB

import os


In [33]:
## Download cell: uses the Kaggle API to fetch the datasets independently of GitHub and its file-size limit.
# The data is stored in a folder that is excluded from version control; every notebook must read from `path_to_datasets`.
# RUN THIS CELL ONLY ONCE FOR ALL NOTEBOOKS!

from kaggle.api.kaggle_api_extended import KaggleApi

# Configure and authenticate the Kaggle API client.
api = KaggleApi()
api.authenticate()

# Metadata of the ECG heartbeat dataset on Kaggle (original uploader and dataset name).
dataset_owner = "shayanfazeli"
dataset_name = "heartbeat"
dataset_slug = dataset_owner + "/" + dataset_name

# Download location: the data folder is listed in .gitignore, so the big files are never pushed.
download_path = "../data/KAGGLE_datasets"

# Single definition of the dataset location; the rest of the notebook reads the CSV files from here.
path_to_datasets = download_path + "/" + dataset_name
dataset_folder = os.path.join(download_path, dataset_name)

# Hardcoded names of the files the dataset is expected to contain.
expected_files = ["mitbih_test.csv", "mitbih_train.csv", "ptbdb_abnormal.csv", "ptbdb_normal.csv"]

if not os.path.exists(dataset_folder):
    # Case 1: the dataset folder does not exist yet --> download and unzip everything into it.
    api.dataset_download_files(dataset_slug, path=path_to_datasets, unzip=True)
    print("Datasets are downloaded and unzipped.")
else:
    # Case 2: the folder exists, but individual files might be missing.
    missing_files = [
        file_name
        for file_name in expected_files
        if not os.path.exists(os.path.join(dataset_folder, file_name))
    ]

    if missing_files:
        # At least one file is missing: download ALL files again and overwrite the old folder content.
        api.dataset_download_files(dataset_slug, path=path_to_datasets, unzip=True, force=True)
        print("Missing data was downloaded and unzipped. All Datasets are now available.")
    else:
        print("All Datasets are already available.")

All Datasets are already available.


In [32]:
# Display setting only: numpy prints floating point values with 4 digits of precision.
np.set_printoptions(precision=4)

In [34]:
# Read the MIT-BIH CSV files from the dataset download folder.
# If the files are already available elsewhere locally, adjust `path_to_datasets`.
train_csv = f"{path_to_datasets}/mitbih_train.csv"
test_csv = f"{path_to_datasets}/mitbih_test.csv"
df_train = pd.read_csv(train_csv, header=None)
df_test = pd.read_csv(test_csv, header=None)

# Column 187 is used as the target; every other column is kept as a feature.
train_target = df_train.loc[:, 187]
test_target = df_test.loc[:, 187]
train = df_train.drop(columns=187)
test = df_test.drop(columns=187)

In [36]:
# User switches: which resampling method is applied and which models are trained.
class Config:
    # Tag embedded in the saved file names; the resampling cell overwrites it.
    sample_name = "UNDEFINED_SAMPLE"
    # True --> resample the training data with SMOTE (sample "mitbih B_SMOTE").
    oversample = False
    # True --> resample the training data with the random undersampler.
    undersample = False

# One switch per model; each model is trained without grid search.
Train_SVM = True
Train_KNN = True
Train_DTC = True
Train_RF = True
Train_XGB = True


In [35]:
# Resamplers used by the resampling cell. Both draw random samples, so they get a fixed seed:
# without it the resampled training set (and every metric derived from it) changes on each run.
RESAMPLING_SEED = 42
oversampler = SMOTE(random_state=RESAMPLING_SEED)
undersampler = RandomUnderSampler(random_state=RESAMPLING_SEED)

In [38]:
# Based on the user settings, resampling is done and the sample name (used in the file names) is set.
# The cell always starts from the untouched `df_train`, so re-running it after changing Config
# never leaves a previously resampled `train` / `train_target` behind.
features_original = df_train.iloc[:, :-1]
target_original = df_train.iloc[:, -1]

if Config.oversample and Config.undersample:
    # Conflicting switches: keep the historic precedence (oversampling wins) but make it visible.
    print("Both oversample and undersample are enabled; oversampling takes precedence.")

if Config.oversample:
    train, train_target = oversampler.fit_resample(features_original, target_original)
    Config.sample_name = "MITBIH_B_SMOTE"
elif Config.undersample:
    train, train_target = undersampler.fit_resample(features_original, target_original)
    Config.sample_name = "MITBIH_C_RUS"
else:
    print("Using the original mitbih dataset")
    # Explicitly restore the original data in case an earlier run of this cell resampled it.
    train, train_target = features_original, target_original
    Config.sample_name = "MITBIH_A_Original"

print("Sample Name:", Config.sample_name)

Sample Name: MITBIH_C_RUS


In [37]:
#Function to save models and classification report directly after running.
def save_model_and_report(model, report, model_filename, report_filename, model_folder="../models/ML_Models", report_folder="../reports/figures/ML_Models", size_limit_mb=98, oversize_folder="../data/models_too_big_for_git"):
    """Pickle a fitted model and write its classification report to disk.

    Args:
        model: Any picklable object (the fitted estimator).
        report: Report text that is written to the report file as-is.
        model_filename: File name of the pickled model inside ``model_folder``.
        report_filename: File name of the report inside ``report_folder``.
        model_folder: Target folder for the model; created if it does not exist.
        report_folder: Target folder for the report; created if it does not exist.
        size_limit_mb: Model files larger than this many MiB are moved to
            ``oversize_folder`` (the default reflects the GitHub file-size restriction).
        oversize_folder: Folder that receives models exceeding ``size_limit_mb``;
            created on demand.

    Returns:
        None. Progress messages are printed.
    """
    # Create the target folders up front; on a fresh checkout they may not exist yet.
    # An empty folder name means "current directory" and must not be passed to makedirs.
    for folder in (model_folder, report_folder):
        if folder:
            os.makedirs(folder, exist_ok=True)

    # Save the model
    model_savepath = os.path.join(model_folder, model_filename)
    with open(model_savepath, "wb") as f:
        pickle.dump(model, f)
    print(f"The model was saved as {model_filename} in folder {model_folder}.")

    # Models above the size limit are moved out of the git-tracked folder (restriction for github only!)
    if os.path.getsize(model_savepath) > size_limit_mb * 1024 * 1024:
        print("Model file size is too big. Changing save path...")
        if oversize_folder:
            os.makedirs(oversize_folder, exist_ok=True)
        new_model_savepath = os.path.join(oversize_folder, model_filename)
        os.replace(model_savepath, new_model_savepath)  # Move the model to the new location
        print(f"Model moved to {oversize_folder} due to its size.")

    # Save the classification report
    report_savepath = os.path.join(report_folder, report_filename)
    with open(report_savepath, "w") as f:
        f.write(report)
    print(f"The classification report was saved as {report_filename} in folder {report_folder}.")



In [39]:
# Sanity check: display the (rows, columns) shape of the training features.
train.shape

(3205, 187)

# **SVM**

<font color='red'>PUT SOME NOTES / EXPLANATORY WORDS AS TEAM HERE IF NEEDED</font>

In [42]:
# Baseline SVM without grid search: create and fit the model, build the classification report
# from the test-set predictions, then save model and report.
if Train_SVM:
    model = SVC(cache_size=500)
    model.fit(train, train_target)
    predictions = model.predict(test)
    report = classification_report(test_target, predictions, digits=4)
    print(report)
    # The sample name is part of the file names, so runs on different samples do not overwrite each other.
    model_filename = f"SVM_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"SVM_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("SVM Model without gridsearch is not trained")

              precision    recall  f1-score   support

         0.0     0.9770    0.8097    0.8855     18118
         1.0     0.2998    0.6817    0.4165       556
         2.0     0.6591    0.8398    0.7385      1448
         3.0     0.0778    0.8951    0.1432       162
         4.0     0.7851    0.9291    0.8510      1608

    accuracy                         0.8179     21892
   macro avg     0.5598    0.8311    0.6070     21892
weighted avg     0.9180    0.8179    0.8559     21892

The model was saved as SVM_Basemodel_no_gridsearch_MITBIH_C_RUS.pkl in folder ../models/ML_Models.
The classification report was saved as SVM_Basemodel_no_gridsearch_MITBIH_C_RUS_classification_report.txt in folder ../reports/figures/ML_Models.


# **KNN**

<font color='red'>PUT SOME NOTES / EXPLANATORY WORDS AS TEAM HERE IF NEEDED</font>

In [40]:
# Baseline KNN without grid search: fit, evaluate on the test set, then save model and report.
if Train_KNN:
    model = KNN(n_jobs=-1)  # n_jobs=-1: use all available CPU cores for the neighbor search
    model.fit(train, train_target)
    predictions = model.predict(test)
    report = classification_report(test_target, predictions, digits=4)
    print(report)
    # The sample name is part of the file names, so runs on different samples do not overwrite each other.
    model_filename = f"KNN_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"KNN_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("KNN Model without gridsearch is not trained.")

              precision    recall  f1-score   support

         0.0     0.9822    0.7693    0.8628     18118
         1.0     0.1821    0.7518    0.2931       556
         2.0     0.5694    0.8191    0.6718      1448
         3.0     0.0967    0.9074    0.1748       162
         4.0     0.8323    0.9322    0.8794      1608

    accuracy                         0.7852     21892
   macro avg     0.5325    0.8360    0.5764     21892
weighted avg     0.9170    0.7852    0.8319     21892

The model was saved as KNN_Basemodel_no_gridsearch_MITBIH_C_RUS.pkl in folder ../models/ML_Models.
The classification report was saved as KNN_Basemodel_no_gridsearch_MITBIH_C_RUS_classification_report.txt in folder ../reports/figures/ML_Models.


# **Decision Tree**

<font color='red'>PUT SOME NOTES / EXPLANATORY WORDS AS TEAM HERE IF NEEDED</font>

In [44]:
# Baseline Decision Tree without grid search: fit, evaluate on the test set, then save model and report.
if Train_DTC:
    # Fixed seed: tree construction involves random tie-breaking, so without it results vary between runs.
    model = DTC(random_state=42)
    model.fit(train, train_target)
    predictions = model.predict(test)
    report = classification_report(test_target, predictions, digits=4)
    print(report)
    # The sample name is part of the file names, so runs on different samples do not overwrite each other.
    model_filename = f"DTC_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"DTC_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("Decision Tree Model without gridsearch is not trained.")

              precision    recall  f1-score   support

         0.0     0.9810    0.6933    0.8125     18118
         1.0     0.1296    0.7410    0.2206       556
         2.0     0.5136    0.7956    0.6242      1448
         3.0     0.0949    0.8704    0.1712       162
         4.0     0.6729    0.9123    0.7746      1608

    accuracy                         0.7187     21892
   macro avg     0.4784    0.8025    0.5206     21892
weighted avg     0.8993    0.7187    0.7775     21892

The model was saved as DTC_Basemodel_no_gridsearch_MITBIH_C_RUS.pkl in folder ../models/ML_Models.
The classification report was saved as DTC_Basemodel_no_gridsearch_MITBIH_C_RUS_classification_report.txt in folder ../reports/figures/ML_Models.


# **Random Forest**

<font color='red'>PUT SOME NOTES / EXPLANATORY WORDS AS TEAM HERE IF NEEDED</font>

In [43]:
# Baseline Random Forest without grid search: fit, evaluate on the test set, then save model and report.
if Train_RF:
    # Fixed seed: bootstrapping and feature sampling are random, so without it results vary between runs.
    # n_jobs=-1 trains the trees on all available CPU cores.
    model = RFC(n_jobs=-1, random_state=42)
    model.fit(train, train_target)
    predictions = model.predict(test)
    report = classification_report(test_target, predictions, digits=4)
    print(report)
    # The sample name is part of the file names, so runs on different samples do not overwrite each other.
    model_filename = f"RFC_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"RFC_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("Random Forest Model without gridsearch is not trained.")

              precision    recall  f1-score   support

         0.0     0.9865    0.8517    0.9142     18118
         1.0     0.2735    0.7968    0.4072       556
         2.0     0.7317    0.8812    0.7995      1448
         3.0     0.1337    0.9012    0.2329       162
         4.0     0.8533    0.9515    0.8997      1608

    accuracy                         0.8600     21892
   macro avg     0.5957    0.8765    0.6507     21892
weighted avg     0.9355    0.8600    0.8876     21892

The model was saved as RFC_Basemodel_no_gridsearch_MITBIH_C_RUS.pkl in folder ../models/ML_Models.
The classification report was saved as RFC_Basemodel_no_gridsearch_MITBIH_C_RUS_classification_report.txt in folder ../reports/figures/ML_Models.


# **XGBoost**

<font color='red'>PUT SOME NOTES / EXPLANATORY WORDS AS TEAM HERE IF NEEDED</font>

In [45]:
# Baseline XGBoost without grid search: fit, evaluate on the test set, then save model and report.
if Train_XGB:
    # The target has five classes (0-4), so a multi-class objective is requested explicitly
    # instead of the binary 'binary:logistic' objective.
    model = XGB.XGBClassifier(objective='multi:softprob')
    model.fit(train, train_target)
    predictions = model.predict(test)
    report = classification_report(test_target, predictions, digits=4)
    print(report)
    # The sample name is part of the file names, so runs on different samples do not overwrite each other.
    model_filename = f"XGB_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"XGB_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("XGB Model without gridsearch is not trained.")

              precision    recall  f1-score   support

         0.0     0.9892    0.8345    0.9053     18118
         1.0     0.2268    0.8291    0.3561       556
         2.0     0.7045    0.9054    0.7924      1448
         3.0     0.1588    0.8951    0.2698       162
         4.0     0.8556    0.9577    0.9038      1608

    accuracy                         0.8486     21892
   macro avg     0.5870    0.8844    0.6455     21892
weighted avg     0.9350    0.8486    0.8791     21892

The model was saved as XGB_Basemodel_no_gridsearch_MITBIH_C_RUS.pkl in folder ../models/ML_Models.
The classification report was saved as XGB_Basemodel_no_gridsearch_MITBIH_C_RUS_classification_report.txt in folder ../reports/figures/ML_Models.
