In [88]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, classification_report
import pickle

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNN
import xgboost as XGB

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [89]:
## Test cell for simon: Using Kaggle API to download the datasets indepent of github and its filesize limitations. Storing it in folder located outside of the repo.
# If this works, all filepaths have to be adjusted in all notebooks to make use of the downloaded datasets.
#RUN THIS CELL ONLY ONCE FOR ALL NOTEBOOKS!

from kaggle.api.kaggle_api_extended import KaggleApi

#configuring and authentification with kaggle api. This could be configured so that a authentification mask is shown?
api = KaggleApi()
api.authenticate()

#Configuring the metadata for the ecg heartbeat data (original username etc)
dataset_owner = "shayanfazeli"
dataset_name = "heartbeat"

#Configuring a download path that is NOT in the current github repo (so the big files are not pushed and cause an error!) --> Links to filepaths have to be dynamically adjusted
download_path = "../data/KAGGLE_datasets" #In this case we use the data folder that is in the .gitignore list and therefore not pushed! To keep everything in one local repo.

# Download structure: First check if dataset is already downloaded, else download it and store it in download path (should be outside git repo!)
dataset_folder = os.path.join(download_path, dataset_name)
if not os.path.exists(dataset_folder):
    # Case 1: Dataset path is not created --> Create it and download datasets into it
    api.dataset_download_files(dataset_owner + "/" + dataset_name, path=download_path + "/" + dataset_name, unzip=True)
    print("Datasets are downloaded and unzipped.")
else:
    # Case 2: Folder is created, but datasets might be missing
    missing_files = [] 
    for file_name in ["mitbih_test.csv", "mitbih_train.csv", "ptbdb_abnormal.csv", "ptbdb_normal.csv"]:  # These are the hardcoded names of the datasets that should be downloaded.
        file_path = os.path.join(dataset_folder, file_name)
        if not os.path.exists(file_path):
            missing_files.append(file_name)

    if missing_files:
        # If the list contains missing files, download ALL files and overwrite the old folder.
        api.dataset_download_files(dataset_owner + "/" + dataset_name, path=download_path + "/" + dataset_name, unzip=True, force=True)
        print("Missing data was donwloaded and unzipped. All Datasets are now available.")
    else:
        print("All Datasets are already available.")

#Creating new variable that links to the datasets and can be used in the rest of the code.
path_to_datasets = download_path + "/" + dataset_name 

All Datasets are already available.


In [84]:
np.set_printoptions(precision=4)

In [83]:
# This cell now makes use of the downloadfolder for the datasets. If already available locally, the filepaths can be changed.
df_train= pd.read_csv(path_to_datasets + "/" + 'mitbih_train.csv', header=None)
df_test=pd.read_csv(path_to_datasets + "/" +  'mitbih_test.csv',header=None)

#split target and value
train_target=df_train[187]
test_target=df_test[187]
train=df_train.drop(187,axis=1)
test=df_test.drop(187,axis=1)

In [85]:
class Config:
    oversample = False #refers to mitbih B_SMOTE
    undersample = False #Refers to undersampling with random undersampler
    sample_name = "UNDEFINED_SAMPLE"

Train_SVM =  True #trains the SVM Model without Gridsearch
Train_KNN = True #trains the KNN Model without Gridsearch
Train_DTC = True #trains the DTC Model without Gridsearch
Train_RF = True #trains the RF Model without Gridsearch
Train_XGB = True #trains the XGB Model without Gridsearch


In [86]:
oversampler = SMOTE()
undersampler = RandomUnderSampler()

In [87]:
if Config.oversample:
    train, train_target = oversampler.fit_resample(df_train.iloc[:,:-1], df_train.iloc[:,-1])
    Config.sample_name = "MITBIH_B_SMOTE"
    print("Sample Name:", Config.sample_name)
elif Config.undersample:
    train, train_target = undersampler.fit_resample(df_train.iloc[:,:-1], df_train.iloc[:,-1])
    Config.sample_name = "MITBIH_C_RUS"
    print("Sample Name:", Config.sample_name)
else: 
    print("Using the original mitbih dataset")
    Config.sample_name = "MITBIH_A_Original"
    print("Sample Name:", Config.sample_name)

Using the original mitbih dataset
Sample Name: MITBIH_A_Original


In [96]:
#Function to save models and classification report directly after running.
import os
import pickle

def save_model_and_report(model, report, model_filename, report_filename, model_folder="../models/ML_Models", report_folder="../reports/figures/ML_Models"):
    # Check if model file size is greater than 98MB
    model_savepath = os.path.join(model_folder, model_filename)
    if os.path.getsize(model_savepath) > 98 * 1024 * 1024:  # Check if size is greater than 98MB
        print("Model file size is too big. Changing save path...")
        model_folder = "../data/models_too_big_for_git"
        os.makedirs(model_folder, exist_ok=True)  # Create the directory if it doesn't exist
        model_savepath = os.path.join(model_folder, model_filename)

    # Save the model
    with open(model_savepath, "wb") as f:
        pickle.dump(model, f)
    print(f"The model was saved as {model_filename} in folder {model_folder}.")

    # Save the classification report
    report_savepath = os.path.join(report_folder, report_filename)
    with open(report_savepath, "w") as f:
        f.write(report)
    print(f"The classification report was saved as {report_filename} in folder {report_folder}.")



In [94]:
train.shape

(87554, 187)

# **SVM**

<font color='red'>PUT SOME NOTES / EXPLANATORY WORDS AS TEAM HERE IF NEEDED</font>

In [95]:
#Just the code for model creation, fitting and creating the report out of the predictions.
if Train_SVM == True:
    model = SVC(cache_size=500)
    model.fit(train,train_target)
    predictions = model.predict(test)
    report=classification_report(test_target, predictions, digits=4)
    print(report)
    #Calling the save_model_and_report function for the SVM Model
    model_filename = f"SVM_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"SVM_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("SVM Model withoud gridsearch is not trained")

SVM Model withoud gridsearch is not trained


# **KNN**

<font color='red'>PUT SOME NOTES / EXPLANATORY WORDS AS TEAM HERE IF NEEDED</font>

In [91]:
if Train_KNN == True:
    model = KNN(n_jobs = -1)
    model.fit(train,train_target)
    predictions = model.predict(test)
    report=classification_report(test_target, predictions, digits=4)
    print(report)
    #Calling the save_model_and_report function for the KNN Model
    model_filename = f"KNN_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"KNN_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("KNN Model without gridsearch is not trained.")

              precision    recall  f1-score   support

         0.0     0.9777    0.9946    0.9861     18118
         1.0     0.8970    0.6421    0.7484       556
         2.0     0.9395    0.9012    0.9200      1448
         3.0     0.7630    0.6358    0.6936       162
         4.0     0.9941    0.9509    0.9720      1608

    accuracy                         0.9736     21892
   macro avg     0.9143    0.8249    0.8640     21892
weighted avg     0.9727    0.9736    0.9725     21892

The model was saved as KNN_Basemodel_no_gridsearch_MITBIH_A_Original.pkl in folder ../models/ML_Models.
The classification report was saved as KNN_Basemodel_no_gridsearch_MITBIH_A_Original_classification_report.txt in folder ../reports/figures/ML_Models.


# **Decision Tree**

<font color='red'>PUT SOME NOTES / EXPLANATORY WORDS AS TEAM HERE IF NEEDED</font>

In [90]:
if Train_DTC == True:
    model = DTC()
    model.fit(train,train_target)
    predictions = model.predict(test)
    report=classification_report(test_target, predictions, digits=4)
    print(report)
    #Calling the save_model_and_report function for the Decision Tree Model
    model_filename = f"DTC_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"DTC_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("Decision Tree Model without gridsearch is not trained.")

              precision    recall  f1-score   support

         0.0     0.9754    0.9743    0.9748     18118
         1.0     0.6374    0.6259    0.6316       556
         2.0     0.8543    0.8667    0.8605      1448
         3.0     0.5690    0.6111    0.5893       162
         4.0     0.9396    0.9384    0.9390      1608

    accuracy                         0.9530     21892
   macro avg     0.7951    0.8033    0.7990     21892
weighted avg     0.9532    0.9530    0.9531     21892

The model was saved as DTC_Basemodel_no_gridsearch_MITBIH_A_Original.pkl in folder ../models/ML_Models.
The classification report was saved as DTC_Basemodel_no_gridsearch_MITBIH_A_Original_classification_report.txt in folder ../reports/figures/ML_Models.


# **Random Forest**

<font color='red'>PUT SOME NOTES / EXPLANATORY WORDS AS TEAM HERE IF NEEDED</font>

In [92]:
if Train_RF == True:
    model = RFC(n_jobs = -1)
    model.fit(train,train_target)
    predictions = model.predict(test)
    report=classification_report(test_target, predictions, digits=4)
    print(report)
    #Calling the save_model_and_report function for the Random Forest Model
    model_filename = f"RFC_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"RFC_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("Random Forest Model without gridsearch is not trained.")

              precision    recall  f1-score   support

         0.0     0.9724    0.9992    0.9857     18118
         1.0     0.9882    0.6043    0.7500       556
         2.0     0.9822    0.8778    0.9271      1448
         3.0     0.8814    0.6420    0.7429       162
         4.0     0.9961    0.9434    0.9690      1608

    accuracy                         0.9744     21892
   macro avg     0.9641    0.8133    0.8749     21892
weighted avg     0.9746    0.9744    0.9728     21892

The model was saved as RFC_Basemodel_no_gridsearch_MITBIH_A_Original.pkl in folder ../models/ML_Models.
The classification report was saved as RFC_Basemodel_no_gridsearch_MITBIH_A_Original_classification_report.txt in folder ../reports/figures/ML_Models.


# **XGBoost**

<font color='red'>PUT SOME NOTES / EXPLANATORY WORDS AS TEAM HERE IF NEEDED</font>

In [93]:
if Train_XGB == True:
    model = XGB.XGBClassifier(objective='binary:logistic')
    model.fit(train,train_target)
    predictions = model.predict(test)
    report=classification_report(test_target, predictions, digits=4)
    print(report)
    #Calling the save_model_and_report function for the XGB Model
    model_filename = f"XGB_Basemodel_no_gridsearch_{Config.sample_name}.pkl"
    report_filename = f"XGB_Basemodel_no_gridsearch_{Config.sample_name}_classification_report.txt"
    save_model_and_report(model, report, model_filename, report_filename)
else:
    print("XBG Model without gridsearch is not trained.")

              precision    recall  f1-score   support

         0.0     0.9821    0.9976    0.9898     18118
         1.0     0.9406    0.6835    0.7917       556
         2.0     0.9724    0.9240    0.9476      1448
         3.0     0.8788    0.7160    0.7891       162
         4.0     0.9924    0.9726    0.9824      1608

    accuracy                         0.9808     21892
   macro avg     0.9532    0.8587    0.9001     21892
weighted avg     0.9804    0.9808    0.9799     21892

The model was saved as XGB_Basemodel_no_gridsearch_MITBIH_A_Original.pkl in folder ../models/ML_Models.
The classification report was saved as XGB_Basemodel_no_gridsearch_MITBIH_A_Original_classification_report.txt in folder ../reports/figures/ML_Models.
