In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, classification_report

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNN
import xgboost as XGB

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found below it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [11]:
## Download the ECG heartbeat datasets through the Kaggle API, independent of GitHub and its
## file-size limitations. The files are stored in a folder located outside of the repo.
# If this works, all file paths in all notebooks have to be built from `path_to_datasets`.
# RUN THIS CELL ONLY ONCE FOR ALL NOTEBOOKS!

from kaggle.api.kaggle_api_extended import KaggleApi

# Configure and authenticate the Kaggle API client.
# TODO: this could be extended so that an authentication prompt is shown.
api = KaggleApi()
api.authenticate()

# Kaggle coordinates of the ECG heartbeat data (original owner and dataset name).
dataset_owner = "shayanfazeli"
dataset_name = "heartbeat"
dataset_slug = dataset_owner + "/" + dataset_name

# Names of the CSV files this notebook series expects to find after the download.
expected_files = ["mitbih_test.csv", "mitbih_train.csv", "ptbdb_abnormal.csv", "ptbdb_normal.csv"]

# Download location. It must NOT be inside the current git repo, otherwise the big files get
# pushed and cause an error. Set the ECG_DATASETS_DIR environment variable to choose a folder;
# the fallback is "KAGGLE_datasets" in the current user's home directory instead of a path
# that only exists on one machine.
download_path = os.environ.get(
    "ECG_DATASETS_DIR",
    os.path.join(os.path.expanduser("~"), "KAGGLE_datasets"),
)

# Download structure: first check whether the dataset is already there, otherwise download it.
dataset_folder = os.path.join(download_path, dataset_name)
if not os.path.exists(dataset_folder):
    # Case 1: the dataset folder does not exist yet --> download and unzip everything into it.
    api.dataset_download_files(dataset_slug, path=dataset_folder, unzip=True)
    print("Datasets are downloaded and unzipped.")
else:
    # Case 2: the folder exists, but individual files might be missing.
    missing_files = [
        file_name
        for file_name in expected_files
        if not os.path.exists(os.path.join(dataset_folder, file_name))
    ]

    if missing_files:
        # At least one file is missing: download ALL files again and overwrite the old ones.
        api.dataset_download_files(dataset_slug, path=dataset_folder, unzip=True, force=True)
        print("Missing data was donwloaded and unzipped. All Datasets are now available.")
    else:
        print("All Datasets are already available.")

# Folder that contains the CSV files; used by the rest of the code to build file paths.
path_to_datasets = dataset_folder

All Datasets are already available.


In [12]:
# Print NumPy floating-point values with 4 digits of precision.
np.set_printoptions(precision=4)

In [13]:
# Load the MIT-BIH train/test CSV files from the dataset download folder (`path_to_datasets`).
# If the files are already available locally, point `path_to_datasets` at that folder instead.
df_train = pd.read_csv(os.path.join(path_to_datasets, "mitbih_train.csv"), header=None)
df_test = pd.read_csv(os.path.join(path_to_datasets, "mitbih_test.csv"), header=None)

# The files are read without a header row, so pandas names the columns 0, 1, 2, ...;
# column 187 holds the target, every other column is a feature.
LABEL_COLUMN = 187

# Split target and features.
train_target = df_train[LABEL_COLUMN]
test_target = df_test[LABEL_COLUMN]
train = df_train.drop(columns=LABEL_COLUMN)
test = df_test.drop(columns=LABEL_COLUMN)

In [4]:
class Config:
    """Switches that select how the training set is resampled.

    The resampling cell checks `oversample` first and only looks at
    `undersample` when `oversample` is False. With both flags False the
    training data is left as loaded.
    """

    undersample: bool = False
    oversample: bool = True

In [5]:
# Resamplers used by the resampling cell (which one is applied depends on Config).
# Both draw random samples, so random_state is fixed to make the resampled training set,
# and every model score computed from it, reproducible across kernel restarts.
oversampler = SMOTE(random_state=42)
undersampler = RandomUnderSampler(random_state=42)

In [6]:
# Choose the resampler requested in Config; oversampling takes precedence when both flags are set.
if Config.oversample:
    resampler = oversampler
elif Config.undersample:
    resampler = undersampler
else:
    resampler = None

if resampler is not None:
    # Features are all columns but the last one of df_train, the target is the last column.
    train, train_target = resampler.fit_resample(df_train.iloc[:, :-1], df_train.iloc[:, -1])
else:
    # Neither flag set: `train` / `train_target` are left untouched.
    print("Using the original mitbih dataset")

In [7]:
# Sanity check: (rows, columns) of the training features after the resampling step.
train.shape

(362355, 187)

# **SVM**

In [8]:
# Support vector classifier with default hyperparameters, except for a kernel cache of 500
# (scikit-learn documents cache_size in MB).
model = SVC(cache_size=500)

In [9]:
# Train the SVC on the training features and labels (resampled earlier if a Config flag is set).
model.fit(train,train_target)

In [10]:
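# Training-set accuracy is intentionally not computed for the SVC.
# NOTE(review): presumably disabled because scoring the SVC on the full training set is slow - confirm.
#model.score(train,train_target)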

In [11]:
# Predict a class label for every row of `test` (the features loaded from mitbih_test.csv).
predictions = model.predict(test)

In [12]:
# Per-class precision, recall and F1 of the current predictions against the test labels,
# printed with 4 decimal places.
report = classification_report(y_true=test_target, y_pred=predictions, digits=4)
print(report)

              precision    recall  f1-score   support

         0.0     0.9912    0.9367    0.9632     18118
         1.0     0.4225    0.8237    0.5585       556
         2.0     0.8861    0.9351    0.9099      1448
         3.0     0.2780    0.9198    0.4269       162
         4.0     0.9692    0.9770    0.9731      1608

    accuracy                         0.9366     21892
   macro avg     0.7094    0.9185    0.7663     21892
weighted avg     0.9629    0.9366    0.9462     21892



# **KNN**

In [13]:
# k-nearest-neighbours classifier with default settings; n_jobs=-1 lets scikit-learn use all
# available CPU cores for the neighbour search.
model = KNN(n_jobs = -1)

In [14]:
# Train the KNN classifier on the training features and labels.
model.fit(train,train_target)

In [15]:
# Mean accuracy on the training data itself (not on the test set).
model.score(train,train_target)

0.993970001793821

In [16]:
# Predict a class label for every row of `test` with the fitted KNN classifier.
predictions = model.predict(test)

In [17]:
# Per-class precision, recall and F1 of the current predictions against the test labels,
# printed with 4 decimal places.
report = classification_report(y_true=test_target, y_pred=predictions, digits=4)
print(report)

              precision    recall  f1-score   support

         0.0     0.9925    0.9593    0.9756     18118
         1.0     0.5067    0.8165    0.6253       556
         2.0     0.8704    0.9461    0.9067      1448
         3.0     0.4892    0.8395    0.6182       162
         4.0     0.9614    0.9751    0.9682      1608

    accuracy                         0.9551     21892
   macro avg     0.7640    0.9073    0.8188     21892
weighted avg     0.9660    0.9551    0.9590     21892



# **Decision Tree**

In [18]:
# Decision tree with default hyperparameters. scikit-learn permutes the candidate features
# randomly at each split, so random_state is fixed to get the same tree (and the same report)
# every time the notebook is re-run.
model = DTC(random_state=42)

In [19]:
# Train the decision tree on the training features and labels.
model.fit(train,train_target)

In [20]:
# Predict a class label for every row of `test` with the fitted decision tree.
predictions = model.predict(test)

In [21]:
# Per-class precision, recall and F1 of the current predictions against the test labels,
# printed with 4 decimal places.
report = classification_report(y_true=test_target, y_pred=predictions, digits=4)
print(report)

              precision    recall  f1-score   support

         0.0     0.9821    0.9516    0.9666     18118
         1.0     0.4743    0.7140    0.5700       556
         2.0     0.8030    0.8978    0.8477      1448
         3.0     0.4721    0.6790    0.5570       162
         4.0     0.9314    0.9546    0.9429      1608

    accuracy                         0.9402     21892
   macro avg     0.7326    0.8394    0.7768     21892
weighted avg     0.9499    0.9402    0.9439     21892



# **Random Forest**

In [22]:
# Random forest with default hyperparameters, trained on all available CPU cores (n_jobs=-1).
# Bootstrapping and per-split feature sampling are random, so random_state is fixed to make
# the fitted forest and the reported metrics reproducible.
model = RFC(n_jobs=-1, random_state=42)

In [23]:
# Train the random forest on the training features and labels.
model.fit(train,train_target)

In [24]:
# Mean accuracy on the training data itself (not on the test set).
model.score(train,train_target)

1.0

In [25]:
# Predict a class label for every row of `test` with the fitted random forest.
predictions = model.predict(test)

In [26]:
# Per-class precision, recall and F1 of the current predictions against the test labels,
# printed with 4 decimal places.
report = classification_report(y_true=test_target, y_pred=predictions, digits=4)
print(report)

              precision    recall  f1-score   support

         0.0     0.9860    0.9939    0.9899     18118
         1.0     0.8707    0.7752    0.8202       556
         2.0     0.9652    0.9385    0.9517      1448
         3.0     0.8079    0.7531    0.7796       162
         4.0     0.9892    0.9695    0.9793      1608

    accuracy                         0.9811     21892
   macro avg     0.9238    0.8860    0.9041     21892
weighted avg     0.9806    0.9811    0.9808     21892



# **XGBoost**

In [27]:
# Gradient-boosted trees for the heartbeat classes. The target has five classes (0-4), so a
# multi-class objective is requested explicitly; 'binary:logistic' describes a two-class
# problem and does not match this data.
model = XGB.XGBClassifier(objective='multi:softprob')

In [28]:
# Train the XGBoost classifier on the training features and labels.
model.fit(train,train_target)

In [29]:
# Mean accuracy on the training data itself (not on the test set).
model.score(train,train_target)

0.9952725917953388

In [30]:
# Predict a class label for every row of `test` with the fitted XGBoost classifier.
predictions = model.predict(test)

In [31]:
# Per-class precision, recall and F1 of the current predictions against the test labels,
# printed with 4 decimal places.
report = classification_report(y_true=test_target, y_pred=predictions, digits=4)
print(report)

              precision    recall  f1-score   support

         0.0     0.9894    0.9758    0.9825     18118
         1.0     0.6134    0.8076    0.6972       556
         2.0     0.9318    0.9434    0.9375      1448
         3.0     0.5877    0.8272    0.6872       162
         4.0     0.9850    0.9782    0.9816      1608

    accuracy                         0.9684     21892
   macro avg     0.8214    0.9064    0.8572     21892
weighted avg     0.9727    0.9684    0.9700     21892

