# ECG classification

## Imports

In [2]:
import sys
sys.path.append("../itershap")

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from itershap import IterSHAP

## Get ECG data

In [6]:
# Feature column labels: one "bin <start>-<start+50>" name per 50-wide step from 200 up to 1700.
ecg_columns = [f"bin {start}-{start + 50}" for start in range(200, 1700, 50)]
# print(ecg_columns)


def balance_binary_dataset(X, y):
    """Balance a binary dataset by randomly downsampling the majority class.

    The larger of the two classes (labels 0 and 1) is determined from the
    label counts; on a tie, class 0 is treated as the majority. A random
    subset of the majority class, equal in size to the minority class, is
    kept together with every minority sample.

    Args:
        X: pandas DataFrame of features, one row per sample.
        y: 1-D array-like of non-negative integer labels (0 or 1), aligned
            with the rows of ``X``.

    Returns:
        A ``(downsampled_X, downsampled_y)`` tuple. ``downsampled_X`` is a
        DataFrame with the same columns as ``X`` and a fresh 0..n-1 index;
        ``downsampled_y`` is a NumPy array. In both, the kept majority
        samples come first, followed by all minority samples.

    Raises:
        ValueError: if either class 0 or class 1 has no samples.
    """
    y = np.asarray(y)

    # minlength=2 guarantees both counts exist even if a class is absent.
    class_counts = np.bincount(y, minlength=2)
    if class_counts[0] == 0 or class_counts[1] == 0:
        raise ValueError(
            "Both classes 0 and 1 must be present to balance the dataset"
        )

    # Derive the majority class from the data instead of assuming it is 0.
    majority_class = 0 if class_counts[0] >= class_counts[1] else 1
    minority_class = 1 - majority_class

    majority_class_indices = np.flatnonzero(y == majority_class)
    minority_class_indices = np.flatnonzero(y == minority_class)

    # Randomly keep as many majority samples as there are minority samples.
    downsampled_indices = np.random.choice(
        majority_class_indices,
        size=int(class_counts[minority_class]),
        replace=False,
    )

    # Select rows positionally so features and labels stay aligned, and keep
    # the caller's column names rather than relying on a module-level list.
    selected = np.concatenate((downsampled_indices, minority_class_indices))
    downsampled_X = X.iloc[selected].reset_index(drop=True)
    downsampled_y = y[selected]

    return downsampled_X, downsampled_y


def load_ecg_data(PERC_DATA_USED):
    """Load the preprocessed AF ECG data, balance it and keep a random subset.

    The CSV at ``../data/ecg/Preprocessed_AFData.csv`` is read line by line:
    the first line is treated as a header and discarded, every other
    non-empty line is parsed as comma-separated floats where the last value
    is the label and the preceding values are the features. Labels are
    rounded to integers, the classes are balanced with
    ``balance_binary_dataset`` and a random portion of the balanced data is
    returned.

    Args:
        PERC_DATA_USED: passed to ``train_test_split`` as ``train_size``,
            i.e. the portion of the balanced dataset to keep.

    Returns:
        A ``(X, y)`` tuple: the retained feature DataFrame (columns
        ``ecg_columns``) and the matching label array.
    """
    filepath = "../data/ecg/Preprocessed_AFData.csv"
    features = []
    labels = []

    # The context manager closes the file even if a row fails to parse.
    with open(filepath) as f:
        f.readline()  # Skip the header row; column names come from ecg_columns.
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            values = [float(v) for v in line.split(',')]
            features.append(values[:-1])
            labels.append(values[-1])

    X = pd.DataFrame(np.asarray(features), columns=ecg_columns)
    # Labels are stored as floats in the file; convert them to integer classes.
    y = np.asarray([round(label) for label in labels])

    # Balance the dataset and return
    X, y = balance_binary_dataset(X, y)

    # Only the "train" portion is used; the remainder is intentionally dropped.
    X, _, y, _ = train_test_split(X, y, train_size=PERC_DATA_USED)

    return X, y

# load_ecg_data(0.01)

## Run IterSHAP

In [None]:
# Change this fraction to use more or less data
PERC_DATA_USED = 0.01

# Load data from the data folder
X, y = load_ecg_data(PERC_DATA_USED)

# Keep an independent copy of the full feature set to test model performance
# without feature selection. X.copy() is used because wrapping a DataFrame in
# pd.DataFrame(...) does not copy the underlying data by default.
X_without_fs = X.copy()

# Check the current shape of the dataset
print(X.shape)

# Create and fit IterSHAP using a RandomForestClassifier (default)
itershap_fs = IterSHAP()
itershap_fs.fit(X, y)

# Transform the input data to only include selected features and print its shape
X = itershap_fs.transform()
print(X.shape)

## Run model with and without feature selection

#### Without feature selection

In [None]:
# Baseline: train and evaluate on the full feature set (no feature selection).
X_train, X_test, y_train, y_test = train_test_split(X_without_fs, y, test_size=0.25)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# accuracy_score expects (y_true, y_pred); keep that order so the call stays
# correct if it is later swapped for a non-symmetric metric.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

#### With feature selection

In [None]:
# Train and evaluate on X, which at this point holds only the features
# returned by the IterSHAP transform step.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# accuracy_score expects (y_true, y_pred); keep that order so the call stays
# correct if it is later swapped for a non-symmetric metric.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)