In [None]:
from pathlib import Path

import kagglehub
import pandas as pd

# Download the latest version of the dataset; kagglehub returns the local
# directory that holds the downloaded files.
path = kagglehub.dataset_download("sadmansakib7/ecg-arrhythmia-classification-dataset")

print("Path to dataset files:", path)

# Load the dataset.
# Build the file path with pathlib so the separator is correct on every OS;
# a hard-coded backslash only resolves on Windows and "\M" is not a valid
# string escape sequence.
dataset_csv = Path(path) / "MIT-BIH Arrhythmia Database.csv"
df = pd.read_csv(dataset_csv)


In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# This code block covers data exploration, visualization and preprocessing.
# It reads the raw frame `df` and produces the model inputs used by later cells:
# X_train_resampled / y_train_resampled (scaled + SMOTE-balanced training set),
# X_test_scaled / y_test (scaled hold-out set) and the fitted `label_encoder`.

# Class reference (counts as noted by the notebook author for this dataset):
# N (Normal):
#   Normal heartbeats; the most common class, indicating a regular heartbeat pattern.
#   Count: 90,083 instances - the majority class.
# VEB (Ventricular Ectopic Beat):
#   Premature heartbeats originating from the ventricles.
#   Count: 7,009 instances - a minority class compared to normal beats.
# SVEB (Supraventricular Ectopic Beat):
#   Premature heartbeats originating above the ventricles, often in the atria.
#   Count: 2,779 instances - another minority class.
# F (Fusion Beat):
#   Occurs when a normal heartbeat and an ectopic beat happen at the same time.
#   Count: 803 instances - relatively rare.
# Q (Unknown/Unclassified):
#   Beats that could not be classified into the other categories.
#   Count: 15 instances - the rarest class; removed below.

# Bar chart of the raw class distribution
ax = df['type'].value_counts().plot(kind='bar')
ax.set(title='Heartbeat class distribution', xlabel='Beat type', ylabel='Number of beats')
plt.show()

# Data exploration
print(f'Dataset Shape: {df.shape}\n')
print(df.head(5))
df.info()
# describe() only renders automatically when it is the last expression of a
# cell, so print it explicitly; otherwise the summary is silently discarded.
print(df.describe())

# Columns in the dataset and the per-class sample counts
print(df.columns)
print(df['type'].value_counts())

# Build the modelling frame under a new name instead of overwriting `df`.
# `df` stays raw, so this cell can be re-run without a KeyError on the dropped
# column or re-encoding labels that were already encoded.
# 'record' is dropped because it is not used as a feature
# (presumably a recording identifier - confirm against the dataset description).
ecg_clean = df.drop(columns=['record'])

# Drop type == 'Q': not enough samples to form worthwhile predictions.
# .copy() gives an independent frame so the column assignment below does not
# write into a slice of the original data.
ecg_clean = ecg_clean[ecg_clean['type'] != 'Q'].copy()

# Encode the string class labels in 'type' as integers for the classifiers
label_encoder = LabelEncoder()
ecg_clean['type'] = label_encoder.fit_transform(ecg_clean['type'])

# Print the mapping of classes to encoded values
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

# Split the data into features (X) and target (y)
# X contains all columns except 'type', which is our target variable
# y contains only the 'type' column, encoded to numeric values
X = ecg_clean.drop('type', axis=1)
y = ecg_clean['type']

# Stratify on y so the rare classes keep the same proportion in the train and
# test sets; a purely random split can distort minority-class representation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data only, then apply the same transform to
# the test data so no test-set statistics leak into training.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to the training data only, to balance the classes;
# the test set keeps its original class distribution.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)



In [None]:
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# This code block trains and evaluates each individual baseline model.
# The fit / predict / report sequence is identical for every model, so it
# lives in one helper instead of being copy-pasted per classifier.


def fit_and_report(model, title):
    """Fit ``model`` on the resampled training set and print its test report.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on ``X_train_resampled`` / ``y_train_resampled``.
    title : str
        Heading printed immediately before the classification report.

    Returns
    -------
    The labels predicted for ``X_test_scaled``.
    """
    model.fit(X_train_resampled, y_train_resampled)
    predictions = model.predict(X_test_scaled)
    print(title)
    # Use the original class names from the label encoder for better readability in the report
    print(classification_report(y_test, predictions, target_names=label_encoder.classes_))
    return predictions


# XGBoost classifier
# eval_metric='mlogloss': use multiclass log loss as evaluation metric
# verbosity=0: suppress verbose output
# random_state=42: set seed for reproducibility
# NOTE(review): use_label_encoder is marked deprecated in XGBoost's docs and may
# be ignored by the installed version - confirm before relying on it.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', verbosity=0, random_state=42)
y_pred_xgb = fit_and_report(xgb_model, "XGBoost Classification Report:")

# Random Forest classifier
# n_estimators=100: use 100 trees in the forest
# random_state=42: set seed for reproducibility
# n_jobs=-1: use all available CPU cores
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
y_pred_rf = fit_and_report(rf_model, "Random Forest Classification Report:")

# SVM classifier with an RBF kernel for non-linear data
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
y_pred_svm = fit_and_report(svm_model, "SVM Classification Report:")

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report

# Ensemble stage: combine the three baseline models into a single voting classifier.
base_estimators = [
    ('xgb', xgb_model),
    ('rf', rf_model),
    ('svm', svm_model),
]

# voting='hard' takes the majority vote over predicted labels;
# 'soft' would average predicted probabilities instead.
voting_model = VotingClassifier(estimators=base_estimators, voting='hard')

# Fit the ensemble on the resampled training set
voting_model.fit(X_train_resampled, y_train_resampled)

# Predict on the scaled hold-out set
y_pred_voting = voting_model.predict(X_test_scaled)

# Report per-class metrics using the original class names
print("Voting Classifier Classification Report:")
voting_report = classification_report(y_test, y_pred_voting, target_names=label_encoder.classes_)
print(voting_report)