In [None]:
import kagglehub
import pandas as pd

# Download latest version
path = kagglehub.dataset_download("sadmansakib7/ecg-arrhythmia-classification-dataset")

print("Path to dataset files:", path)

# load the dataset
df = pd.read_csv(path + "\MIT-BIH Arrhythmia Database.csv")     


In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# data visualization and preprocessing

# bar chart of class distribution
df['type'].value_counts().plot(kind='bar')
plt.show()

# Data exploration
print(f'Dataset Shape: {df.shape}\n')
print(df.head(5))
df.info()
df.describe()

#columns in dataset:
print(df.columns)
print(df['type'].value_counts()) 

# from the visualization, we can see that there are some classes with very few samples, and classes that don't do anything
df = df.drop(columns=['record'])

# drop type ==  Q, not enough samples to form worthwhile predictions, can't be saved with SMOTE either
df = df[df['type'] != 'Q']

# Encode the 'type' column,  because the classes are not ordinal
label_encoder = LabelEncoder()
df['type'] = label_encoder.fit_transform(df['type'])

# Print the mapping of classes to encoded values
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))


X = df.drop('type', axis=1)
y = df['type']

# Split the data into features (X) and target (y)
# X contains all columns except 'type', which is our target variable
# y contains only the 'type' column which has been encoded to numeric values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [11]:
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

# Initialize XGBoost classifier with specific parameters
# use_label_encoder=False: Avoid using the deprecated label encoder
# eval_metric='mlogloss': Use multiclass log loss as evaluation metric
# verbosity=0: Suppress verbose output
# random_state=42: Set seed for reproducibility
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', verbosity=0, random_state=42)

# Train the XGBoost model on the training data
xgb_model.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred_xgb = xgb_model.predict(X_test_scaled)

# Print the classification report
print("XGBoost Classification Report:")
# Use the original class names from the label encoder for better readability in the report
print(classification_report(y_test, y_pred_xgb, target_names=label_encoder.classes_))

XGBoost Classification Report:
              precision    recall  f1-score   support

           F       0.95      0.80      0.87       163
           N       0.99      1.00      1.00     18017
        SVEB       0.97      0.87      0.92       573
         VEB       0.97      0.97      0.97      1382

    accuracy                           0.99     20135
   macro avg       0.97      0.91      0.94     20135
weighted avg       0.99      0.99      0.99     20135



In [12]:
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest classifier with specific parameters
# n_estimators=100: Use 100 trees in the forest
# random_state=42: Set seed for reproducibility
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the Random Forest model on the training data
rf_model.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred_rf = rf_model.predict(X_test_scaled)

# Print the classification report
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_))

Random Forest Classification Report:
              precision    recall  f1-score   support

           F       0.94      0.71      0.81       163
           N       0.99      1.00      0.99     18017
        SVEB       0.98      0.80      0.88       573
         VEB       0.97      0.96      0.96      1382

    accuracy                           0.99     20135
   macro avg       0.97      0.87      0.91     20135
weighted avg       0.99      0.99      0.99     20135



In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Initialize and train the Logistic Regression model
lr_model = LogisticRegression(max_iter=2000, random_state=42)
lr_model.fit(X_train_scaled, y_train)

# Make predictions and evaluate the model
y_pred_lr = lr_model.predict(X_test_scaled)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr, target_names=label_encoder.classes_))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           F       0.68      0.22      0.33       163
           N       0.96      0.99      0.97     18017
        SVEB       0.51      0.17      0.25       573
         VEB       0.86      0.80      0.83      1382

    accuracy                           0.95     20135
   macro avg       0.75      0.54      0.60     20135
weighted avg       0.94      0.95      0.94     20135



In [14]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report

# Create a VotingClassifier
voting_model = VotingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('rf', rf_model),
        ('lr', lr_model)
    ],
    voting='hard'  # 'hard' for majority voting, 'soft' for averaging probabilities
)

# Train the ensemble model
voting_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_voting = voting_model.predict(X_test_scaled)

# Evaluate the ensemble model
print("Voting Classifier Classification Report:")
print(classification_report(y_test, y_pred_voting, target_names=label_encoder.classes_))

Voting Classifier Classification Report:
              precision    recall  f1-score   support

           F       0.94      0.72      0.82       163
           N       0.99      1.00      0.99     18017
        SVEB       0.99      0.80      0.89       573
         VEB       0.97      0.96      0.97      1382

    accuracy                           0.99     20135
   macro avg       0.97      0.87      0.92     20135
weighted avg       0.99      0.99      0.99     20135

