In [None]:
# MLP

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import classification_report, f1_score, confusion_matrix, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures
from lightgbm import LGBMClassifier
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
from datetime import datetime

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/sklearn deprecation and FutureWarnings
# (e.g. chained-assignment warnings) — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set up Google Drive and output directories
def setup_output_dirs():
    """Create a timestamped run folder with ``CSVs`` and ``Plots`` subfolders.

    Returns:
        Tuple ``(output_dir, csv_dir, plot_dir, timestamp)`` where the three
        directories live under ``/content/drive/MyDrive/ECG_Results/`` and
        ``timestamp`` is the ``YYYY-MM-DD_HH-MM-SS`` string naming the run.
    """
    run_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    run_root = os.path.join('/content/drive/MyDrive/ECG_Results/', run_stamp)

    # Build both sub-folder paths up front, then create them in one pass.
    subfolders = {name: os.path.join(run_root, name) for name in ('CSVs', 'Plots')}
    for folder in subfolders.values():
        os.makedirs(folder, exist_ok=True)

    return run_root, subfolders['CSVs'], subfolders['Plots'], run_stamp

# Load the dataset
def load_data():
    """Read the PQRST feature CSV from its fixed ``/content`` path.

    Returns:
        The file contents as a pandas DataFrame.
    """
    return pd.read_csv('/content/PQRST_Complexes_and_Features_final.csv')

# Handle missing values with domain-specific imputation
def impute_missing_values(df):
    """Fill missing values in ``df`` in place and return it.

    Numeric columns (``float64``/``int64``) are filled with the median of the
    row's ``diagnostic_single_5_superclass`` group. Rows whose group has no
    observed value, or whose label is missing, fall back to the column's
    median over the values observed before imputation. Object columns and
    ``rr_intervals`` are filled with their most frequent value; a column with
    no observed value at all is left untouched.

    Args:
        df: DataFrame containing a ``diagnostic_single_5_superclass`` column.

    Returns:
        The same DataFrame object, with gaps filled.
    """
    label_col = 'diagnostic_single_5_superclass'
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    print("Columns with NaN before imputation:")
    nan_counts = df[numerical_cols].isna().sum()
    print(nan_counts[nan_counts > 0])

    # NOTE(review): the class-conditional medians depend on the diagnosis
    # label, which this notebook also uses as the classification target, so
    # imputed values carry label information into the features.
    for col in numerical_cols:
        missing = df[col].isna()
        if not missing.any():
            continue
        # Take the fallback from observed values only, so the result does not
        # depend on the order in which classes are processed.
        overall_median = df[col].median()
        class_medians = df.groupby(label_col)[col].median()
        fill_values = df[label_col].map(class_medians).fillna(overall_median)
        df.loc[missing, col] = fill_values[missing]

    # Assign the filled column back instead of ``df[col].fillna(inplace=True)``:
    # the chained in-place form does not modify ``df`` under copy-on-write.
    for col in categorical_cols:
        if col == 'rr_intervals' or not df[col].isna().any():
            continue
        modes = df[col].mode()
        if not modes.empty:  # an all-NaN column has no mode to fill with
            df[col] = df[col].fillna(modes.iloc[0])

    if 'rr_intervals' in df.columns:
        rr_modes = df['rr_intervals'].mode()
        if not rr_modes.empty:
            df['rr_intervals'] = df['rr_intervals'].fillna(rr_modes.iloc[0])

    print("Columns with NaN after imputation:")
    nan_counts = df[numerical_cols].isna().sum()
    print(nan_counts[nan_counts > 0])

    return df

# Advanced feature engineering
def feature_engineering(df):
    """Add RR-interval statistics and derived ECG ratio features to ``df``.

    ``rr_intervals`` is parsed as a comma-separated list of floats; values
    that cannot be parsed (non-strings, malformed numbers) are treated as the
    single interval ``[1.0]``. Infinite values are converted to NaN and any NaN
    left in an engineered column is replaced by that column's median.

    Args:
        df: DataFrame holding ``rr_intervals``, the ECG measurement columns
            referenced below, and ``diagnostic_single_5_superclass``.

    Returns:
        The same DataFrame object with the engineered columns added.
    """
    def parse_rr_intervals(x):
        # Only swallow the failures a bad cell can produce: a non-string cell
        # (no ``.split``) or a token that is not a number.
        try:
            return [float(i) for i in x.split(',')]
        except (AttributeError, TypeError, ValueError):
            return [1.0]

    df['rr_intervals_list'] = df['rr_intervals'].apply(parse_rr_intervals)

    df['rr_mean'] = df['rr_intervals_list'].apply(lambda x: np.mean(x) if len(x) > 0 else 1.0)
    df['rr_std'] = df['rr_intervals_list'].apply(lambda x: np.std(x) if len(x) > 1 else 0.0)
    df['rr_min'] = df['rr_intervals_list'].apply(lambda x: np.min(x) if len(x) > 0 else 1.0)
    df['rr_max'] = df['rr_intervals_list'].apply(lambda x: np.max(x) if len(x) > 0 else 1.0)
    df['rr_range'] = df['rr_max'] - df['rr_min']
    df['rr_skew'] = df['rr_intervals_list'].apply(lambda x: skew(x) if len(x) > 2 else 0.0)
    df['rr_kurtosis'] = df['rr_intervals_list'].apply(lambda x: kurtosis(x) if len(x) > 3 else 0.0)

    # The list-valued helper column is no longer needed; drop it before the
    # frame-wide inf replacement so that only scalar columns are scanned.
    df.drop('rr_intervals_list', axis=1, inplace=True)

    # Denominators are protected against zero, either by substituting a tiny
    # value for exact zeros or by adding an epsilon to an absolute value.
    df['qtc_pr_ratio'] = df['qtc_bazett_ms'] / df['pr_interval_ms'].replace(0, 1e-6)
    df['p_qrs_interaction'] = df['p_amplitude_mv'] * df['qrs_duration_ms']
    df['r_s_ratio'] = df['r_amplitude_mv'] / (df['s_amplitude_mv'].abs() + 1e-6)
    df['heart_rate_variability'] = df['rr_std'] / (df['rr_mean'] + 1e-6)
    df['t_qrs_ratio'] = df['t_amplitude_mv'] / (df['r_amplitude_mv'].abs() + 1e-6)
    df['st_slope_proxy'] = df['st_elevation_mv'] / (df['t_duration_ms'].replace(0, 1e-6))
    # NOTE(review): same formula as ``r_s_ratio`` — the two columns are duplicates.
    df['qrs_axis_proxy'] = df['r_amplitude_mv'] / (df['s_amplitude_mv'].abs() + 1e-6)
    df['t_asymmetry'] = df['t_amplitude_mv'].abs() / (df['t_duration_ms'].replace(0, 1e-6))
    df['st_t_ratio'] = df['st_elevation_mv'] / (df['t_amplitude_mv'].abs() + 1e-6)
    df['qrs_t_angle_proxy'] = df['qrs_axis_proxy'] * df['t_amplitude_mv']
    df['st_t_asymmetry'] = df['st_elevation_mv'] / (df['t_asymmetry'] + 1e-6)
    df['qrs_morphology'] = df['qrs_duration_ms'] / (df['r_amplitude_mv'].abs() + 1e-6)
    df['qrs_s_morphology'] = df['qrs_duration_ms'] / (df['s_amplitude_mv'].abs() + 1e-6)
    # NOTE(review): this feature is computed from the per-class mean, i.e. it
    # depends on the diagnosis label that is later used as the target.
    class_mean_t = df.groupby('diagnostic_single_5_superclass')['t_amplitude_mv'].transform('mean')
    df['t_inversion_proxy'] = (df['t_amplitude_mv'] - class_mean_t).abs()
    df['pr_qrs_interaction'] = df['pr_interval_ms'] * df['qrs_duration_ms']

    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    engineered_cols = ['rr_mean', 'rr_std', 'rr_min', 'rr_max', 'rr_range', 'rr_skew', 'rr_kurtosis',
                       'qtc_pr_ratio', 'p_qrs_interaction', 'r_s_ratio', 'heart_rate_variability',
                       't_qrs_ratio', 'st_slope_proxy', 'qrs_axis_proxy', 't_asymmetry',
                       'st_t_ratio', 'qrs_t_angle_proxy', 'st_t_asymmetry', 'qrs_morphology',
                       'qrs_s_morphology', 't_inversion_proxy', 'pr_qrs_interaction']
    # Assign back rather than ``df[col].fillna(inplace=True)``: the chained
    # in-place form does not modify ``df`` under pandas copy-on-write.
    for col in engineered_cols:
        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].median())

    print("Columns with NaN after feature engineering:")
    remaining_nan = df[engineered_cols].isna().sum()
    print(remaining_nan[remaining_nan > 0])

    return df

# Prepare data for ML
def prepare_data(df):
    """Encode the target, select feature columns and scale them.

    Adds an integer ``target`` column to ``df`` (label-encoded
    ``diagnostic_single_5_superclass``), takes every column not listed in
    ``exclude_cols`` as a feature, fills any remaining NaN with the column
    median and applies ``RobustScaler``.

    Args:
        df: DataFrame containing the label column, ``strat_fold`` and the
            feature columns.

    Returns:
        Tuple ``(X_scaled, y, folds, le, feature_cols, df)``: the scaled
        feature array, the encoded target Series, the ``strat_fold`` Series,
        the fitted ``LabelEncoder``, the feature column names, and ``df``.
    """
    le = LabelEncoder()
    df['target'] = le.fit_transform(df['diagnostic_single_5_superclass'])

    exclude_cols = ['ecg_id', 'patient_id', 'filename_hr', 'scp_codes',
                    'diagnostic_superclass', 'diagnostic_single_5_superclass',
                    'rr_intervals', 'target', 'strat_fold']
    feature_cols = [col for col in df.columns if col not in exclude_cols]

    # Work on an explicit copy so the NaN fill below is a plain assignment on
    # an independent frame, not a chained in-place write on a slice of ``df``.
    X = df[feature_cols].copy()
    y = df['target']
    folds = df['strat_fold']

    nan_counts = X.isna().sum()
    nan_cols = nan_counts[nan_counts > 0]
    if len(nan_cols) > 0:
        print("Warning: NaN values found in features before scaling:")
        print(nan_cols)
        for col in nan_cols.index:
            X[col] = X[col].fillna(X[col].median())

    # NOTE(review): the scaler is fitted on all rows, including those later
    # held out as test folds — confirm this is acceptable for the evaluation.
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled, y, folds, le, feature_cols, df

# Perform 10-fold cross-validation using strat_fold
def _tee(handle, message):
    """Print ``message`` and write it, newline-terminated, to ``handle``."""
    print(message)
    handle.write(message + "\n")


def _save_confusion_heatmap(cm_df, title, path):
    """Draw ``cm_df`` as an annotated heatmap and save the figure to ``path``."""
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
    plt.title(title)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.savefig(path, dpi=300)
    plt.close()


def train_and_evaluate(X, y, folds, le, feature_cols, df, output_dir, csv_dir, plot_dir, timestamp):
    """Cross-validate a LightGBM classifier using the predefined fold ids.

    For each distinct value in ``folds`` the matching rows form the test set
    and all other rows the training set. The training set is resampled
    (SMOTETomek, then RandomUnderSampler), reduced to 15 features with RFE,
    expanded with pairwise interaction terms and used to fit the classifier.
    Per-fold and overall reports, confusion matrices, predictions, metrics and
    run metadata are written under ``output_dir``, ``csv_dir`` and ``plot_dir``.

    Args:
        X: Feature matrix, row-aligned with ``y``, ``folds`` and ``df``.
        y: Encoded target labels.
        folds: Fold id per row.
        le: Fitted ``LabelEncoder``; must know the classes ``HYP``, ``MI``,
            ``CD``, ``STTC`` and ``NORM``.
        feature_cols: Names of the columns of ``X``.
        df: Source DataFrame; must contain ``ecg_id``.
        output_dir: Directory for the text log and metadata file.
        csv_dir: Directory for CSV outputs.
        plot_dir: Directory for PNG plots.
        timestamp: Run identifier stored with the metrics and metadata.

    Returns:
        Tuple ``(f1_scores, macro_f1_scores, accuracies)`` of per-fold lists.
    """
    f1_scores = []
    macro_f1_scores = []
    accuracies = []

    # Compare against a plain array so the boolean masks are positional and
    # do not depend on index alignment between ``folds``, ``y`` and ``df``.
    fold_ids = np.asarray(folds)
    unique_folds = sorted(np.unique(fold_ids))

    # Passing the full label set to the metric functions keeps the reports
    # and matrices at a fixed shape even if a fold lacks some class.
    label_ids = np.arange(len(le.classes_))

    # Loop-invariant lookups and settings.
    minority_ids = [int(c) for c in le.transform(['HYP', 'MI', 'CD', 'STTC'])]
    norm_id = int(le.transform(['NORM'])[0])
    target_samples = 3000
    lgbm_params = dict(
        objective='multiclass',
        num_class=len(le.classes_),
        metric='multi_logloss',
        random_state=42,
        max_depth=12,
        learning_rate=0.05,
        n_estimators=500,
        num_leaves=150,
        device='gpu',
    )

    # Store actual vs predicted for all samples and overall confusion matrix
    all_predictions = []
    all_y_true = []
    all_y_pred = []

    # Metrics storage
    metrics_data = []

    with open(os.path.join(output_dir, 'lightgbm_cv_results.txt'), 'w') as f:
        for fold in unique_folds:
            _tee(f, f"\nFold {fold}:")

            test_mask = fold_ids == fold
            train_mask = ~test_mask

            X_train, X_test = X[train_mask], X[test_mask]
            y_train, y_test = y[train_mask], y[test_mask]

            # Print class distribution for debugging
            print(f"Class distribution in y_train (Fold {fold}):")
            class_counts = pd.Series(y_train).value_counts()
            for idx, count in class_counts.items():
                print(f"Class {le.classes_[idx]} (Encoded: {idx}): {count}")
            f.write(f"Class distribution in y_train (Fold {fold}):\n{class_counts}\n")

            # Oversample each minority class to the target, but never ask for
            # fewer samples than the class already has.
            sampling_strategy = {
                class_id: max(target_samples, int(class_counts.get(class_id, 0)))
                for class_id in minority_ids
            }
            _tee(f, f"SMOTE sampling strategy (Fold {fold}): {sampling_strategy}")

            # Apply SMOTE + Tomek Links + RandomUnderSampler
            smote_tomek = SMOTETomek(sampling_strategy=sampling_strategy, random_state=42)
            under = RandomUnderSampler(sampling_strategy={norm_id: 6000}, random_state=42)
            pipeline = Pipeline([('smote_tomek', smote_tomek), ('under', under)])
            X_train_res, y_train_res = pipeline.fit_resample(X_train, y_train)

            # Print resampled class distribution
            print(f"Class distribution after SMOTE+Tomek+UnderSampler (Fold {fold}):")
            resampled_counts = pd.Series(y_train_res).value_counts()
            for idx, count in resampled_counts.items():
                print(f"Class {le.classes_[idx]} (Encoded: {idx}): {count}")
            f.write(f"Class distribution after SMOTE+Tomek+UnderSampler (Fold {fold}):\n{resampled_counts}\n")

            # Apply RFE for feature selection
            rfe = RFE(LGBMClassifier(**lgbm_params), n_features_to_select=15)
            X_train_rfe = rfe.fit_transform(X_train_res, y_train_res)
            X_test_rfe = rfe.transform(X_test)
            selected_features = np.array(feature_cols)[rfe.support_].tolist()

            # Apply polynomial features
            poly = PolynomialFeatures(degree=2, interaction_only=True)
            X_train_rfe = poly.fit_transform(X_train_rfe)
            X_test_rfe = poly.transform(X_test_rfe)

            # Class weights are keyed by the actual class id, not by position,
            # so the mapping stays right if a class is absent from the fold.
            # NOTE(review): the weights come from the pre-resampling labels but
            # the model is fitted on resampled data — confirm this is intended.
            present_classes = np.unique(y_train)
            class_weights = compute_class_weight('balanced', classes=present_classes, y=y_train)
            class_weight_dict = {int(c): float(w) for c, w in zip(present_classes, class_weights)}

            # Define LightGBM with class weights
            model = LGBMClassifier(class_weight=class_weight_dict, **lgbm_params)
            model.fit(X_train_rfe, y_train_res)

            y_pred = model.predict(X_test_rfe)

            # Collect predictions for overall confusion matrix
            all_y_true.extend(y_test)
            all_y_pred.extend(y_pred)

            report = classification_report(y_test, y_pred, labels=label_ids,
                                           target_names=le.classes_, output_dict=True)
            report_text = classification_report(y_test, y_pred, labels=label_ids,
                                                target_names=le.classes_)
            _tee(f, f"Classification Report for Fold {fold}:")
            _tee(f, report_text)

            # Confusion matrix per fold
            cm = confusion_matrix(y_test, y_pred, labels=label_ids)
            cm_df = pd.DataFrame(cm, index=le.classes_, columns=le.classes_)
            cm_df.to_csv(os.path.join(csv_dir, f'confusion_matrix_fold_{fold}.csv'))
            _tee(f, f"Confusion Matrix for Fold {fold}:\n{cm}")

            # Plot confusion matrix per fold
            _save_confusion_heatmap(
                cm_df,
                f'Confusion Matrix - Fold {fold}',
                os.path.join(plot_dir, f'confusion_matrix_fold_{fold}.png'),
            )

            # Store actual vs predicted
            fold_predictions = df.loc[test_mask, ['ecg_id']].copy()
            fold_predictions['actual'] = le.inverse_transform(y_test)
            fold_predictions['predicted'] = le.inverse_transform(y_pred)
            fold_predictions['fold'] = fold
            all_predictions.append(fold_predictions)

            # Metrics
            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average='weighted')
            macro_f1 = f1_score(y_test, y_pred, average='macro')
            f1_scores.append(f1)
            macro_f1_scores.append(macro_f1)
            accuracies.append(acc)

            fold_metrics = {
                'timestamp': timestamp,
                'fold': fold,
                'accuracy': acc,
                'f1_weighted': f1,
                'macro_f1': macro_f1
            }
            for cls in le.classes_:
                if cls in report:
                    fold_metrics[f'{cls}_precision'] = report[cls]['precision']
                    fold_metrics[f'{cls}_recall'] = report[cls]['recall']
                    fold_metrics[f'{cls}_f1'] = report[cls]['f1-score']
            metrics_data.append(fold_metrics)

            _tee(f, f"Accuracy: {acc:.3f}")
            _tee(f, f"F1-weighted score: {f1:.3f}")
            _tee(f, f"Macro F1-score: {macro_f1:.3f}")

            # Feature importance (first fold only)
            if fold == 1:
                # The model was trained on the polynomial expansion, whose
                # columns are a bias term, the selected features and their
                # pairwise products. Name every column from the transformer
                # so that each importance is paired with the right feature.
                raw_importance = model.feature_importances_
                total_importance = raw_importance.sum()
                feature_importance = raw_importance / total_importance if total_importance > 0 else raw_importance
                importance_df = pd.DataFrame({
                    'feature': poly.get_feature_names_out(selected_features),
                    'importance': feature_importance
                }).sort_values(by='importance', ascending=False)
                importance_df.to_csv(os.path.join(csv_dir, 'feature_importance.csv'), index=False)

                print("Feature Importance (Top 10):")
                print(importance_df.head(10))
                f.write("Feature Importance (Top 10):\n")
                f.write(importance_df.head(10).to_string() + "\n")

                # Plot feature importance
                plt.figure(figsize=(10, 6))
                sns.barplot(x='importance', y='feature', data=importance_df.head(10))
                plt.title('Top 10 Feature Importance - Fold 1')
                plt.xlabel('Importance')
                plt.ylabel('Feature')
                plt.savefig(os.path.join(plot_dir, 'feature_importance.png'), dpi=300)
                plt.close()

        # Compute and save overall confusion matrix
        overall_cm = confusion_matrix(all_y_true, all_y_pred, labels=label_ids)
        overall_cm_df = pd.DataFrame(overall_cm, index=le.classes_, columns=le.classes_)
        overall_cm_df.to_csv(os.path.join(csv_dir, 'overall_confusion_matrix.csv'))
        _tee(f, "\nOverall Confusion Matrix:")
        _tee(f, str(overall_cm))

        # Plot overall confusion matrix
        _save_confusion_heatmap(
            overall_cm_df,
            'Overall Confusion Matrix',
            os.path.join(plot_dir, 'overall_confusion_matrix.png'),
        )

        # Save actual vs predicted
        all_predictions_df = pd.concat(all_predictions)
        all_predictions_df = df.merge(all_predictions_df, on='ecg_id', how='left')
        all_predictions_df.to_csv(os.path.join(csv_dir, 'actual_vs_predicted.csv'), index=False)

        # Save metrics
        metrics_df = pd.DataFrame(metrics_data)
        metrics_df.to_csv(os.path.join(csv_dir, 'metrics_summary.csv'), index=False)

        # Save metadata
        with open(os.path.join(output_dir, 'metadata.txt'), 'w') as meta_f:
            meta_f.write(f"Experiment Timestamp: {timestamp}\n")
            meta_f.write(f"Model: LightGBM\n")
            meta_f.write(f"Dataset Size: {len(df)}\n")
            meta_f.write(f"Number of Features: {len(feature_cols)}\n")
            meta_f.write(f"Classes: {', '.join(le.classes_)}\n")
            meta_f.write(f"Hyperparameters: max_depth=12, num_leaves=150, n_estimators=500, learning_rate=0.05\n")
            meta_f.write(f"Class Imbalance Handling: SMOTE+Tomek Links (target_samples=3000), RandomUnderSampler (NORM=6000)\n")
            meta_f.write(f"Feature Selection: RFE (n_features=15) with PolynomialFeatures (degree=2, interaction_only=True)\n")
            meta_f.write(f"Missing Value Handling: Class-specific median imputation\n")

        _tee(f, "\nSummary of 10-Fold Cross-Validation:")
        _tee(f, f"Accuracy scores: {[round(acc, 3) for acc in accuracies]}")
        _tee(f, f"Mean Accuracy: {np.mean(accuracies):.3f} (±{np.std(accuracies):.3f})")
        _tee(f, f"F1-weighted scores: {[round(score, 3) for score in f1_scores]}")
        _tee(f, f"Mean F1-weighted score: {np.mean(f1_scores):.3f} (±{np.std(f1_scores):.3f})")
        _tee(f, f"Macro F1-scores: {[round(score, 3) for score in macro_f1_scores]}")
        _tee(f, f"Mean Macro F1-score: {np.mean(macro_f1_scores):.3f} (±{np.std(macro_f1_scores):.3f})")

    return f1_scores, macro_f1_scores, accuracies

# Main pipeline
def main():
    """Run the pipeline end to end and return the per-fold scores.

    Creates the output folders, loads the CSV, imputes missing values,
    engineers features, prepares the matrices and runs the cross-validation.

    Returns:
        Tuple ``(f1_scores, macro_f1_scores, accuracies)`` as produced by
        ``train_and_evaluate``.
    """
    run_dir, csv_out, plot_out, run_stamp = setup_output_dirs()

    # Data preparation stages, applied in sequence to the loaded frame.
    frame = load_data()
    for stage in (impute_missing_values, feature_engineering):
        frame = stage(frame)
    X, y, folds, le, feature_cols, frame = prepare_data(frame)

    f1_scores, macro_f1_scores, accuracies = train_and_evaluate(
        X, y, folds, le, feature_cols, frame,
        run_dir, csv_out, plot_out, run_stamp,
    )
    return f1_scores, macro_f1_scores, accuracies

# Script entry point: mount Drive, then run the full pipeline.
if __name__ == "__main__":
    # Mount Google Drive at /content/drive, the root of the results path
    # used by setup_output_dirs.
    # NOTE(review): imported here rather than at the top of the cell,
    # presumably because google.colab only exists inside Colab — confirm.
    from google.colab import drive
    drive.mount('/content/drive')

    # Keep the per-fold score lists at module level for later inspection.
    f1_scores, macro_f1_scores, accuracies = main()

Streaming output truncated to the last 5000 lines.
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4287
[LightGBM] [Info] Number of data points in the train set: 16993, number of used features: 17
[LightGBM] [Info] Using GPU Device: NVIDIA L4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 17 dense feature groups (0.32 MB) transferred to GPU in 0.001097 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score -1.933453
[LightGBM] [Info] Start training from score -1.751675
[LightGBM] [Info] Start training from score -1.710473
[LightGBM] [Info] Start training from score -1.041042
[LightGBM] [Info] Start training from score -1.910529
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4047
[LightGBM] [Info] Number of data points in the train set: 16993, number of

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
import warnings
import os
from datetime import datetime

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation and FutureWarnings
# (e.g. chained-assignment warnings) — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set up Google Drive and output directories
def setup_output_dirs():
    """Create a timestamped run folder containing a ``CSVs`` subfolder.

    Returns:
        Tuple ``(output_dir, csv_dir, timestamp)`` where the directories live
        under ``/content/drive/MyDrive/ECG_Results/`` and ``timestamp`` is
        the ``YYYY-MM-DD_HH-MM-SS`` string naming the run.
    """
    run_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    run_root = os.path.join('/content/drive/MyDrive/ECG_Results/', run_stamp)
    csv_folder = os.path.join(run_root, 'CSVs')

    # Creating the leaf folder also creates the run folder above it.
    os.makedirs(csv_folder, exist_ok=True)

    return run_root, csv_folder, run_stamp

# Load the dataset
def load_data():
    """Read the PQRST feature CSV from its fixed ``/content`` path.

    Returns:
        The file contents as a pandas DataFrame.
    """
    return pd.read_csv('/content/PQRST_Complexes_and_Features_final.csv')

# Handle missing values with domain-specific imputation
def impute_missing_values(df):
    """Fill missing values in ``df`` in place and return it.

    Numeric columns (``float64``/``int64``) are filled with the median of the
    row's ``diagnostic_single_5_superclass`` group. Rows whose group has no
    observed value, or whose label is missing, fall back to the column's
    median over the values observed before imputation. Object columns and
    ``rr_intervals`` are filled with their most frequent value; a column with
    no observed value at all is left untouched.

    Args:
        df: DataFrame containing a ``diagnostic_single_5_superclass`` column.

    Returns:
        The same DataFrame object, with gaps filled.
    """
    label_col = 'diagnostic_single_5_superclass'
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    print("Columns with NaN before imputation:")
    nan_counts = df[numerical_cols].isna().sum()
    print(nan_counts[nan_counts > 0])

    # NOTE(review): the class-conditional medians depend on the diagnosis
    # label; if the saved dataset is later used to predict that label, the
    # imputed values carry label information into the features.
    for col in numerical_cols:
        missing = df[col].isna()
        if not missing.any():
            continue
        # Take the fallback from observed values only, so the result does not
        # depend on the order in which classes are processed.
        overall_median = df[col].median()
        class_medians = df.groupby(label_col)[col].median()
        fill_values = df[label_col].map(class_medians).fillna(overall_median)
        df.loc[missing, col] = fill_values[missing]

    # Assign the filled column back instead of ``df[col].fillna(inplace=True)``:
    # the chained in-place form does not modify ``df`` under copy-on-write.
    for col in categorical_cols:
        if col == 'rr_intervals' or not df[col].isna().any():
            continue
        modes = df[col].mode()
        if not modes.empty:  # an all-NaN column has no mode to fill with
            df[col] = df[col].fillna(modes.iloc[0])

    if 'rr_intervals' in df.columns:
        rr_modes = df['rr_intervals'].mode()
        if not rr_modes.empty:
            df['rr_intervals'] = df['rr_intervals'].fillna(rr_modes.iloc[0])

    print("Columns with NaN after imputation:")
    nan_counts = df[numerical_cols].isna().sum()
    print(nan_counts[nan_counts > 0])

    return df

# Main function to load data, impute missing values, and save the dataset
def main():
    """Impute the raw dataset and save it as a CSV in the run's ``CSVs`` folder."""
    # Only the CSV folder is needed here; the other returned values are unused.
    _, csv_dir, _ = setup_output_dirs()

    # Load the raw data and fill its gaps in one step.
    cleaned = impute_missing_values(load_data())

    # Write the imputed frame without the index column.
    output_path = os.path.join(csv_dir, 'PQRST_detection_clean_dataset.csv')
    cleaned.to_csv(output_path, index=False)
    print(f"Imputed dataset saved to: {output_path}")

# Script entry point: mount Drive, then impute and save the dataset.
if __name__ == "__main__":
    # Mount Google Drive at /content/drive, the root of the results path
    # used by setup_output_dirs.
    # NOTE(review): imported here rather than at the top of the cell,
    # presumably because google.colab only exists inside Colab — confirm.
    from google.colab import drive
    drive.mount('/content/drive')

    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Columns with NaN before imputation:
pq_interval_ms     5363
pr_interval_ms     2617
p_amplitude_mv     2515
p_duration_ms      5386
qrs_duration_ms    3270
qt_interval_ms     3767
qtc_bazett_ms      3767
st_elevation_mv    2051
t_amplitude_mv     1513
t_duration_ms      5594
mean_heart_rate       1
dtype: int64
Columns with NaN after imputation:
Series([], dtype: int64)
Imputed dataset saved to: /content/drive/MyDrive/ECG_Results/2025-05-06_13-18-14/CSVs/PQRST_detection_clean_dataset.csv
