In [1]:
# Open Colab's upload prompt. In the recorded run this cell received the
# PTB-XL tables ptbxl_database.csv and scp_statements.csv (see the output below).
from google.colab import files
uploaded = files.upload()

Saving ptbxl_database.csv to ptbxl_database.csv
Saving scp_statements.csv to scp_statements.csv


In [2]:
# Second upload prompt; in the recorded run it received the 01000_lr / 01001_lr
# record files (.dat / .hea) listed in the output below.
# NOTE(review): this rebinds `uploaded`, so whatever the earlier upload cell
# stored under that name is no longer reachable. Nothing visible in this
# notebook reads `uploaded` -- confirm whether the variable is needed at all.
from google.colab import files
uploaded = files.upload()

Saving 01000_lr.dat to 01000_lr.dat
Saving 01000_lr.hea to 01000_lr.hea
Saving 01001_lr.dat to 01001_lr.dat


In [3]:
#2
import numpy as np
import pandas as pd


def equal_width_binning(data, n_bins=4):
    """
    Convert continuous numeric values into categorical bins using equal width binning.

    The range [min, max] of the non-missing values is split into ``n_bins``
    intervals of identical width; each value is labelled with the index of the
    interval it falls into. The maximum value is assigned to the last bin.

    Parameters:
        data (array-like): Continuous numeric values. NaN marks a missing value.
        n_bins (int): Number of bins to divide the data into (must be >= 1).

    Returns:
        np.ndarray: Integer bin labels with the same shape as ``data``.
            Non-missing values get a label in 0..n_bins-1; NaN entries get -1
            (the same "missing" code pandas uses for categorical codes).
            Empty input yields an empty integer array.

    Raises:
        ValueError: If ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        raise ValueError(f"n_bins must be at least 1, got {n_bins!r}")

    values = np.asarray(data, dtype=float)

    # Start with every position marked as missing; only non-NaN entries are
    # overwritten below. This also covers empty and all-NaN input, where there
    # is no range to compute.
    bins = np.full(values.shape, -1, dtype=int)
    valid = ~np.isnan(values)
    if not valid.any():
        return bins

    observed = values[valid]
    min_val, max_val = observed.min(), observed.max()

    # Edge case: zero range -> a single bin (avoids dividing by a zero width).
    if min_val == max_val:
        bins[valid] = 0
        return bins

    bin_width = (max_val - min_val) / n_bins
    labels = np.floor((observed - min_val) / bin_width).astype(int)

    # The maximum lands exactly on the upper edge (index n_bins); clamp it
    # into the last bin.
    bins[valid] = np.minimum(labels, n_bins - 1)
    return bins


def get_probabilities(data):
    """
    Return the relative frequency of every distinct value in ``data``.

    Parameters:
        data (array-like): Input categorical values.

    Returns:
        np.ndarray: One probability per distinct value, in the order
            ``np.unique`` reports the values (sorted); the entries sum to 1.
    """
    # Only the occurrence counts matter here; the distinct values themselves
    # are discarded.
    _, occurrences = np.unique(data, return_counts=True)
    total = occurrences.sum()
    return occurrences / total


def calculate_gini(data, is_continuous=False, n_bins=4):
    """
    Calculate Gini Index for dataset column.

    The index is ``1 - sum(p_i ** 2)`` over the relative frequencies ``p_i``
    of the distinct categories: 0 when only one category is present, and
    approaching ``1 - 1/k`` when ``k`` categories are equally frequent.

    Missing entries are dropped before anything else is computed: NaN in
    floating-point data, and None / NaN / pd.NA in object-typed data. Without
    this, NaN would corrupt the bin range of a continuous column, and an
    object column mixing strings with NaN could not be sorted by ``np.unique``.

    Parameters:
        data (array-like): Input values (categorical or continuous).
        is_continuous (bool): If True, apply equal-width binning first.
        n_bins (int): Number of bins if continuous.

    Returns:
        float: Gini Index value. 0.0 when ``data`` holds no non-missing
            values (an empty sample has no impurity to measure).
    """
    values = np.atleast_1d(np.asarray(data))

    # Boolean indexing also flattens the data, which is fine: both the binning
    # range and the category counts are computed over all values at once.
    if values.dtype.kind in "fc":
        values = values[~np.isnan(values)]
    elif values.dtype == object:
        values = values[~pd.isna(values)]

    if values.size == 0:
        return 0.0

    if is_continuous:
        values = equal_width_binning(values, n_bins=n_bins)

    probabilities = get_probabilities(values)
    return 1 - np.sum(probabilities ** 2)


def demo(csv_path="ptbxl_database.csv"):
    """
    Demonstration with toy data and dataset column.

    Prints the Gini index for a toy continuous sample, a toy categorical
    sample, and the ``age`` (binned) and ``sex`` columns of the PTB-XL
    metadata table.

    Parameters:
        csv_path (str): Location of the PTB-XL metadata CSV. Defaults to
            "ptbxl_database.csv" in the working directory. If the file does
            not exist, a hint is printed and the dataset example is skipped.
    """
    # ---------------- Example 1: Toy continuous data ----------------
    y_continuous = [2.5, 3.6, 1.2, 7.8, 6.5, 9.0, 4.3]
    gini_cont = calculate_gini(y_continuous, is_continuous=True, n_bins=4)
    print("Gini Index (toy continuous data, 4 bins):", gini_cont)

    # ---------------- Example 2: Toy categorical data ----------------
    y_categorical = ["A", "B", "A", "A", "B", "C"]
    gini_cat = calculate_gini(y_categorical, is_continuous=False)
    print("Gini Index (toy categorical data):", gini_cat)

    # ---------------- Example 3: PTB-XL dataset column ----------------
    # Only the file read is guarded: a missing upload is an expected
    # situation, whereas any other failure should surface normally.
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"{csv_path} not found. Upload the dataset to run this example.")
    else:
        gini_age = calculate_gini(df['age'], is_continuous=True, n_bins=4)
        print("Gini Index for PTB-XL age column:", gini_age)

        gini_sex = calculate_gini(df['sex'], is_continuous=False)
        print("Gini Index for PTB-XL sex column:", gini_sex)


# Run the demonstration when this code is executed directly (a notebook cell
# runs with __name__ set to "__main__"), but not when it is imported as a module.
if __name__ == "__main__":
    demo()


Gini Index (toy continuous data, 4 bins): 0.7346938775510204
Gini Index (toy categorical data): 0.6111111111111112
ptbxl_database.csv not found. Upload the dataset to run this example.
