In [1]:
# Upload input files through Colab's `files.upload()`.
# The recorded output of this cell shows ptbxl_database.csv and
# scp_statements.csv being saved.
from google.colab import files
uploaded = files.upload()

Saving ptbxl_database.csv to ptbxl_database.csv
Saving scp_statements.csv to scp_statements.csv


In [2]:
# Second upload through Colab's `files.upload()`. The recorded output of this
# cell shows 01000_lr.dat, 01000_lr.hea and 01001_lr.dat being saved.
# NOTE(review): `files` was already imported in the first cell, and this
# assignment rebinds `uploaded`, so the value returned by the earlier upload
# is no longer reachable through that name.
from google.colab import files
uploaded = files.upload()

Saving 01000_lr.dat to 01000_lr.dat
Saving 01000_lr.hea to 01000_lr.hea
Saving 01001_lr.dat to 01001_lr.dat


In [5]:
#a4
import numpy as np
import pandas as pd


# ---------- Step 1: Flexible Binning Function ----------
def bin_data(data, n_bins=4, method="equal_width"):
    """
    Convert continuous numeric values into categorical bin indices.

    Parameters:
        data (array-like): Continuous numeric values.
        n_bins (int): Number of bins to divide the data into (default=4).
            Must be at least 1.
        method (str): 'equal_width' or 'equal_frequency' (default='equal_width').
            'equal_width' splits [min, max] into n_bins intervals of equal
            length; 'equal_frequency' uses quantiles of the data as bin edges.

    Returns:
        np.ndarray: Integer bin index (0..n_bins-1) for every input value.
            Empty input yields an empty array; input whose values are all
            equal yields all zeros.

    Raises:
        ValueError: If n_bins is smaller than 1 or method is not supported.

    Note:
        NaN values are not treated specially by this function.
    """
    # Validate arguments up front so a bad call fails the same way for any
    # data (previously constant data hid an invalid method, and n_bins=0
    # silently produced bin index -1 through a division by zero).
    if method not in ("equal_width", "equal_frequency"):
        raise ValueError("Invalid method. Use 'equal_width' or 'equal_frequency'.")
    if n_bins < 1:
        raise ValueError(f"n_bins must be at least 1, got {n_bins}.")

    data = np.array(data, dtype=float)
    if data.size == 0:
        # min()/max() are undefined on an empty array; nothing to bin.
        return np.zeros_like(data, dtype=int)

    min_val, max_val = data.min(), data.max()
    if min_val == max_val:
        # all values same → single category
        return np.zeros_like(data, dtype=int)

    if method == "equal_width":
        bin_width = (max_val - min_val) / n_bins
        bins = np.floor((data - min_val) / bin_width).astype(int)
        # The maximum sits exactly on the upper edge and would get index
        # n_bins; fold it into the last bin.
        bins[bins == n_bins] = n_bins - 1
    else:
        # Quantile-based binning: the inner and upper quantiles act as
        # right-closed bin edges.
        quantiles = np.linspace(0, 1, n_bins + 1)
        bin_edges = np.quantile(data, quantiles)
        bins = np.digitize(data, bin_edges[1:], right=True)

    return bins


# ---------- Step 2: Probabilities & Entropy ----------
def get_probabilities(data):
    """Return the relative frequency of each distinct value in `data`.

    The frequencies follow the order in which np.unique reports the
    distinct values.
    """
    occurrences = np.unique(data, return_counts=True)[1]
    total = occurrences.sum()
    return occurrences / total


def calculate_entropy(data):
    """Return the Shannon entropy, in bits, of the value distribution of `data`."""
    p = get_probabilities(data)
    log_p = np.log2(p)
    return -np.sum(p * log_p)


# ---------- Step 3: Information Gain ----------
def information_gain(feature, target, is_continuous=False, n_bins=4, bin_method="equal_width"):
    """
    Compute information gain of a feature with respect to the target.

    Information gain is the entropy of `target` minus the size-weighted
    average entropy of `target` within each distinct feature value.

    Parameters:
        feature (array-like): Feature values, one per sample.
        target (array-like): Target labels, aligned with `feature`.
        is_continuous (bool): If True, `feature` is first discretised with
            bin_data.
        n_bins (int): Number of bins used when `is_continuous` is True.
        bin_method (str): Binning method forwarded to bin_data.

    Returns:
        float: entropy(target) minus the weighted entropy of the per-value
            target subsets.
    """
    # Coerce to arrays: a plain list supports neither the element-wise `==`
    # comparison nor the boolean-mask indexing used below.
    target = np.asarray(target)
    if is_continuous:
        feature = bin_data(feature, n_bins=n_bins, method=bin_method)
    else:
        feature = np.asarray(feature)

    total_entropy = calculate_entropy(target)
    values, counts = np.unique(feature, return_counts=True)

    weighted_entropy = 0
    for v, count in zip(values, counts):
        # Targets of the samples whose feature equals this value.
        subset_target = target[feature == v]
        weighted_entropy += (count / len(feature)) * calculate_entropy(subset_target)

    return total_entropy - weighted_entropy


# ---------- Step 4: Root Node Selection ----------
def find_root_node(features_df, target, continuous_features=None, n_bins=4, bin_method="equal_width"):
    """
    Identify the best feature for the root node using Information Gain.

    Parameters:
        features_df (pd.DataFrame): Feature dataset (categorical/continuous).
        target (array-like): Target labels.
        continuous_features (list): List of continuous feature names; these
            columns are binned before their information gain is computed.
        n_bins (int): Number of bins for continuous features.
        bin_method (str): 'equal_width' or 'equal_frequency'.

    Returns:
        (str, dict): Best feature name and dictionary of IG values keyed by
            column name. When several columns share the highest gain, the
            one that appears first in `features_df.columns` is returned.

    Raises:
        ValueError: If `features_df` has no columns.
    """
    if len(features_df.columns) == 0:
        # Without this guard the failure surfaces later as an opaque
        # "max() arg is an empty sequence".
        raise ValueError("features_df has no feature columns to select a root node from.")

    if continuous_features is None:
        continuous_features = []

    target = np.array(target)
    ig_values = {}

    for col in features_df.columns:
        feature = np.array(features_df[col])
        is_cont = col in continuous_features
        ig_values[col] = information_gain(feature, target, is_continuous=is_cont,
                                          n_bins=n_bins, bin_method=bin_method)

    # max() keeps the first maximal key, so ties follow column order.
    root_feature = max(ig_values, key=ig_values.get)
    return root_feature, ig_values


# ---------- Step 5: Demo ----------
def demo():
    """Run root-node selection on a six-row toy dataset, once per binning method, and print the results."""
    toy_features = pd.DataFrame({
        "A1": ["Red", "Red", "Blue", "Blue", "Red", "Green"],
        "A2": [10, 20, 15, 30, 25, 35],  # continuous
        "A3": [100, 200, 300, 400, 250, 150],  # continuous
    })
    labels = ["Yes", "No", "Yes", "Yes", "No", "Yes"]
    numeric_columns = ["A2", "A3"]

    # (banner to print, bin_method to use) for each run
    runs = (
        ("\n--- Using Equal-Width Binning ---", "equal_width"),
        ("\n--- Using Equal-Frequency Binning ---", "equal_frequency"),
    )
    for banner, method_name in runs:
        print(banner)
        best, gains = find_root_node(
            toy_features,
            labels,
            continuous_features=numeric_columns,
            n_bins=3,
            bin_method=method_name,
        )
        print("Information Gain values:", gains)
        print("Selected Root Node:", best)


if __name__ == "__main__":
    demo()



--- Using Equal-Width Binning ---
Information Gain values: {'A1': np.float64(0.4591479170272448), 'A2': np.float64(0.9182958340544896), 'A3': np.float64(0.9182958340544896)}
Selected Root Node: A2

--- Using Equal-Frequency Binning ---
Information Gain values: {'A1': np.float64(0.4591479170272448), 'A2': np.float64(0.9182958340544896), 'A3': np.float64(0.9182958340544896)}
Selected Root Node: A2
