In [1]:
# Open Colab's file-upload prompt and keep whatever it returns in `uploaded`.
# The recorded output of this cell lists ptbxl_database.csv and
# scp_statements.csv as the files saved by this run.
from google.colab import files
uploaded = files.upload()

Saving ptbxl_database.csv to ptbxl_database.csv
Saving scp_statements.csv to scp_statements.csv


In [2]:
# Second upload prompt. The recorded output of this cell lists 01000_lr.dat,
# 01000_lr.hea and 01001_lr.dat as the files saved by this run.
# NOTE(review): this rebinds `uploaded`, so the value produced by the earlier
# upload cell is no longer reachable under that name.
from google.colab import files
uploaded = files.upload()

Saving 01000_lr.dat to 01000_lr.dat
Saving 01000_lr.hea to 01000_lr.hea
Saving 01001_lr.dat to 01001_lr.dat


In [4]:
#a3
import numpy as np
import pandas as pd


# ---------- Step 1: Binning ----------
def equal_width_binning(data, n_bins=4):
    """
    Convert continuous numeric values into categorical bins using equal width binning.

    The interval [min(data), max(data)] is divided into ``n_bins`` sub-intervals
    of equal width, and every value is replaced by the zero-based index of the
    sub-interval it falls into. The maximum value is placed in the last bin.

    Parameters:
        data (array-like): Numeric values to discretise. Values are converted
            to float; NaN/inf are not handled specially.
        n_bins (int): Number of bins. Must be at least 1.

    Returns:
        np.ndarray: Integer bin indices in ``[0, n_bins - 1]`` with the same
        shape as ``data``. All zeros when every value is identical; an empty
        array when ``data`` is empty.

    Raises:
        ValueError: If ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        # A non-positive bin count would make the bin width zero or negative.
        raise ValueError(f"n_bins must be at least 1, got {n_bins!r}")

    data = np.asarray(data, dtype=float)

    if data.size == 0:  # nothing to bin; min()/max() would raise on empty input
        return np.zeros(data.shape, dtype=int)

    min_val, max_val = data.min(), data.max()

    if min_val == max_val:  # edge case: all values same
        return np.zeros(data.shape, dtype=int)

    bin_width = (max_val - min_val) / n_bins
    bins = np.floor((data - min_val) / bin_width).astype(int)
    # The maximum value lands on index n_bins; clip so every index stays
    # inside the valid range and the maximum falls into the last bin.
    return np.clip(bins, 0, n_bins - 1)


# ---------- Step 2: Probability & Entropy ----------
def get_probabilities(data):
    """
    Return the relative frequency of each distinct value in ``data``.

    The frequencies are ordered like the sorted unique values produced by
    ``np.unique``.
    """
    # Only the occurrence counts matter here; the unique values are discarded.
    _, frequencies = np.unique(data, return_counts=True)
    total = frequencies.sum()
    return frequencies / total


def calculate_entropy(data):
    """
    Compute the base-2 Shannon entropy of the values in ``data``.

    The value distribution is obtained from ``get_probabilities``; the result
    is the negated sum of ``p * log2(p)`` over that distribution.
    """
    probs = get_probabilities(data)
    weighted_logs = probs * np.log2(probs)
    return -weighted_logs.sum()


# ---------- Step 3: Information Gain ----------
def information_gain(feature, target, is_continuous=False, n_bins=4):
    """
    Compute Information Gain of a feature with respect to the target.

    Information gain is the entropy of ``target`` minus the weighted average
    entropy of ``target`` within each distinct value of ``feature``.

    Parameters:
        feature (array-like): Feature values (categorical or continuous).
            Plain lists, pandas Series and NumPy arrays are all accepted.
        target (array-like): Target labels (categorical), aligned with
            ``feature`` element by element.
        is_continuous (bool): Whether feature is continuous. If True, the
            feature is discretised with ``equal_width_binning`` first.
        n_bins (int): Number of bins if continuous.

    Returns:
        float: Information gain value.

    Raises:
        ValueError: If ``feature`` and ``target`` have different lengths.
    """
    # Convert up front: comparing a plain list with ``== v`` yields a single
    # bool instead of an element-wise mask, and a list cannot be indexed by a
    # boolean mask, so both inputs must be NumPy arrays.
    feature = np.asarray(feature)
    target = np.asarray(target)

    if len(feature) != len(target):
        raise ValueError(
            f"feature and target must have the same length, "
            f"got {len(feature)} and {len(target)}"
        )

    if is_continuous:
        feature = equal_width_binning(feature, n_bins=n_bins)

    total_entropy = calculate_entropy(target)
    values, counts = np.unique(feature, return_counts=True)
    n_samples = len(feature)

    weighted_entropy = 0
    for v, count in zip(values, counts):
        # Entropy of the target restricted to rows where the feature equals v,
        # weighted by the share of rows in that group.
        subset_target = target[feature == v]
        subset_entropy = calculate_entropy(subset_target)
        weighted_entropy += (count / n_samples) * subset_entropy

    gain = total_entropy - weighted_entropy
    return gain


# ---------- Step 4: Root Node Selection ----------
def find_root_node(features_df, target, continuous_features=None, n_bins=4):
    """
    Select the best feature for the root node using Information Gain.

    Parameters:
        features_df (pd.DataFrame): Feature dataset (categorical/continuous).
        target (array-like): Target labels.
        continuous_features (list): Names of continuous features. Columns
            listed here are binned before their gain is computed. Defaults to
            treating every column as categorical.
        n_bins (int): Number of bins for continuous features.

    Returns:
        tuple: ``(root_feature, ig_values)`` where ``root_feature`` is the name
        of the column with the highest Information Gain and ``ig_values`` is a
        dict mapping every column name to its Information Gain. When several
        columns tie for the maximum, the first one in column order is chosen.

    Raises:
        ValueError: If ``features_df`` has no columns.
    """
    if continuous_features is None:
        continuous_features = []

    if len(features_df.columns) == 0:
        # Without this guard max() below fails with an opaque
        # "arg is an empty sequence" message.
        raise ValueError("features_df has no feature columns to evaluate")

    target = np.asarray(target)
    ig_values = {}

    for col in features_df.columns:
        feature = np.asarray(features_df[col])
        is_cont = col in continuous_features
        ig_values[col] = information_gain(
            feature, target, is_continuous=is_cont, n_bins=n_bins
        )

    # Pick feature with max IG
    root_feature = max(ig_values, key=ig_values.get)
    return root_feature, ig_values


# ---------- Step 5: Demo ----------
def demo():
    """Run root-node selection on a small 14-row weather dataset and print the outcome."""
    # Two categorical columns (Outlook, Windy) and two numeric ones
    # (Temperature, Humidity); column order is kept as listed.
    weather = pd.DataFrame({
        "Outlook": ["Sunny", "Sunny", "Overcast", "Rain", "Rain", "Rain",
                    "Overcast", "Sunny", "Sunny", "Rain", "Sunny", "Overcast", "Overcast", "Rain"],
        "Temperature": [85, 80, 83, 70, 68, 65, 64, 72, 69, 75, 75, 72, 81, 71],
        "Humidity": [85, 90, 78, 96, 80, 70, 65, 95, 70, 80, 70, 90, 75, 91],
        "Windy": ["False", "True", "False", "False", "False", "True", "True",
                  "False", "False", "False", "True", "True", "False", "True"],
    })

    # Class label for each of the 14 rows above.
    labels = ["No", "No", "Yes", "Yes", "Yes", "No", "Yes",
              "No", "Yes", "Yes", "Yes", "Yes", "Yes", "No"]

    # Numeric columns are binned into 4 equal-width bins before scoring.
    numeric_columns = ["Temperature", "Humidity"]
    best_feature, gains = find_root_node(
        weather,
        labels,
        continuous_features=numeric_columns,
        n_bins=4,
    )

    print("Information Gain values:", gains)
    print("Selected Root Node Feature:", best_feature)


# Run the toy demonstration only when this code executes as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    demo()


Information Gain values: {'Outlook': np.float64(0.24674981977443933), 'Temperature': np.float64(0.13706363739659322), 'Humidity': np.float64(0.3617241393770687), 'Windy': np.float64(0.04812703040826949)}
Selected Root Node Feature: Humidity
