In [1]:
# Colab-only cell: relies on the google.colab package, so it will not run in a
# plain Jupyter kernel. The saved output below shows ptbxl_database.csv and
# scp_statements.csv being uploaded by this call.
# NOTE(review): `uploaded` is not referenced by any code visible in this notebook.
from google.colab import files
uploaded = files.upload()

Saving ptbxl_database.csv to ptbxl_database.csv
Saving scp_statements.csv to scp_statements.csv


In [2]:
# Second Colab upload; the saved output below shows 01000_lr.dat, 01000_lr.hea
# and 01001_lr.dat being uploaded.
# NOTE: this rebinds `uploaded`, so the value returned by the earlier upload
# cell is no longer reachable through that name.
from google.colab import files
uploaded = files.upload()

Saving 01000_lr.dat to 01000_lr.dat
Saving 01000_lr.hea to 01000_lr.hea
Saving 01001_lr.dat to 01001_lr.dat


In [6]:
#a5
import numpy as np
import pandas as pd


# ===========================
# 1. BINNING UTILITIES
# ===========================
def bin_data(data, n_bins=4, method="equal_width"):
    """
    Converts continuous numeric data into categorical bins.

    Parameters:
        data (array-like): Continuous numeric values.
        n_bins (int): Number of bins (default=4). Must be at least 1.
        method (str): 'equal_width' or 'equal_frequency'.

    Returns:
        np.ndarray: Integer bin indices in the range [0, n_bins - 1], one per
        input value. Constant input maps every value to bin 0 and empty input
        yields an empty array.

    Raises:
        ValueError: If `method` is not one of the supported names or
            `n_bins` is smaller than 1.
    """
    # Validate arguments before any early return so that a bad call is
    # reported regardless of what the data happens to look like.
    if method not in ("equal_width", "equal_frequency"):
        raise ValueError("Invalid method: choose 'equal_width' or 'equal_frequency'")
    if n_bins < 1:
        # n_bins == 0 would give a zero bin width and an index of -1 for
        # every value instead of an error.
        raise ValueError("n_bins must be a positive integer")

    data = np.asarray(data, dtype=float)
    if data.size == 0:
        # min()/max() are undefined for empty input; there is nothing to bin.
        return np.zeros(data.shape, dtype=int)

    min_val, max_val = data.min(), data.max()
    if min_val == max_val:
        # Zero range: every value belongs to the same (first) bin.
        return np.zeros_like(data, dtype=int)

    if method == "equal_width":
        bin_width = (max_val - min_val) / n_bins
        bins = np.floor((data - min_val) / bin_width).astype(int)
        # The maximum lands exactly on the upper edge; fold it into the last bin.
        bins[bins == n_bins] = n_bins - 1
    else:
        quantiles = np.linspace(0, 1, n_bins + 1)
        bin_edges = np.quantile(data, quantiles)
        # Dropping the lowest edge and using right-closed intervals puts the
        # minimum in bin 0 and the maximum in bin n_bins - 1.
        bins = np.digitize(data, bin_edges[1:], right=True)
    return bins


# ===========================
# 2. ENTROPY & INFO GAIN
# ===========================
def get_probabilities(data):
    """
    Returns the relative frequency of each distinct value in `data`.

    The frequencies follow the order of the sorted unique values produced by
    np.unique.
    """
    occurrences = np.unique(data, return_counts=True)[1]
    total = occurrences.sum()
    return occurrences / total


def calculate_entropy(data):
    """
    Returns the base-2 Shannon entropy of the value distribution in `data`.
    """
    p = get_probabilities(data)
    weighted_log = p * np.log2(p)
    return -np.sum(weighted_log)


def information_gain(feature, target, is_continuous=False, n_bins=4, bin_method="equal_width"):
    """
    Computes the information gain obtained by splitting `target` on `feature`.

    Parameters:
        feature (array-like): Feature values, one per sample.
        target (array-like): Class labels, one per sample.
        is_continuous (bool): If True, `feature` is binned with bin_data()
            before the gain is computed.
        n_bins (int): Number of bins used when `is_continuous` is True.
        bin_method (str): 'equal_width' or 'equal_frequency'.

    Returns:
        float: Entropy of `target` minus the size-weighted entropy of the
        target within each distinct feature value.

    Raises:
        ValueError: If `feature` and `target` have different lengths.
    """
    # Coerce both inputs to arrays: the boolean-mask indexing below needs an
    # element-wise `feature == v` and an ndarray `target`. With plain lists the
    # comparison collapses to a single bool and the result is silently wrong.
    feature = np.asarray(feature)
    target = np.asarray(target)
    if len(feature) != len(target):
        raise ValueError("feature and target must have the same length")

    if is_continuous:
        feature = bin_data(feature, n_bins=n_bins, method=bin_method)

    total_entropy = calculate_entropy(target)
    values, counts = np.unique(feature, return_counts=True)
    n_samples = len(feature)

    weighted_entropy = sum(
        (count / n_samples) * calculate_entropy(target[feature == v])
        for v, count in zip(values, counts)
    )

    return total_entropy - weighted_entropy


# ===========================
# 3. TREE NODE CLASS
# ===========================
class TreeNode:
    """
    A single node of the decision tree.

    Attributes:
        feature: Name of the column this node splits on (None for a leaf).
        children (dict): Maps a branch key to the child TreeNode. The key is
            the raw category for categorical splits and the integer bin index
            for continuous splits.
        prediction: Class label returned by a leaf (None for internal nodes).
        bin_edges: For a continuous split, the n_bins + 1 edges fitted on the
            training samples that reached this node; None otherwise.
        bin_method: Binning method that goes with `bin_edges`
            ('equal_width' or 'equal_frequency'); None otherwise.
        majority: Most frequent training label among the samples that reached
            this node. Used as the fallback prediction for unseen branch keys.
    """

    def __init__(self, feature=None, children=None, prediction=None,
                 bin_edges=None, bin_method=None, majority=None):
        self.feature = feature
        self.children = children if children else {}
        self.prediction = prediction
        self.bin_edges = bin_edges
        self.bin_method = bin_method
        self.majority = majority


# ===========================
# 4. TREE BUILDER
# ===========================
def _fit_bin_edges(values, n_bins, method):
    # Edges describing the binning that bin_data() applies to `values`.
    lowest, highest = values.min(), values.max()
    if method == "equal_frequency" and lowest != highest:
        return np.quantile(values, np.linspace(0, 1, n_bins + 1))
    # linspace keeps both end points exact, so edges[0] / edges[-1] are the
    # training minimum / maximum used by the equal-width formula.
    return np.linspace(lowest, highest, n_bins + 1)


def _bin_index(value, edges, method):
    # Map one raw value onto the bin index used as a child key at training time.
    value = float(value)
    if np.isnan(value):
        return None  # no bin for a missing value; the caller falls back
    lowest, highest = edges[0], edges[-1]
    n_bins = len(edges) - 1
    if lowest == highest:
        return 0  # constant training data was put in a single bin
    if method == "equal_frequency":
        position = np.digitize(value, edges[1:], right=True)
    else:
        position = np.floor((value - lowest) / ((highest - lowest) / n_bins))
    # Values outside the training range go to the nearest outer bin.
    return int(np.clip(position, 0, n_bins - 1))


def build_decision_tree(features_df, target, continuous_features=None,
                        n_bins=4, bin_method="equal_width", depth=0, max_depth=None,
                        min_info_gain=0.0):
    """
    Recursively builds a decision tree.

    Continuous features are binned separately at every node, using only the
    samples that reached that node. The fitted bin edges are stored on the
    node so that prediction can apply the same binning to new samples.

    Parameters:
        features_df (pd.DataFrame): Feature matrix.
        target (array-like): Target labels.
        continuous_features (list): Names of continuous features.
        n_bins (int): Number of bins for continuous features.
        bin_method (str): 'equal_width' or 'equal_frequency'.
        depth (int): Current depth (for recursion).
        max_depth (int): Maximum depth to stop splitting.
        min_info_gain (float): Minimum IG to split further.

    Returns:
        TreeNode: Root node of the decision tree.

    Raises:
        ValueError: If `target` is empty.
    """
    if continuous_features is None:
        continuous_features = []

    target = np.asarray(target)
    if target.size == 0:
        raise ValueError("Cannot build a decision tree from an empty target")

    # Base case: all samples share one label.
    if len(np.unique(target)) == 1:
        return TreeNode(prediction=target[0])

    majority = pd.Series(target).mode()[0]

    # Base case: nothing left to split on, or the depth limit is reached.
    if features_df.shape[1] == 0 or (max_depth is not None and depth >= max_depth):
        return TreeNode(prediction=majority)

    # Select feature with max information gain
    ig_values = {
        col: information_gain(
            np.asarray(features_df[col]), target,
            is_continuous=(col in continuous_features),
            n_bins=n_bins, bin_method=bin_method
        )
        for col in features_df.columns
    }
    best_feature = max(ig_values, key=ig_values.get)

    if ig_values[best_feature] < min_info_gain:
        return TreeNode(prediction=majority)

    node = TreeNode(feature=best_feature, majority=majority)
    feature_values = np.asarray(features_df[best_feature])

    # Apply binning if needed, remembering the edges for prediction time.
    if best_feature in continuous_features:
        raw_values = feature_values.astype(float)
        node.bin_edges = _fit_bin_edges(raw_values, n_bins, bin_method)
        node.bin_method = bin_method
        feature_values = bin_data(raw_values, n_bins=n_bins, method=bin_method)

    for val in np.unique(feature_values):
        subset_mask = feature_values == val
        subset_features = features_df.loc[subset_mask].drop(columns=[best_feature])
        subset_target = target[subset_mask]

        node.children[val] = build_decision_tree(
            subset_features, subset_target,
            continuous_features, n_bins, bin_method,
            depth + 1, max_depth, min_info_gain
        )
    return node


# ===========================
# 5. PREDICTION FUNCTIONS
# ===========================
def majority_class(node):
    """
    Returns the fallback label for `node`.

    Prefers the training majority stored on the node. For nodes without one,
    the most common prediction among the direct leaf children is returned, or
    None if no child is a leaf.
    """
    stored = getattr(node, "majority", None)
    if stored is not None:
        return stored

    counts = {}
    for child in node.children.values():
        # Compare against None explicitly: labels such as 0, False or ""
        # are valid predictions and must be counted.
        if child.prediction is not None:
            counts[child.prediction] = counts.get(child.prediction, 0) + 1
    return max(counts, key=counts.get) if counts else None


def predict_single(node, sample):
    """
    Predicts the label of one sample by walking the tree from `node`.

    Parameters:
        node (TreeNode): Root of the (sub)tree.
        sample: Mapping-like object (dict or pd.Series) indexed by feature name.

    Returns:
        The prediction of the leaf that is reached, or majority_class() of the
        last node visited when the sample's branch key has no child.
    """
    while node.prediction is None:
        feature_value = sample[node.feature]
        bin_edges = getattr(node, "bin_edges", None)
        if bin_edges is not None:
            # Children of a continuous split are keyed by bin index, so the
            # raw value has to be binned with the edges fitted at this node.
            feature_value = _bin_index(feature_value, bin_edges, node.bin_method)
        if feature_value not in node.children:
            return majority_class(node)
        node = node.children[feature_value]
    return node.prediction


def predict(tree, samples_df):
    """
    Predicts a label for every row of `samples_df`.

    Parameters:
        tree (TreeNode): Root of a tree built by build_decision_tree().
        samples_df (pd.DataFrame): One sample per row.

    Returns:
        list: Predictions in row order.
    """
    predictions = []
    for _, sample in samples_df.iterrows():
        predictions.append(predict_single(tree, sample))
    return predictions


# ===========================
# 6. TREE VISUALIZATION
# ===========================
def print_tree(node, depth=0):
    """
    Prints an indented text rendering of the tree rooted at `node`.

    Each level is indented by two spaces. Leaves show their prediction;
    internal nodes show the split feature followed by one entry per branch.
    """
    prefix = "  " * depth

    # Internal node: list the split, then recurse into every branch.
    if node.prediction is None:
        print(f"{prefix}[Split on: {node.feature}]")
        for val in node.children:
            print(f"{prefix} └─ Value = {val}")
            print_tree(node.children[val], depth + 1)
        return

    print(f"{prefix}Leaf → Predict: {node.prediction}")


# ===========================
# 7. DEMO / TEST
# ===========================
def demo():
    """
    Trains a tree on a six-row toy dataset, prints it, and classifies two
    unseen rows. "Size" is treated as a continuous feature with three
    equal-width bins.
    """
    train_features = pd.DataFrame({
        "Color": ["Red", "Red", "Blue", "Blue", "Green", "Red"],
        "Size": [10, 20, 15, 30, 25, 35],
        "Shape": ["Round", "Square", "Round", "Square", "Round", "Round"]
    })
    train_labels = ["Yes", "No", "Yes", "Yes", "No", "Yes"]

    build_options = dict(
        continuous_features=["Size"],
        n_bins=3,
        bin_method="equal_width",
        max_depth=3,
        min_info_gain=0.01,
    )
    root = build_decision_tree(train_features, train_labels, **build_options)

    print("\n=== DECISION TREE ===")
    print_tree(root)

    unseen = pd.DataFrame({
        "Color": ["Red", "Green"],
        "Size": [18, 33],
        "Shape": ["Round", "Square"]
    })
    print("\nPredictions:", predict(root, unseen))


# Run the demo only when this code executes as the main module; the saved
# output below shows that this is the case when the notebook cell is run.
if __name__ == "__main__":
    demo()



=== DECISION TREE ===
[Split on: Size]
 └─ Value = 0
  Leaf → Predict: Yes
 └─ Value = 1
  Leaf → Predict: No
 └─ Value = 2
  Leaf → Predict: Yes

Predictions: [np.str_('Yes'), np.str_('Yes')]
