In [2]:
import pandas as pd
import numpy as np
from math import log2

# Function to compute entropy of a series
def entropy(series, bins=10):
    """Return the Shannon entropy, in bits, of a pandas Series.

    Missing values are dropped first. Numeric (non-boolean) data is
    discretised with ``np.histogram`` using ``bins``; any other data is
    treated as categorical and the occurrences of each distinct value are
    counted.

    Args:
        series: The pandas Series to measure.
        bins: Bin specification forwarded to ``np.histogram`` for numeric data.

    Returns:
        The entropy as a non-negative float. ``0.0`` is returned when the
        series has no non-missing values.
    """
    series = series.dropna()

    if series.empty:
        return 0.0

    # The pandas dtype helpers understand extension dtypes (category, string,
    # nullable integers), which np.issubdtype cannot interpret. Booleans are
    # excluded so they keep being counted as categories.
    dtype = series.dtype
    is_numeric = pd.api.types.is_numeric_dtype(
        dtype
    ) and not pd.api.types.is_bool_dtype(dtype)

    if is_numeric:
        # Convert explicitly so nullable numeric columns reach NumPy as floats.
        counts, _ = np.histogram(series.to_numpy(dtype=float), bins=bins)
    else:
        counts = series.value_counts().to_numpy(dtype=float)

    # Empty bins contribute nothing and would otherwise produce log2(0).
    probs = counts[counts > 0] / counts.sum()
    bits = -np.sum(probs * np.log2(probs))

    # Adding 0.0 turns the negative zero produced by a single-valued column
    # into a plain 0.0, so it no longer prints as "-0.0000".
    return float(bits) + 0.0

# Function to compute entropy for each column of dataset
def dataset_entropies(path, dataset_name):
    """Print the entropy of every column in the CSV file at ``path``.

    A header naming ``dataset_name`` is printed first, followed by one
    ``column: value`` line per column, with the value shown to four decimals.
    """
    frame = pd.read_csv(path)
    print(f"\n=== Entropies for {dataset_name} ===")

    # Lazy pairing: each column's entropy is computed right before it prints.
    per_column = ((label, entropy(frame[label])) for label in frame.columns)
    for label, value in per_column:
        print(f"{label}: {value:.4f}")

# Paths for datasets
datasets = dict(
    Advertising="../Lab1/data/advertising.csv",
    Housing="../Lab1/data/Housing.csv",
    FAA_AI="../Lab2/data/faa_ai_prelim.csv",
)

# Report the per-column entropies of each dataset, in the order listed above
for name, path in datasets.items():
    dataset_entropies(dataset_name=name, path=path)



=== Entropies for Advertising ===
TV: 3.2985
Radio: 3.2896
Newspaper: 2.7522
Sales: 3.0530

=== Entropies for Housing ===
price: 2.5584
area: 2.4166
bedrooms: 1.5784
bathrooms: 0.9448
stories: 1.6013
mainroad: 0.5876
guestroom: 0.6756
basement: 0.9345
hotwaterheating: 0.2686
airconditioning: 0.8995
parking: 1.5476
prefarea: 0.7864
furnishingstatus: 1.5573

=== Entropies for FAA_AI ===
UPDATED: 0.2243
ENTRY_DATE: 2.7897
EVENT_LCL_DATE: 3.8502
EVENT_LCL_TIME: 6.1973
LOC_CITY_NAME: 6.2787
LOC_STATE_NAME: 4.3330
LOC_CNTRY_NAME: -0.0000
RMK_TEXT: 6.3268
EVENT_TYPE_DESC: 0.9101
FSDO_DESC: 5.2054
REGIST_NBR: 6.3750
FLT_NBR: 2.5850
ACFT_OPRTR: 2.5216
ACFT_MAKE_NAME: 3.8938
ACFT_MODEL_NAME: 5.5629
ACFT_MISSING_FLAG: 0.0000
ACFT_DMG_DESC: 1.8199
FLT_ACTIVITY: 2.0382
FLT_PHASE: 1.8600
FAR_PART: -0.0000
MAX_INJ_LVL: 1.8474
FATAL_FLAG: -0.0000
FLT_CRW_INJ_NONE: 0.6985
FLT_CRW_INJ_MINOR: 0.9183
FLT_CRW_INJ_SERIOUS: -0.0000
FLT_CRW_INJ_FATAL: 1.0000
FLT_CRW_INJ_UNK: 0.0000
CBN_CRW_INJ_NONE: 0.0000
C