In [None]:
# -------------------------------------------
# 01_load_and_explore.ipynb
# Author: Ed | Purpose: Benchmark ML models on a dataset
# -------------------------------------------

# -------------------------------------------
# 1. CONFIGURE AND LOAD THE DATASET
# -------------------------------------------

import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# -------------------------------------------
# CENTRALIZED CONFIGURATION SYSTEM
# -------------------------------------------
# This project consists of multiple Jupyter notebooks (Python code) organized in a
# pre-defined directory (folder) structure, and it uses a centralized config.py file
# for all path management.
#
# Benefits may include:
# - Single source of truth for all project paths
# - Easy to change project location (update config.py only)
# - Consistent paths across all pipeline stages (Python modules)
# - Professional project structure
# 
# The config.py file is located in the project root directory and contains
# all directory paths and file paths.
# -------------------------------------------


# Import config paths
import sys
sys.path.append('..')
from config import DATASETS_DIR, CLEAN_DATA_DIR

# Dataset parameters. Edit the values below to run this notebook on a different
# dataset; the rest of the notebook reads these names instead of repeating them.
#   DATASET_NAME      - short name that is printed after loading and used as the
#                       prefix of the exported feature-list files.
#   DATASET_PATH      - location of the data file (.csv, .xls or .xlsx), built
#                       from DATASETS_DIR imported from config.py.
#   LABEL_COLUMN      - name of the target column; it is excluded from both the
#                       numerical and the categorical feature lists.
#   FORCE_CATEGORICAL - columns to treat as categorical even if their values are numeric.
#   DROP_COLUMNS      - columns to remove from the DataFrame before assessment.

DATASET_NAME = "DryBean"
DATASET_PATH = DATASETS_DIR / "Dry_Bean_Dataset.xlsx"

LABEL_COLUMN = "Class"  # <-- change if using another dataset
# Columns listed in FORCE_CATEGORICAL are moved from the numerical list to the
# categorical list; columns listed in DROP_COLUMNS are dropped from the DataFrame.
FORCE_CATEGORICAL = []  # Optional list of column names, e.g. ['Zip', 'ProductID'].
DROP_COLUMNS = []       # Optional list of column names, e.g. ['ID'].


def load_dataset(path):
    """Load a tabular dataset from a CSV or Excel file into a DataFrame.

    The reader is chosen from the file extension, compared case-insensitively
    so that names such as ``DATA.CSV`` or ``Beans.XLSX`` are accepted.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the file. Supported extensions: .csv, .xls, .xlsx.

    Returns
    -------
    pandas.DataFrame
        The file contents as returned by ``pd.read_csv`` / ``pd.read_excel``.

    Raises
    ------
    ValueError
        If the extension is not one of the supported formats.
    """
    # Only the extension is normalized; the original `path` object is handed to
    # pandas untouched.
    suffix = Path(str(path)).suffix.lower()

    if suffix == ".csv":
        return pd.read_csv(path)
    if suffix in (".xls", ".xlsx"):
        return pd.read_excel(path)
    raise ValueError("Unsupported file format.")

# Read the configured file; every later cell works from this `df`.
df = load_dataset(DATASET_PATH)

# Quick sanity check: dataset name, (rows, columns), and the first few rows.
dataset_shape = df.shape
print(f"Loaded dataset: {DATASET_NAME}")
print(f"Shape: {dataset_shape}")
display(df.head())

In [None]:
# -------------------------------------------
# 2. DETERMINE FEATURES AND LABEL(S) TYPES
# -------------------------------------------

# Function to detect numerical and categorical columns.
# Written to allow identifying features (column names) that should be treated as categorical, 
# even if they contain only numeric values. (E.g. ZipCode, ProductID). -- Not the case with Dry Bean dataset.
def detect_column_types(df, label_column, force_categoricals=None):
    """Split the feature columns of ``df`` into numerical and categorical names.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset whose columns are classified.
    label_column : str
        Name of the target column; it is removed from both result lists.
    force_categoricals : list of str, optional
        Columns to treat as categorical even when their dtype is numeric.
        Names that are not present in ``df`` are reported and ignored.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(numericals, categoricals)``, each sorted alphabetically.
    """
    # "number" selects every numeric dtype (any integer or float width) rather
    # than only the dtypes matched by "int"/"float", so narrow or unsigned
    # columns are not silently left out of both lists.
    numericals = df.select_dtypes(include=["number"]).columns.tolist()
    categoricals = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

    # Validate and apply forced categoricals
    if force_categoricals:
        print("Validating FORCED CATEGORICAL columns...")
        for col in force_categoricals:
            if col not in df.columns:
                print(f"⚠️ Warning: '{col}' not found in dataset and will be ignored.")
                continue
            # Move the column to the categorical list without duplicating it.
            if col not in categoricals:
                categoricals.append(col)
            if col in numericals:
                numericals.remove(col)

    # The label is the prediction target, not a feature: drop it from both lists.
    numericals = [col for col in numericals if col != label_column]
    categoricals = [col for col in categoricals if col != label_column]

    return sorted(numericals), sorted(categoricals)

# Classify the feature columns once; later cells reuse both lists.
numericals, categoricals = detect_column_types(
    df, label_column=LABEL_COLUMN, force_categoricals=FORCE_CATEGORICAL
)

# Report how many columns fell into each group, and which ones.
numerical_count, categorical_count = len(numericals), len(categoricals)
print(f"Numerical columns ({numerical_count}): {numericals}")
print(f"Categorical columns ({categorical_count}): {categoricals}")

In [None]:
# -------------------------------------------
# 3. ASSESS THE DATA   
# -------------------------------------------

# Drop the columns listed in DROP_COLUMNS (set in the configuration cell),
# skipping any name that is not actually present in the DataFrame.
if DROP_COLUMNS:
    existing_to_drop = [col for col in DROP_COLUMNS if col in df.columns]
    if existing_to_drop:
        # Reassign rather than mutate in place so the statement reads as a
        # plain transformation of `df`.
        df = df.drop(columns=existing_to_drop)
        print(f"🗑️ Dropped columns: {existing_to_drop}")

        # The numerical/categorical lists were built before the drop. Remove the
        # dropped names so that df[numericals] below and the exported feature
        # lists do not reference columns that no longer exist.
        numericals = [col for col in numericals if col in df.columns]
        categoricals = [col for col in categoricals if col in df.columns]
    else:
        print(f"⚠️ None of the specified DROP_COLUMNS were found in dataset.")

# Print data types and non-null counts. df.info() writes its report straight to
# stdout and returns None, so it is called directly; wrapping it in display()
# would only add a stray "None" to the cell output.
df.info()
display(df.describe(include="all").T)

# Count missing values per column and show only the columns that have any.
nulls = df.isnull().sum()
columns_with_nulls = nulls[nulls > 0]
print("Null values by column:")
if columns_with_nulls.empty:
    # Say so explicitly instead of rendering an empty Series.
    print("No missing values found.")
else:
    display(columns_with_nulls)

# Show how the records are spread across the label classes, to reveal whether
# the classes are balanced or imbalanced.
if LABEL_COLUMN not in df.columns:
    print("⚠️ Warning: Label column not found.")
else:
    print("Label distribution (Count and Percentage):")

    label_values = df[LABEL_COLUMN]

    # Absolute counts and percentage shares, both ordered by class name
    counts = label_values.value_counts().sort_index()
    percentages = (
        label_values.value_counts(normalize=True)
        .mul(100)
        .round(2)
        .sort_index()
    )

    # Side-by-side table of both measures
    distribution_df = pd.DataFrame({'Count': counts, 'Percentage': percentages})
    display(distribution_df)

    # Cross-check: the per-class counts should add up to the number of rows
    total_records = len(df)
    counted_records = distribution_df['Count'].sum()
    print(f"Total records: {total_records:,}")
    print(f"Verification: {counted_records:,} records")


# Coefficient of variation (standard deviation divided by mean) per numerical
# feature, listed from most to least relative variability.
if numericals:
    numeric_block = df[numericals]
    cv_stats = numeric_block.std() / numeric_block.mean()

    # Turn the Series into a two-column table and order it by CV, largest first
    cv_df = (
        cv_stats.rename('Coefficient_of_Variation')
        .rename_axis('Feature')
        .reset_index()
        .sort_values('Coefficient_of_Variation', ascending=False)
    )

    print("Coefficient of Variation (CV) for numerical features:")
    print("(Higher CV = more relative variability)")
    display(cv_df)

 # Verify whether Solidity truly shows high within-class spread for Sira and Dermason
print("\nVerifying Solidity spread for Sira and Dermason:")

# Column holding Solidity and the two classes being compared.
# NOTE(review): 'S' is taken to be the Solidity column per the comment above — confirm
# against the dataset's actual column names.
solidity_column = 'S'
classes_to_compare = ['SIRA', 'DERMASON']

if LABEL_COLUMN in df.columns and solidity_column in df.columns:
    # Per-class mean and standard deviation of Solidity for the two classes.
    solidity_spread = (
        df[df[LABEL_COLUMN].isin(classes_to_compare)]
        .groupby(LABEL_COLUMN)[solidity_column]
        .agg(['mean', 'std'])
    )
    # display() is required here: a bare expression inside an if-block is not
    # rendered as cell output.
    display(solidity_spread)
else:
    print(
        f"⚠️ Warning: cannot verify Solidity spread; "
        f"'{LABEL_COLUMN}' and/or '{solidity_column}' not found in dataset."
    )

In [None]:
# -------------------------------------------
# 4. CREATE FEATURE DIAGNOSTICS VISUALS
# -------------------------------------------

# Histograms: one panel per numerical feature
numeric_features = df[numericals]
numeric_features.hist(bins=30, figsize=(16, 10))
plt.suptitle("Distribution of Numerical Features", fontsize=16)
plt.tight_layout()
plt.show()

# Correlation matrix of the numerical features, drawn as an annotated heatmap
corr = numeric_features.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    square=True,
)
plt.title("Correlation Matrix", fontsize=16)
plt.show()

# Bar chart of the number of records per label value, as one way to show class imbalance.
label_is_present = LABEL_COLUMN in df.columns
if label_is_present:
    sns.countplot(x=df[LABEL_COLUMN])
    plt.title("Target Label Distribution")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()



# Make it easy to select pairs of features to display as scatter plots.
# Comes in handy later for exploring/explaining accuracy results.
import plotly.express as px

# (x, y) feature pairs to plot; edit this list to explore other combinations.
scatter_pairs = [('P', 'CO'), ('S', 'SF2'), ('l', 'SF3')]

for x, y in scatter_pairs:
    # Skip a pair whose columns are absent instead of aborting the whole cell.
    missing = [col for col in (x, y, LABEL_COLUMN) if col not in df.columns]
    if missing:
        print(f"⚠️ Skipping scatter plot {x} vs {y}: column(s) not found: {missing}")
        continue

    # Color points by the configured label column so the cell follows LABEL_COLUMN.
    fig = px.scatter(
        df,
        x=x,
        y=y,
        color=LABEL_COLUMN,
        title=f"{y} vs {x} by {LABEL_COLUMN}",
    )
    fig.show()




# Results of later modules in this pipeline found that the most common
# bean incorrectly labeled by the models was Sira, and nearly
# 75% of these errors classified the Sira bean as a Dermason.
#
# The code below attempts to determine why.
#
# It ranks feature pairs by the strongest combination of:
#    1. Proximity in feature space (small Euclidean distance between class means)
#    2. Large within-class dispersion (large standard deviation across both bean types).
#
# This means the two classes are not only close together,
# but diffuse enough to bleed into each other's zones, a prime
# condition for misclassification.
#
# A scatter plot of the pairs identified should then show a visual
# jumble or strong overlap between Sira and Dermason dots,
# making it clear why models struggled to tell them apart.
#
# NOTE(review): distances and standard deviations are computed on the raw,
# unscaled feature values. If the features have very different magnitudes the
# ranking will be dominated by scale — consider standardizing first.

# The two classes being compared
CLASS_A, CLASS_B = 'SIRA', 'DERMASON'

# Identify all 16 features
features = ["A", "P", "L", "l", "K", "Ec", "C", "Ed", "Ex", "S", "R", "CO", "SF1", "SF2", "SF3", "SF4"]

# Work only with the features that are present, and report any that are not.
missing_features = [f for f in features if f not in df.columns]
if missing_features:
    print(f"⚠️ Warning: feature(s) not found in dataset and skipped: {missing_features}")
available_features = [f for f in features if f in df.columns]

# Subset to the two classes and compute the per-class statistics ONCE; they do
# not depend on which feature pair is being scored.
subset = df[df[LABEL_COLUMN].isin([CLASS_A, CLASS_B])]
rows_a = subset.loc[subset[LABEL_COLUMN] == CLASS_A, available_features]
rows_b = subset.loc[subset[LABEL_COLUMN] == CLASS_B, available_features]
mean_a, mean_b = rows_a.mean(), rows_b.mean()
std_a, std_b = rows_a.std(), rows_b.std()

results = []

# Score each unordered pair exactly once. (f1, f2) and (f2, f1) give the same
# distance and spread, so visiting both would fill the top-10 list with duplicates.
for position, f1 in enumerate(available_features):
    for f2 in available_features[position + 1:]:
        pair = [f1, f2]

        # Euclidean distance between the two class centroids in the (f1, f2) plane
        dist = ((mean_a[pair] - mean_b[pair]) ** 2).sum() ** 0.5

        # Sum of the standard deviations of both features in both classes
        total_sd = std_a[pair].sum() + std_b[pair].sum()

        results.append((f1, f2, dist, total_sd))

# Sort by: small centroid distance first, then high spread as the tie-breaker
ranked = sorted(results, key=lambda x: (x[2], -x[3]))


# Display the list of the top 10 pairs, showing:
#   1. Feature names
#   2. How close the class centroids are
#   3. How widely spread the data are for those features.

# Print the top 10 most confusion-prone pairs
print("Top 10 feature pairs with high Sira–Dermason overlap:\n")
if not ranked:
    print("No feature pairs available to rank.")
for i, (f1, f2, dist, sd) in enumerate(ranked[:10], start=1):
    print(f"{i}. {f1} vs {f2} | Centroid Distance: {dist:.4f} | Combined Std Dev: {sd:.4f}")





In [None]:
# -------------------------------------------
# 5. EXPORT COLUMN LISTS
# -------------------------------------------

# Write the feature-name lists to disk so the next module can apply
# feature-specific preprocessing.

# Output folder comes from config.py; create it if it is not there yet.
out_dir = CLEAN_DATA_DIR
out_dir.mkdir(parents=True, exist_ok=True)

# One file per feature group, one column name per line (no index, no header).
# Knowing the group aids later preprocessing (e.g., scale a numeric, encode a categorical).
feature_lists = (
    (numericals, f"{DATASET_NAME}_numericals.txt"),
    (categoricals, f"{DATASET_NAME}_categoricals.txt"),
)
for column_names, file_name in feature_lists:
    pd.Series(column_names).to_csv(out_dir / file_name, index=False, header=False)

print(f"Feature lists saved to: {out_dir}")

In [None]:
# -------------------------------------------
# 6. SAVE DATAFRAME FOR USE IN OTHER NOTEBOOKS
# -------------------------------------------

# Export the working dataset (DataFrame df) for downstream reuse in other notebooks.
# NOTE: Requires pyarrow or fastparquet. Install via: pip install pyarrow

# Build the file name from DATASET_NAME, matching the feature-list files written
# in section 5; with DATASET_NAME = "DryBean" this is still "DryBean_clean.parquet".
clean_parquet_path = CLEAN_DATA_DIR / f"{DATASET_NAME}_clean.parquet"

# Make sure the target folder exists even if this cell is run without section 5.
CLEAN_DATA_DIR.mkdir(parents=True, exist_ok=True)

df.to_parquet(clean_parquet_path)

print("Saved cleaned dataframe to .parquet for reuse")