Develop and save to an Excel file various metrics about a dataset.

Two sets of metrics are defined:

1. Various measures that are not included in Python's problexity package (e.g., kurtosis and skewness).
2. The 22 Lorena et al. complexity measures produced by Python's problexity.



In [None]:
# -------------------------------------------
# 1. LOAD LIBRARIES AND CONFIG
# -------------------------------------------


# Make the project-level `config` module importable from this notebook by
# putting its folder at the front of the module search path.
import sys
sys.path.insert(0, "C:/pathnaame")
import config
import importlib
importlib.reload(config)    # Re-execute config so edits made since the first import take effect

The two code cells that follow generate and record measures for each dataset, and then create and save an Excel file with the results, one row for each dataset.

The first module defines the functions to develop individual dataset measures that are not the dataset complexity measures defined by Lorena et al. and generated by problexity.  This first module defines a wrapper function that is then called to generate all of the individual non-Lorena et al. dataset measures.

The second module runs the wrapper function and a problexity code block inside a loop that reads all datasets in the identified path. The problexity module calculates the 22 Lorena et al. complexity measures (plus a combined complexity "score"). The module creates and saves an Excel file of the results, one row for each dataset.  Each Excel worksheet column is one of the dataset measures.

In [None]:
# -------------------------------------------
# 2. DEFINE FUNCTIONS TO DEVELOP METRICS OF A DATASET
# -------------------------------------------

# The functions below define individual dataset metrics (metadata) that are not generated by problexity.
# These functions then are called from the wrapper function defined at the bottom of this section.

import pandas as pd


def get_shape_info(df: pd.DataFrame) -> dict:
    """Report the dimensions of the dataset.

    The column count is taken from the full frame, so the target column is
    counted as one of the "features".
    """
    row_count, column_count = df.shape
    return dict(n_samples=row_count, n_features=column_count)

def get_feature_types(df: pd.DataFrame, target_col: str) -> dict:
    """Split the non-target columns into numeric and non-numeric groups.

    Returns the size of each group together with the column names, so callers
    can reuse the lists.  The target is dropped strictly, so a missing
    ``target_col`` raises ``KeyError``.
    """
    features = df.drop(columns=[target_col])
    numeric_names = features.select_dtypes(include="number").columns.tolist()
    other_names = features.select_dtypes(exclude="number").columns.tolist()

    summary = {}
    summary["n_numeric_features"] = len(numeric_names)
    summary["n_categorical_features"] = len(other_names)
    # The name lists are kept for downstream convenience.
    summary["numeric_columns"] = numeric_names
    summary["categorical_columns"] = other_names
    return summary

def get_class_imbalance(df: pd.DataFrame, target_col: str) -> dict:
    """Summarise how unevenly the target classes are populated.

    Returns the per-class row counts (missing targets count as their own
    class) and the ratio of the largest class to the smallest.  With fewer
    than two classes the ratio is reported as infinity.
    """
    class_sizes = df[target_col].value_counts(dropna=False)
    if len(class_sizes) >= 2:
        ratio = class_sizes.max() / class_sizes.min()
    else:
        ratio = float("inf")
    return {"class_counts": class_sizes.to_dict(), "class_imbalance": float(ratio)}


# variance_near_zero_count function

import numpy as np
import pandas as pd

def variance_near_zero_count(df: pd.DataFrame, eps: float = 1e-12, target_col: str = "target") -> dict:
    """Count numeric features whose variance is (almost) zero.

    Parameters
    ----------
    df : full dataset, features plus target.
    eps : a feature is flagged when its population variance is <= ``eps``.
    target_col : name of the target column to exclude.  Defaults to
        ``"target"`` so existing callers that omit it behave as before; a
        missing column is ignored rather than raising.

    Returns
    -------
    dict with ``variance_near_zero_count`` (int) and ``near_zero_columns``
    (list of the flagged column names).
    """
    # Exclude target; operate on numeric features only
    X_num = df.drop(columns=[target_col], errors="ignore").select_dtypes(include="number")
    if X_num.shape[1] == 0:
        return {"variance_near_zero_count": 0, "near_zero_columns": []}

    # Population variance (ddof=0); an all-NaN column yields NaN, which is
    # treated as zero variance so that it is flagged too.
    variances = X_num.var(axis=0, ddof=0).fillna(0.0)
    near_zero_cols = variances.index[variances <= eps].tolist()

    return {
        "variance_near_zero_count": len(near_zero_cols),
        "near_zero_columns": near_zero_cols,
    }


# Unique Categorical Values function: 
# Find the categorical feature with the highest number of unique values 

def max_unique_categorical_values(df: pd.DataFrame, target_col: str) -> dict:
    """Find the categorical feature with the most distinct values.

    Only ``object`` / ``category`` columns other than the target are
    considered.  Returns the highest cardinality and the name of the column
    that has it; with no categorical features the result is ``0`` / ``None``.
    """
    categorical = df.drop(columns=[target_col], errors="ignore").select_dtypes(
        include=["object", "category"]
    )
    if categorical.shape[1] == 0:
        return {"max_unique_values": 0, "max_unique_column": None}

    # Distinct (non-null) values per categorical column.
    cardinalities = categorical.nunique()
    return {
        "max_unique_values": cardinalities.max(),
        "max_unique_column": cardinalities.idxmax(),
    }


# Sparsity Percent:  Measure the fraction of zero entries in the
# numeric features of the dataset, expressed as a percentage.

def sparsity_percent(df: pd.DataFrame, target_col: str) -> float:
    """Return the percentage of numeric feature cells that are exactly zero.

    The target column is excluded and non-numeric columns are ignored.
    Returns ``0.0`` when there are no numeric cells at all -- either no
    numeric columns or no rows -- instead of dividing by zero.
    """
    # Exclude target; operate on numeric features only
    X_num = df.drop(columns=[target_col], errors="ignore").select_dtypes(include="number")

    total_entries = X_num.size
    if total_entries == 0:
        return 0.0

    zero_entries = (X_num == 0).sum().sum()
    # Convert to a built-in float so the annotated return type holds.
    return float(zero_entries / total_entries * 100)


# A revised Sparsity Percent:  Measure the fraction of zero entries in the
# numeric features of the dataset, but then adjust for the 
# categorical features, which are 100% full of data.

def adjusted_sparsity_percent(df: pd.DataFrame, target_col: str) -> float:
    """Sparsity percentage with categorical cells added to the denominator.

    Zeros are counted in the numeric features only; the total cell count
    covers both the numeric and the ``object`` / ``category`` features.
    The result is rounded to four decimals; ``0.0`` when there are no cells.
    """
    features = df.drop(columns=[target_col], errors="ignore")
    numeric_part = features.select_dtypes(include="number")
    categorical_part = features.select_dtypes(include=["object", "category"])

    # Denominator: every numeric cell plus every categorical cell.
    cat_rows, cat_cols = categorical_part.shape
    denominator = numeric_part.size + cat_rows * cat_cols
    if denominator == 0:
        return 0.0

    zero_cells = (numeric_part == 0).sum().sum()
    return round(zero_cells / denominator * 100, 4)



#  n_onehot_features:  Determine the number of expanded columns 
#  that would be created by one-hot. To calculate this:  For each categorical 
#  feature: (a) Count the number of unique values (cardinality), then
#  (b) Sum these counts across all categorical features.


def n_onehot_features(df: pd.DataFrame, target_col: str) -> int:
    """Number of columns one-hot encoding would create.

    This is the sum of the distinct-value counts of every ``object`` /
    ``category`` feature (target excluded); ``0`` when there are none.
    """
    categorical = df.drop(columns=[target_col], errors="ignore").select_dtypes(
        include=["object", "category"]
    )
    if not categorical.shape[1]:
        return 0

    per_column_cardinality = categorical.nunique()
    return per_column_cardinality.sum()


#  Total features after encoding:  Determine total number of 
#  features there would be after encoding all categorical features.  
#  Determined as sum of Number of Numerical Features and n-onehot_features.

def total_features_after_encoding(df: pd.DataFrame, target_col: str) -> int:
    """Feature count after one-hot encoding every categorical feature.

    Adds the number of numeric features (target excluded) to the number of
    one-hot columns reported by ``n_onehot_features``.
    """
    features = df.drop(columns=[target_col], errors="ignore")
    numeric_count = features.select_dtypes(include="number").shape[1]
    return numeric_count + n_onehot_features(df, target_col=target_col)


#  feature_cardinality_stats:  Determine the min/median/max 
#  unique values per (original) categorical feature. Store as a string 
#  in this field (e.g., feature_cardinality_stats = "min=2, median=8, max=30000")

def feature_cardinality_stats(df: pd.DataFrame, target_col: str) -> str:
    """Describe categorical-feature cardinality as a ``min/median/max`` string.

    Cardinality is the number of distinct values of each ``object`` /
    ``category`` feature (target excluded).  The median is truncated to an
    integer.  With no categorical features all three values are reported as 0.
    """
    categorical = df.drop(columns=[target_col], errors="ignore").select_dtypes(
        include=["object", "category"]
    )
    if categorical.shape[1] == 0:
        return "min=0, median=0, max=0"

    cardinalities = categorical.nunique()
    min_unique, max_unique = cardinalities.min(), cardinalities.max()
    median_unique = int(cardinalities.median())
    return f"min={min_unique}, median={median_unique}, max={max_unique}"


#  Number of Categorical Features with more than n (e.g., 50)
#  unique values: Determine how many of the categorical features
#  have more than `threshold` unique values.
def n_categorical_features_gt50(df: pd.DataFrame, target_col: str, threshold: int = 50) -> int:
    """Count the categorical features with more than ``threshold`` distinct values.

    Only ``object`` / ``category`` columns other than the target are
    inspected; returns ``0`` when there are none.
    """
    categorical = df.drop(columns=[target_col], errors="ignore").select_dtypes(
        include=["object", "category"]
    )
    if not categorical.shape[1]:
        return 0

    exceeds_threshold = categorical.nunique() > threshold
    return int(exceeds_threshold.sum())


# Mean Absolute Correlation: Calculate the mean of the absolute Pearson
# correlations among numeric features (excluding self-correlations).

def mean_abs_corr(df: pd.DataFrame, target_col: str) -> float:
    """Inverted mean absolute Pearson correlation between numeric features.

    Zero-variance columns are removed first.  The mean is taken over the
    off-diagonal entries of the absolute correlation matrix (NaNs ignored)
    and returned as ``1 / (mean + 1e-6)``, so weaker correlation gives a
    larger value.  Returns ``0.0`` when fewer than two usable numeric
    features remain.
    """
    numeric = df.drop(columns=[target_col], errors="ignore").select_dtypes(include="number")
    has_spread = numeric.var(axis=0, ddof=0) > 0
    numeric = numeric.loc[:, has_spread]

    n_cols = numeric.shape[1]
    if n_cols < 2:
        return 0.0  # a correlation needs at least two features

    abs_corr = numeric.corr().abs().values
    # Self-correlations sit on the diagonal; keep everything else.
    off_diagonal = abs_corr[~np.eye(n_cols, dtype=bool)]
    return 1 / (np.nanmean(off_diagonal) + 1e-6)


# Calculate the share of redundant features

def correlation_redundancy(df: pd.DataFrame, target_col: str, threshold: float = 0.95) -> float:
    """Inverted share of highly correlated numeric feature pairs.

    After dropping zero-variance columns, the share is the fraction of
    off-diagonal absolute Pearson correlations that are >= ``threshold``.
    The value returned is ``1 / (share + 1e-6)``, so less redundancy gives a
    larger number.  Returns ``0.0`` when fewer than two usable numeric
    features remain.
    """
    numeric = df.drop(columns=[target_col], errors="ignore").select_dtypes(include="number")
    numeric = numeric.loc[:, numeric.var(axis=0, ddof=0) > 0]

    n_cols = numeric.shape[1]
    if n_cols < 2:
        return 0.0  # a correlation needs at least two features

    # Off-diagonal entries only; the diagonal holds the self-correlations.
    off_diagonal = numeric.corr().abs().values[~np.eye(n_cols, dtype=bool)]

    pair_count = len(off_diagonal)
    redundant_count = np.sum(off_diagonal >= threshold)
    if pair_count > 0:
        share = float(redundant_count) / pair_count
    else:
        share = 0.0
    return 1 / (share + 1e-6)    # inverted: higher means more complex



# Calculate the mean mutual information of the top-10 features with the label
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

def mi_top10_mean(df, target_col: str, task_type: str = "classification") -> float:
    """Mean mutual information between the target and its ten best numeric features.

    Zero-variance numeric columns are dropped first.  ``task_type`` selects
    the estimator: ``"classification"`` uses ``mutual_info_classif``, any
    other value uses ``mutual_info_regression``.  Returns ``0.0`` when no
    usable numeric feature remains.
    """
    labels = df[target_col]
    numeric = df.drop(columns=[target_col], errors="ignore").select_dtypes(include="number")
    numeric = numeric.loc[:, numeric.var(axis=0, ddof=0) > 0]
    if numeric.shape[1] == 0:
        return 0.0

    if task_type != "classification":
        scores = mutual_info_regression(numeric, labels)
    else:
        scores = mutual_info_classif(numeric, labels, discrete_features="auto")

    # Ten largest scores, highest first (fewer if there are fewer features).
    best = sorted(scores, reverse=True)[:10]
    return float(np.mean(best)) if best else 0.0


# Calculate the mean mutual information score for all numeric feature pairs (excluding the target)
# Must use mutual_info_regression for this function b/c continuous numeric features.
from sklearn.feature_selection import mutual_info_regression
import numpy as np

def mean_mi_score(df, target_col: str) -> float:
    """Mean pairwise mutual information between numeric features.

    Every unordered pair of numeric features (target excluded) is scored
    with ``mutual_info_regression`` and the scores are averaged.  Returns
    ``0.0`` when there are fewer than two numeric features.
    """
    numeric = df.drop(columns=[target_col], errors="ignore").select_dtypes(include="number")
    if numeric.shape[1] < 2:
        return 0.0

    names = list(numeric.columns)
    # Each column is paired once with every column to its right.
    pair_scores = [
        float(mutual_info_regression(numeric[[first]], numeric[second])[0])
        for position, first in enumerate(names)
        for second in names[position + 1:]
    ]
    return float(np.mean(pair_scores)) if pair_scores else 0.0



# Calculate the mean overall class-wise mutual information score for all features
# Must use mutual_info_classif for this function b/c of categorical features.
# Drop rows with NaNs to avoid errors.

def mean_mutual_info(df: pd.DataFrame, target_col: str) -> float:
    from sklearn.feature_selection import mutual_info_classif
    from sklearn.preprocessing import OrdinalEncoder

    # Separate features and target
    X = df.drop(columns=[target_col], errors="ignore")
    y = df[target_col]

    # Encode categorical features
    X_encoded = X.copy()
    cat_cols = X.select_dtypes(include=["object", "category"]).columns
    if len(cat_cols) > 0:
        X_encoded[cat_cols] = OrdinalEncoder().fit_transform(X[cat_cols])

    # Drop rows with any NaNs
    valid_mask = X_encoded.notna().all(axis=1) & y.notna()
    X_encoded = X_encoded[valid_mask]
    y = y[valid_mask]

    # Safety check: if no rows remain, return 0.0
    if X_encoded.shape[0] == 0 or X_encoded.shape[1] == 0:
        return 0.0

    # Compute mutual information
    mi = mutual_info_classif(X_encoded, y, discrete_features='auto')
    mean_mi = float(np.mean(mi))
    return round(1 / (mean_mi + 1e-6), 4)   # Inverted for complexity (higher is more complex)



# Calculate the mean class-wise mutual information score for the top 5 features
# Drop NaNs to avoid errors.

def top5_mean_mutual_info(df: pd.DataFrame, target_col: str) -> float:
    from sklearn.feature_selection import mutual_info_classif
    from sklearn.preprocessing import OrdinalEncoder

    # Separate features and target
    X = df.drop(columns=[target_col], errors="ignore")
    y = df[target_col]

    # Encode categorical features
    X_encoded = X.copy()
    cat_cols = X.select_dtypes(include=["object", "category"]).columns
    if len(cat_cols) > 0:
        X_encoded[cat_cols] = OrdinalEncoder().fit_transform(X[cat_cols])

    # Drop rows with any NaNs
    valid_mask = X_encoded.notna().all(axis=1) & y.notna()
    X_encoded = X_encoded[valid_mask]
    y = y[valid_mask]

    # Safety check: if no rows remain, return 0.0
    if X_encoded.shape[0] == 0 or X_encoded.shape[1] == 0:
        return 0.0

    # Compute mutual information
    mi = mutual_info_classif(X_encoded, y, discrete_features='auto')

    # Compute top-5 mean MI
    top5_mean = np.mean(np.sort(mi)[-5:]) if len(mi) >= 5 else np.mean(mi)
    return round(1 / (top5_mean + 1e-6), 4)   # Inverted for complexity (higher is more complex)



# Calculate the class-wise mutual information score for the single feature with the highest MI
# Drop NaNs to avoid errors.

def max_mutual_info(df: pd.DataFrame, target_col: str) -> float:
    from sklearn.feature_selection import mutual_info_classif
    from sklearn.preprocessing import OrdinalEncoder

    # Separate features and target
    X = df.drop(columns=[target_col], errors="ignore")
    y = df[target_col]

    # Encode categorical features
    X_encoded = X.copy()
    cat_cols = X.select_dtypes(include=["object", "category"]).columns
    if len(cat_cols) > 0:
        X_encoded[cat_cols] = OrdinalEncoder().fit_transform(X[cat_cols])

    # Drop rows with any NaNs
    valid_mask = X_encoded.notna().all(axis=1) & y.notna()
    X_encoded = X_encoded[valid_mask]
    y = y[valid_mask]

    # Safety check: if no rows remain, return 0.0
    if X_encoded.shape[0] == 0 or X_encoded.shape[1] == 0:
        return 0.0

    # Compute mutual information
    mi = mutual_info_classif(X_encoded, y, discrete_features='auto')
    max_mi = float(np.max(mi))
    return round(1 / (max_mi + 1e-6), 4)   # Inverted for complexity (higher is more complex)




# Mean Kurtosis: Calculate the average kurtosis across all numeric features
from scipy.stats import kurtosis
import numpy as np

def mean_kurtosis(df, target_col: str) -> float:
    """Average excess (Fisher) kurtosis across the numeric features.

    The target is excluded and NaNs inside a column are omitted.  A column
    whose kurtosis is undefined contributes 0 to the average.  Returns
    ``0.0`` when there are no numeric features.
    """
    numeric = df.drop(columns=[target_col], errors="ignore").select_dtypes(include="number")
    if numeric.shape[1] == 0:
        return 0.0

    # One kurtosis value per column.
    per_column = kurtosis(numeric, axis=0, nan_policy="omit", fisher=True)
    cleaned = np.nan_to_num(per_column)
    return float(np.mean(cleaned))


# Calculate mean absolute skewness for all numeric features

from scipy.stats import skew
import numpy as np

def mean_abs_skewness(df, target_col: str) -> float:
    """Average absolute skewness across the numeric features.

    The target is excluded and NaNs inside a column are omitted.  A column
    whose skewness is undefined contributes 0 to the average.  Returns
    ``0.0`` when there are no numeric features.
    """
    numeric = df.drop(columns=[target_col], errors="ignore").select_dtypes(include="number")
    if numeric.shape[1] == 0:
        return 0.0

    # One skewness value per column; only its magnitude matters here.
    per_column = skew(numeric, axis=0, nan_policy="omit")
    magnitudes = np.nan_to_num(np.abs(per_column))
    return float(np.mean(magnitudes))




# --- Wrapper Function (calls the functions above) ---
# To make higher values of each dataset metric indicate a higher dataset complexity,
# some metrics are inverted: the correlation and class-wise mutual-information
# helpers return 1 / (value + 1e-6), and their output keys carry an "_inv" suffix.

def max_abs_corr(df: pd.DataFrame, target_col: str) -> float:
    """Inverted maximum absolute Pearson correlation between numeric features.

    NOTE(review): ``dataset_metadata`` called ``max_abs_corr`` but no
    definition was present in this file, so every dataset failed with
    ``NameError``.  This implementation mirrors ``mean_abs_corr`` (target and
    zero-variance columns dropped, diagonal ignored, result inverted as
    ``1 / (value + 1e-6)``) -- confirm it matches the metric originally
    intended for the ``max_absolute_correlation_inv`` column.

    Returns ``0.0`` when fewer than two usable numeric features remain.
    """
    X_num = df.drop(columns=[target_col], errors="ignore").select_dtypes(include="number")
    # Drop columns with zero variance (their correlations are undefined)
    X_num = X_num.loc[:, X_num.var(axis=0, ddof=0) > 0]
    if X_num.shape[1] < 2:
        return 0.0
    corr = X_num.corr().abs().values
    # Exclude self-correlations by masking the diagonal
    off_diagonal = corr[~np.eye(corr.shape[0], dtype=bool)]
    return float(1 / (np.nanmax(off_diagonal) + 1e-6))


def dataset_metadata(df: pd.DataFrame, target_col: str, file_path: str = None, infrastructure_info: str = None) -> dict:
    """Collect all non-problexity dataset metrics into one flat dict.

    Parameters
    ----------
    df : full dataset (features plus target).
    target_col : name of the target column.
    file_path : optional path of the source file; when given, its size is
        stored under ``file_size_bytes`` (``None`` if it cannot be read).
    infrastructure_info : optional free-text description stored as-is.

    Returns
    -------
    dict keyed by metric name: shape, feature-type counts, class counts and
    imbalance, near-zero-variance count, categorical cardinality figures,
    sparsity, the inverted correlation and class-wise mutual-information
    metrics, mean kurtosis and mean absolute skewness.
    """
    meta = {}

    # 1) shape
    meta.update(get_shape_info(df))

    # 2) feature types (only the counts; the column-name lists are not stored)
    ft = get_feature_types(df, target_col=target_col)
    meta["n_numeric_features"] = ft["n_numeric_features"]
    meta["n_categorical_features"] = ft["n_categorical_features"]

    # 3) class imbalance (adds "class_counts" and "class_imbalance")
    meta.update(get_class_imbalance(df, target_col=target_col))

    # 4) variance near zero -- the target is removed here so the count
    #    covers features only, whatever the target column is called.
    vnz = variance_near_zero_count(df.drop(columns=[target_col], errors="ignore"))
    meta["variance_near_zero_count"] = vnz["variance_near_zero_count"]

    # 5) max unique categorical values
    mucv = max_unique_categorical_values(df, target_col=target_col)
    meta["max_unique_values"] = int(mucv["max_unique_values"])
    meta["max_unique_column"] = mucv["max_unique_column"]

    # 6) sparsity percent
    meta["sparsity_percent"] = float(sparsity_percent(df, target_col=target_col))

    # 6b) adjusted sparsity percent (includes categorical features)
    meta["adjusted_sparsity_percent"] = float(adjusted_sparsity_percent(df, target_col=target_col))

    # 7) n_onehot_features
    meta["n_onehot_features"] = int(n_onehot_features(df, target_col=target_col))

    # 8) total_features_after_encoding
    meta["total_features_after_encoding"] = int(total_features_after_encoding(df, target_col=target_col))

    # 9) feature_cardinality_stats
    meta["feature_cardinality_stats"] = feature_cardinality_stats(df, target_col=target_col)

    # 10) n_categorical_features_gt50
    meta["n_categorical_features_gt50"] = n_categorical_features_gt50(df, target_col=target_col, threshold=50)

    # 11) max absolute correlation
    meta["max_absolute_correlation_inv"] = float(max_abs_corr(df, target_col=target_col))

    # 12) mean absolute correlation
    meta["mean_absolute_correlation_inv"] = float(mean_abs_corr(df, target_col=target_col))

    # 13) correlation redundancy
    meta["correlation_redundancy_inv"] = float(correlation_redundancy(df, target_col=target_col, threshold=0.95))

    # 14) Skipped

    # 15) Skipped

    # 16) mean class-wise mutual information (all features)
    meta["mean_classwise_mutual_information_inv"] = float(mean_mutual_info(df, target_col=target_col))

    # 17) mean class-wise mutual information (top-5 features)
    meta["top5_mean_classwise_mutual_information_inv"] = float(top5_mean_mutual_info(df, target_col=target_col))

    # 18) max class-wise mutual information (single feature)
    meta["max_classwise_mutual_information_inv"] = float(max_mutual_info(df, target_col=target_col))

    # 19) mean kurtosis
    meta["mean_kurtosis"] = float(mean_kurtosis(df, target_col=target_col))

    # 20) mean absolute skewness
    meta["mean_absolute_skewness"] = float(mean_abs_skewness(df, target_col=target_col))

    # Optional fields that are included:
    if file_path is not None:
        import os
        try:
            meta["file_size_bytes"] = os.path.getsize(file_path)
        except (OSError, TypeError, ValueError):
            # Best effort: an unreadable or invalid path must not abort the run.
            meta["file_size_bytes"] = None
    if infrastructure_info is not None:
        meta["infrastructure_info"] = infrastructure_info

    return meta



In [None]:
# -------------------------------------------
# 3. CYCLE THROUGH DATASETS AND POPULATE EXCEL
# -------------------------------------------

# For each dataset in the DATASETS_PATH folder, load it, compute the metadata and problexity metrics,
# and append to the manifest (Excel file).

from pathlib import Path
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler  # Add this at the top
import time
import problexity as px


# Define paths
DATASETS_PATH = Path(r"C:\Folder_Name_Here")    # Folder scanned for *.csv dataset files
EXCEL_MANIFEST_PATH = Path(r"C:\Second_Folder_Here\filename.xlsx")    # Excel manifest that is read (if present) and rewritten


# Infrastructure descriptor; written unchanged into the "infrastructure_info" column of every row
infrastructure_text = "Local Win 11 /Python 3.13 / 32GB RAM / Intel Core i7-14700F processor (20 cores, 28 logical processors, 2.10 GHz base speed), 32 GB of RAM"


# Column layout of the manifest worksheet. Each row is built by looking these
# names up in the per-dataset metadata dict; a name with no value becomes None.
column_order = [
    "timestamp",
    "dataset_name",
    "infrastructure_info",

    "n_samples",                             # 1. Number of Records
    "n_features",                            # 2. Number of Features
    "n_numeric_features",                    # 3. Number of Numerical Features
    "n_categorical_features",                # 4. Number of Categorical Features
    "max_unique_values",                     # 5. Maximum Number of Unique Categorical Values
    "max_unique_column",                     # 6. Name of the Column with the Most Unique Values
    "n_onehot_features",                     # 7. One-hot Number of Features
    "total_features_after_encoding",         # 8. Total Number of Features After Encoding
    "feature_cardinality_stats",             # 9. Cardinality Summary
    "n_categorical_features_gt50",           # 10. Categorical Features > 50 Unique Values
    "class_imbalance",                       # 11. Class Imbalance Ratio
    "sparsity_percent",                      # 12. Sparsity Percent
    "adjusted_sparsity_percent",             # 13. Adjusted Sparsity Percent - include categoricals
    "max_absolute_correlation_inv",          # 14. Maximum Absolute Correlation Inverted
    "mean_absolute_correlation_inv",         # 15. Mean Absolute Correlation Inverted
    "correlation_redundancy_inv",            # 16. Correlation Redundancy Inverted
    "mean_mutual_information",               # 17. Mutual Information Top-10 Mean (the loop below writes a 0.0 placeholder)
    "mean_mutual_information_all",           # 18. Mean Mutual Information Score (All Pairs) (the loop below writes a 0.0 placeholder)
    "variance_near_zero_count",              # 19. Near-Zero Variance Count
    "mean_kurtosis",                         # 20. Mean Kurtosis
    "mean_absolute_skewness",                # 21. Mean Absolute Skewness
    "mean_classwise_mutual_information_inv",        # 22. Class-wise Mean Mutual Information Inverted
    "top5_mean_classwise_mutual_information_inv",   # 23. Class-wise Top-5 Mean Mutual Information Inverted
    "max_classwise_mutual_information_inv",         # 24. Class-wise Max Mutual Information Inverted
    "score",                                 # 25. Total Complexity Score (Score)
    "f1",                                   # 26. Maximum Fisher's Discriminant Ratio (F1)
    "f1v",                                   # 27. Directional Vector Maximum Fisher's Discriminant Ratio (F1v)
    "f2",                                   # 28. Volume of Overlapping Region (F2)
    "f3",                                    # 29. Maximum Individual Feature Efficiency (F3)
    "f4",                                   # 30. Collective Feature Efficiency (F4)    
    "l1",                                   # 31. Sum of the Error Distance by Linear Programming (L1)
    "l2",                                   # 32. Error Rate of Linear Classifier (L2)
    "l3",                                   # 33. Non-linearity of Linear Classifier (L3)
    "n1",                                   # 34. Fraction of Borderline Points (N1)
    "n2",                                   # 35. Ratio of Intra/Extra NN Distance (N2)
    "n3",                                   # 36. Error Rate of NN classifier (N3)
    "n4",                                   # 37. Non-linearity of NN classifier (N4)
    "t1",                                   # 38. Fraction of Hyperspheres Covering Data (T1)
    "lsc",                                  # 39. Local Set Average Cardinality (LSC)
    "density",                              # 40. Density (Density)        
    "clsCoef",                              # 41. Clustering Coefficient (ClsCoef)
    "hubs",                                 # 42. Hubs (Hubs)
    "t2",                                   # 43. Average Number of Features Per Point (T2)
    "t3",                                   # 44. Average Number of PCA Dimensions per Point (T3)
    "t4",                                   # 45. Ratio of the PCA Dimension to the Original Dimension (T4)
    "c1",                                   # 46. Entropy of Classes Proportion (C1)
    "c2",                                   # 47. Imbalance Ratio (C2)
    "file_size_bytes",                      # 48. File Size
    "class_counts",                         # 49. Target Class Distribution
    "processing_time_seconds",              # 50. Time taken to process dataset
]

# Placeholder for results
manifest_rows = []

# List the dataset files once (sorted for a reproducible row order) so the
# progress total and the loop are guaranteed to see the same files.
dataset_files = sorted(DATASETS_PATH.glob("*.csv"))
num_files = len(dataset_files)
count = 0

# Cap dataset size to avoid memory overflow caused by problexity's inefficiencies.
MAX_ROWS = 25000

# Loop through datasets
for dataset_file in dataset_files:
    start_time = time.perf_counter()
    try:
        df = pd.read_csv(dataset_file)

        # Separate features and target
        y = df["target"]

        # Keep numeric features only, because problexity is incapable of
        # recognizing and ignoring/dropping categorical features.
        X = df.drop(columns=["target"]).select_dtypes(include=["number"])

        # Scale the numeric features. Reduces overflow issues for some
        # dataset complexity measure calculations. Also, improves the actual measurement.
        # The original index is kept so the target re-attaches row by row.
        scaler = StandardScaler()
        X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

        # Recombine scaled features with target
        df_scaled = X.copy()
        df_scaled["target"] = y

        # Extract metrics using fixed target column
        # For dataset_metadata function, pass the original df, containing all features.
        metadata = dataset_metadata(df, target_col="target")

        # Drop NaNs because problexity is incapable of dealing with missing values
        # For problexity, must use the df_scaled, which has only numeric features.
        df_scaled = df_scaled.dropna()

        # Stratified down-sampling: every class gets an equal share of the
        # row cap, and classes smaller than that share are kept whole.
        # Groups are sampled explicitly instead of through groupby.apply,
        # whose handling of the grouping column differs between pandas
        # versions and can drop "target" from the result.
        if len(df_scaled) > MAX_ROWS:
            per_class_cap = MAX_ROWS // df_scaled["target"].nunique()
            df_scaled = pd.concat(
                [
                    group.sample(n=min(len(group), per_class_cap), random_state=42)
                    for _, group in df_scaled.groupby("target")
                ]
            ).reset_index(drop=True)

        # Run problexity on scaled data
        cc = px.ComplexityCalculator()
        cc.fit(df_scaled.drop(columns=["target"]), df_scaled["target"])
        report = cc.report()

        # Merge problexity metrics into metadata
        metadata["score"] = report["score"]
        metadata.update(report["complexities"])

        # Add admin fields
        metadata["timestamp"] = datetime.now().isoformat(timespec="seconds")
        metadata["dataset_name"] = dataset_file.stem
        metadata["infrastructure_info"] = infrastructure_text
        # Insert placeholder MI scores - MI calcs crash routine if too many features
        metadata["mean_mutual_information"] = 0.0
        metadata["mean_mutual_information_all"] = 0.0
        metadata["file_size_bytes"] = dataset_file.stat().st_size

        # Capture how long it took to develop the dataset metrics
        metadata["processing_time_seconds"] = round(time.perf_counter() - start_time, 2)

        # Align to column order, and build the Excel row's dictionary
        row = {col: metadata.get(col, None) for col in column_order}

        manifest_rows.append(row)
        count += 1
        print(f"✅ Appended {count} of {num_files} files: {dataset_file.name}")

    except Exception as e:
        # Deliberately broad: one bad dataset is reported and skipped so the
        # remaining files are still processed.
        print(f"⚠️ Skipped {dataset_file.name}: {e}")

# Create DataFrame from new rows
manifest_df = pd.DataFrame(manifest_rows)


# If Excel file exists, read existing sheet and append
if EXCEL_MANIFEST_PATH.exists():
    try:
        existing_df = pd.read_excel(EXCEL_MANIFEST_PATH, sheet_name="Benchmark Manifest")
        combined_df = pd.concat([existing_df, manifest_df], ignore_index=True)
    except Exception as e:
        # NOTE(review): on a read failure only the new rows are kept and the
        # file is rewritten below, so the rows it held are lost -- confirm
        # this fallback is intended.
        print(f"⚠️ Could not read existing manifest: {e}")
        combined_df = manifest_df
else:
    combined_df = manifest_df

# Reorder columns to match desired order. reindex (rather than plain column
# selection) also works when no dataset was processed or when an older
# manifest lacks a column: such columns are created empty instead of
# raising KeyError.
combined_df = combined_df.reindex(columns=column_order)


# Make sure the output folder exists, then rewrite the whole workbook
# (default write mode); earlier rows survive because they were merged above.
EXCEL_MANIFEST_PATH.parent.mkdir(parents=True, exist_ok=True)
with pd.ExcelWriter(EXCEL_MANIFEST_PATH, engine="openpyxl") as writer:
    combined_df.to_excel(writer, sheet_name="Benchmark Manifest", index=False)


print("Manifest updated successfully.")