Scale and encode the datasets obtained online.
Then save each processed dataset as a Parquet file, together with a JSON file describing the transformations applied.
One dataset is worked on at a time.

In [None]:
# -------------------------------------------
# 1. LOAD LIBRARIES AND CONFIG
# -------------------------------------------

import os
import pandas as pd
import sys
import importlib
from pathlib import Path
import json


# Put the directory containing `config` at the front of the import search
# path, import the module, then reload it so that the definitions used below
# come from a fresh execution of the module rather than a cached copy.
# NOTE(review): the path below is hard-coded and machine-specific — presumably
# the project root; confirm and update when running elsewhere.
sys.path.insert(0, "C:/Misc/binary_eval")
import config
importlib.reload(config)

# Output locations used by the save step at the end of this notebook
# (transformed data and transformation metadata respectively).
from config import PROCESSED_DATA_DIR, METADATA_DIR

In [None]:
# -------------------------------------------
# 2. LOAD ONE DATASET
# -------------------------------------------

# Name of the raw CSV file to process (one dataset per notebook run).
raw_filename = "dataset1.csv"  # Replace with actual filename

# Directory that holds the raw datasets.
DATASETS_FOR_BENCHMARK = "D:\\Datasets"    # UPDATE THIS PATH AS NEEDED -----------

# Full path to the dataset
raw_path = os.path.join(DATASETS_FOR_BENCHMARK, raw_filename)

# Load the dataset. Failures are re-raised with the offending path in the
# message and explicitly chained so the original error stays in the traceback.
try:
    df = pd.read_csv(raw_path)
except FileNotFoundError as err:
    raise FileNotFoundError(f"❌ File not found: {raw_path}") from err
except pd.errors.ParserError as err:
    raise ValueError(f"❌ Parsing error while reading: {raw_path}") from err

# Basic metadata. The 'target' column is the label, not a feature, so it is
# excluded from both feature counts (the scaling step excludes it as well).
num_rows, num_cols = df.shape
has_target = "target" in df.columns
num_numeric = len(
    df.select_dtypes(include=["number"]).columns.drop("target", errors="ignore")
)
num_categorical = len(
    df.select_dtypes(include=["object", "category", "bool"]).columns.drop(
        "target", errors="ignore"
    )
)

# Confirmatory prints
print(f"Loaded dataset: {raw_filename}")
print(f"Shape: {num_rows:,} rows × {num_cols:,} columns")
print(f"Numeric features: {num_numeric}")
print(f"Categorical features: {num_categorical}")
print(f"{'✅' if has_target else '❌'} 'target' column {'found' if has_target else 'missing'} in dataset")

# Running record of what this notebook did to the dataset; later steps update
# these entries as rows are dropped and features are scaled/encoded.
summary_stats = {
    'raw_filename': raw_filename,
    'pipeline_stage': "04_scale_encode_export",
    'original_row_count': num_rows,
    'final_row_count': num_rows,  # No rows dropped yet
    'rows_dropped': 0,
    'percent_dropped': 0.0,
    'total_numeric_features': num_numeric,
    'total_categorical_features': num_categorical,
}

In [None]:
# -------------------------------------------
# 3. CHECK FOR TARGET
# -------------------------------------------

# Validate that the "target" column exists and contains only the classes 0 and 1.
# Missing target values are ignored here.

if "target" not in df.columns:
    raise ValueError("❌ 'target' column is missing from the dataset.")

# Distinct non-missing values found in the target column
target_values = df["target"].dropna().unique()
target_set = set(target_values)

# Confirm binary status. Sets are unordered, so one comparison against {0, 1}
# is sufficient.
if target_set == {0, 1}:
    print("✅ 'target' column is binary (0 and 1).")
elif len(target_set) == 1 and target_set.issubset({0, 1}):
    # Only one of the two classes is present: valid, but flagged to the user.
    print(f"⚠️ 'target' column contains only one class: {target_set}. Still binary, but may not be useful for modeling.")
else:
    raise ValueError(f"❌ 'target' column contains unexpected values: {target_set}")

In [None]:
# -------------------------------------------
# 4. DROP ROWS WITH MISSING VALUES
# -------------------------------------------

# Remove every row that has a missing value in any column, and measure how
# much of the dataset that removed.
initial_row_count = df.shape[0]
df.dropna(inplace=True)
final_row_count = df.shape[0]
rows_dropped = initial_row_count - final_row_count
# Guard against an empty input; keep the result a float in both branches.
percent_dropped = (rows_dropped / initial_row_count) * 100 if initial_row_count > 0 else 0.0

# Display message
if rows_dropped > 0:
    print(f"Dropped {rows_dropped:,} rows with missing values "
          f"({percent_dropped:.2f}% of total).")
else:
    print("✅ No missing values found. No rows dropped.")

# An empty dataframe cannot be scaled or modelled; make that visible here
# instead of leaving it to a less obvious failure in a later step.
if final_row_count == 0:
    print("Warning: no rows remain after dropping missing values.")

# Log to summary dictionary
summary_stats['rows_dropped'] = rows_dropped
summary_stats['percent_dropped'] = round(percent_dropped, 2)
summary_stats['final_row_count'] = final_row_count

In [None]:
# -------------------------------------------
# 5. SCALE NUMERIC FEATURES
# -------------------------------------------

from sklearn.preprocessing import StandardScaler

# Numeric feature columns. 'target' is the label and must not be scaled.
numeric_cols = df.select_dtypes(include=["number"]).columns.drop("target", errors="ignore")

# Categorical feature columns. 'target' is excluded here too, so that the
# counts and messages below describe features only.
categorical_cols = df.select_dtypes(
    include=["object", "category", "bool"]
).columns.drop("target", errors="ignore")

# Initialize scaler
scaler = StandardScaler()

# Scale only when there is something to scale; exactly one status message is
# printed for either outcome.
if len(numeric_cols) > 0:
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    print(f"Scaled {len(numeric_cols)} numeric feature(s) using StandardScaler.")
else:
    print("No numeric columns found to scale.")

print(f"Left {len(categorical_cols)} categorical feature(s) and the 'target' column untouched.")

# Log to summary dictionary
summary_stats['total_numeric_features'] = len(numeric_cols)
summary_stats['numeric_features_scaled'] = len(numeric_cols)
summary_stats['total_categorical_features'] = len(categorical_cols)
summary_stats['scaling_method'] = "StandardScaler"

In [None]:
# -------------------------------------------
# 6. PRE-ENCODING DIAGNOSTIC — CATEGORICAL EXPANSION CHECK
# -------------------------------------------

# For each categorical feature, report how many distinct values it has, i.e.
# how many columns one-hot encoding it could add. Use this to spot
# high-cardinality features worth deleting before encoding.
print("🔍 Pre-encoding diagnostic — estimated column expansion per categorical feature:\n")

# len() rather than truthiness: categorical_cols may be a pandas Index, whose
# truth value is ambiguous.
if len(categorical_cols) == 0:
    print("No categorical features found.")

for col in categorical_cols:
    n_unique = df[col].nunique(dropna=False)
    print(f"• '{col}': {n_unique} unique value(s) → {n_unique} potential column(s)")

In [None]:
# -------------------------------------------
# 7. DELETE SELECTED FEATURES FROM PRE-ENCODED DATAFRAME
# -------------------------------------------

# Provides a means to delete unneeded features before encoding (e.g., a sequentially assigned ID).
# Of course, simply removing them manually from the dataset also works.

# Paste in the list of features to delete.
features_to_delete = []  # E.g., ['feature1', 'feature2']

# Confirm which features are present before deletion
print("Features before deletion:")
print(df.columns.tolist())

# Requested names that are not columns of the dataframe (e.g. typos) are
# reported and skipped rather than silently ignored. dict.fromkeys removes
# duplicate requests while keeping the order given.
requested_features = list(dict.fromkeys(features_to_delete))
missing_features = [name for name in requested_features if name not in df.columns]
if missing_features:
    print("\nRequested features not found in dataset (skipped):")
    print(missing_features)

# Narrow the list to what is actually deleted, so that the printout and the
# metadata below never claim a deletion that did not happen.
features_to_delete = [name for name in requested_features if name in df.columns]

# Drop specified features
df = df.drop(columns=features_to_delete)

# Confirm deletion
print("\nDeleted features:")
print(features_to_delete)

print("\nRemaining features:")
print(df.columns.tolist())

# Update summary_stats with deleted features
summary_stats['features_deleted'] = features_to_delete
summary_stats['num_features_deleted'] = len(features_to_delete)

In [None]:
# -------------------------------------------
# 8. ENCODE CATEGORICAL FEATURES
# -------------------------------------------

# numeric_cols was computed before any features were deleted; keep only the
# names still present so deleted features are not reported as "remaining".
# This must happen before encoding, while the dummy columns do not exist yet.
numeric_cols = numeric_cols[numeric_cols.isin(df.columns)]

# Categorical feature columns. 'target' is the label: a boolean target must
# stay a single column, so it is never one-hot encoded.
categorical_cols = [
    col
    for col in df.select_dtypes(include=["object", "category", "bool"]).columns
    if col != "target"
]

# Number of new columns produced per encoded feature
encoding_summary = {}

# One-hot encode each categorical feature individually (so the per-feature
# column count can be reported), collecting the results for a single concat.
encoded_frames = []
for col in categorical_cols:
    encoded = pd.get_dummies(df[col], prefix=col, drop_first=False)
    encoded_frames.append(encoded)
    encoding_summary[col] = encoded.shape[1]
    print(f"🔣 Encoded '{col}' → {encoded.shape[1]} new column(s)")

# Replace the original categorical columns with their encoded versions in one
# step: untouched columns keep their order, dummy columns are appended in the
# order their source features were encoded.
if encoded_frames:
    df = pd.concat([df.drop(columns=categorical_cols), *encoded_frames], axis=1)

# Total new columns added
total_new_columns = sum(encoding_summary.values())
print(f"\n🧾 Total new columns created from encoding: {total_new_columns:,}")

# Track encoded categorical features for downstream use
encoded_categorical_features = list(encoding_summary.keys())

# Final output: copy-paste friendly list of encoded features
print("\n📋 Encoded categorical features (for deletion or review):")
print(", ".join(f'"{feature}"' for feature in encoded_categorical_features))

print(f"\nThe number of remaining numeric features: {len(numeric_cols)}")

# Print remaining numeric feature names (max 8 per row); str() keeps the join
# working for column labels that are not strings.
print("\n🔢 Remaining numeric features:")
for i in range(0, len(numeric_cols), 8):
    print(", ".join(map(str, numeric_cols[i:i+8])))

# Update summary_stats with encoding details
summary_stats['encoded_categorical_features'] = encoded_categorical_features
summary_stats['total_encoded_columns'] = total_new_columns
summary_stats['remaining_numeric_features'] = numeric_cols.tolist()
summary_stats['num_remaining_numeric_features'] = len(numeric_cols)
summary_stats['encoding_method'] = "OneHot (drop_first=False)"

In [None]:
# -------------------------------------------
# 10. BUILD SUMMARY STATISTICS DICTIONARY IN PREFERRED ORDER
# -------------------------------------------

# Rebuilds summary_stats from scratch so the keys appear in a fixed order in
# the metadata file. Must run before the save step, which serialises this
# dictionary to JSON.

pipeline_stage = "04_scale_encode_export"

# Numeric features still present in the dataframe: names removed in the
# feature-deletion step must not be reported as remaining.
remaining_numeric_features = numeric_cols[numeric_cols.isin(df.columns)].tolist()

summary_stats = {
    "raw_filename": raw_filename,
    "pipeline_stage": pipeline_stage,
    "original_row_count": num_rows,
    "final_row_count": final_row_count,
    "rows_dropped": rows_dropped,
    # Rounded to 2 decimals, matching the value logged when rows were dropped.
    "percent_dropped": round(percent_dropped, 2),
    "total_numeric_features": len(remaining_numeric_features),
    "numeric_features_scaled": len(remaining_numeric_features),
    "scaling_method": "StandardScaler",
    "total_categorical_features": len(categorical_cols),
    "encoding_method": "OneHot (drop_first=False)",
    "encoded_categorical_features": encoded_categorical_features,
    "total_encoded_columns": total_new_columns,
    "features_deleted": features_to_delete,
    "num_features_deleted": len(features_to_delete),
    "remaining_numeric_features": remaining_numeric_features,
    "num_remaining_numeric_features": len(remaining_numeric_features),
}

In [None]:
# -------------------------------------------
# 9. SAVE OUTPUTS FOR USE BY NEXT MODULE
# -------------------------------------------

# Base name for the output files: the raw filename without its extension.
# Path.stem strips only the final suffix, whereas str.replace would remove
# every ".csv" occurrence anywhere in the name.
dataset_stem = Path(raw_filename).stem

# Make sure the output directories exist before writing into them.
processed_dir = Path(PROCESSED_DATA_DIR)
metadata_dir = Path(METADATA_DIR)
processed_dir.mkdir(parents=True, exist_ok=True)
metadata_dir.mkdir(parents=True, exist_ok=True)

# 1. Save transformed dataframe as Parquet
transformed_path = processed_dir / f"{dataset_stem}_transformed.parquet"

df.to_parquet(transformed_path)
print(f"Saved transformed dataframe to: {transformed_path}")

# 2. Save transformation metadata as JSON
metadata_path = metadata_dir / f"{dataset_stem}_transform_metadata.json"
with open(metadata_path, "w", encoding="utf-8") as f:
    json.dump(summary_stats, f, indent=2)
print(f"Saved transformation metadata to: {metadata_path}")

# 3. Print new shape and column count
print(f"\nTransformed shape: {df.shape}")
print(f"Total columns after transform: {df.shape[1]:,}")
