# Generate Imbalanced Dataset

Generates synthetic imbalanced classification datasets for XGBoost scaling experiments.

**Parameters** (via widgets or job params):
- `env`: Environment name (dev/prod)
- `run_mode`: `full` or `smoke` (smoke uses tiny data for quick validation)
- `json_params`: JSON string with additional config overrides
- `catalog`: Catalog name (default `brian_gen_ai`)
- `schema`: Schema name (default `xgb_scaling`)

## Setup Widgets

In [None]:
# Notebook widgets. Each one carries a default that a job parameter of the
# same name can override.
run_mode_choices = ["full", "smoke"]
dbutils.widgets.text("env", "dev", "Environment")
dbutils.widgets.dropdown("run_mode", "full", run_mode_choices, "Run Mode")
dbutils.widgets.text("json_params", "{}", "JSON Parameters")

# Output-location widgets (a job or bundle variables may supply these).
for widget_name, widget_default, widget_label in (
    ("catalog", "brian_gen_ai", "Catalog"),
    ("schema", "xgb_scaling", "Schema"),
):
    dbutils.widgets.text(widget_name, widget_default, widget_label)

## Import and Parse Parameters

In [None]:
import sys
import time

# Add src to path for local imports
# When deployed via DAB, the repo files are synced to workspace
import os
# Ask the Databricks notebook context for this notebook's workspace path.
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
notebook_path = notebook_context.notebookPath().get()

# Drop the last two path components (/notebooks/<notebook_name>) so that
# repo_root points at the directory that contains `src`.
path_components = notebook_path.split("/")
repo_root = "/".join(path_components[:-2])

# Put the repo root first on sys.path so `src.*` imports resolve to it.
workspace_repo_root = f"/Workspace{repo_root}"
sys.path.insert(0, workspace_repo_root)

# Import core logic
from src.main import run, build_exit_result
from src.config import DatasetConfig

# Read all widget values in one pass; the unpacking order matches the tuple
# of widget names below.
env, run_mode, json_params, catalog, schema = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("env", "run_mode", "json_params", "catalog", "schema")
)

# Hand the raw widget strings to the core logic and keep the resulting config.
widget_values = {
    "env": env,
    "run_mode": run_mode,
    "json_params": json_params,
    "catalog": catalog,
    "schema": schema,
}
config = run(**widget_values)

## Generate Dataset

In [None]:
from pyspark.sql.functions import rand, randn, when, col, lit
from pyspark.sql.types import FloatType, IntegerType

def generate_imbalanced_dataset(
    spark,
    total_rows: int,
    n_features: int,
    n_informative: int,
    minority_ratio: float,
    seed: int,
):
    """
    Generate a large imbalanced binary-classification dataset using Spark.

    The label column is drawn first. Every feature is then built as a Column
    expression and all of them are added in a single ``select``. Calling
    ``withColumn`` once per feature adds one projection per call, so the
    query plan grows with ``n_features``; a single ``select`` avoids that.

    Args:
        spark: SparkSession used to create the base ``range`` DataFrame.
        total_rows: Number of rows to generate.
        n_features: Total number of feature columns, named ``f0``, ``f1``, ...
        n_informative: Number of leading features that are shifted by the
            label; the remaining features are standard-normal noise.
        minority_ratio: Threshold compared against a uniform draw; rows whose
            draw falls below it get label 1 (the minority class).
        seed: Base seed. The label uses ``seed`` and feature ``i`` uses
            ``seed + i + 1``.

    Returns:
        A lazy (not yet materialized) DataFrame whose columns are
        ``f0`` .. ``f{n_features - 1}`` followed by ``label``.
    """
    print(f"Generating: {total_rows:,} rows x {n_features} features")
    print(f"  Informative features: {n_informative}")
    print(f"  Minority ratio: {minority_ratio:.1%}")
    print()

    # Base dataframe with row IDs, plus the imbalanced label (1 = minority).
    # The label is added before the features because informative features
    # reference it by name.
    labeled = spark.range(0, total_rows).withColumn(
        "label",
        when(rand(seed) < minority_ratio, lit(1)).otherwise(lit(0)).cast(IntegerType()),
    )

    label_col = col("label")
    feature_exprs = []
    for i in range(n_features):
        feature_seed = seed + i + 1
        value = randn(feature_seed)

        if i < n_informative:
            # Informative: shift the noise by a label-dependent offset. The
            # weight cycles through 0.5, 0.65, ..., 1.85 every ten features.
            weight = 0.5 + (i % 10) * 0.15
            value = value + label_col * lit(weight)

        feature_exprs.append(value.cast(FloatType()).alias(f"f{i}"))

        # Progress logging
        if (i + 1) % 100 == 0:
            print(f"  Added {i + 1}/{n_features} features...")

    # One projection for all features; features first, then label.
    return labeled.select(feature_exprs + [label_col])

In [None]:
# Build the (lazy) dataset DataFrame and time how long plan construction takes.
start_time = time.time()

generation_kwargs = {
    "total_rows": config.total_rows,
    "n_features": config.n_features,
    "n_informative": config.n_informative,
    "minority_ratio": config.minority_ratio,
    "seed": config.seed,
}
df = generate_imbalanced_dataset(spark=spark, **generation_kwargs)

generation_time = time.time() - start_time
print(f"\nDataFrame created in {generation_time:.1f}s (lazy - not materialized yet)")

## Write to Delta Table

In [None]:
print(f"Writing to: {config.output_table}")

write_start = time.time()

# Overwrite the target table (data and schema) with the generated DataFrame.
delta_writer = (
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
delta_writer.saveAsTable(config.output_table)

# write_time covers only the save; total_time is measured from start_time,
# which was set before the DataFrame was built.
write_time = time.time() - write_start
total_time = time.time() - start_time

print(f"Write completed in {write_time:.1f}s")
total_minutes = total_time / 60
print(f"Total time: {total_time:.1f}s ({total_minutes:.1f} minutes)")

## Validate Results

In [None]:
# Re-read the table that was just written and compare its row count with the
# number of rows that was requested.
df_check = spark.table(config.output_table)

row_count = df_check.count()
expected_rows = config.total_rows
counts_match = row_count == expected_rows

print(f"Rows written: {row_count:,}")
print(f"Expected:     {expected_rows:,}")
print(f"Match: {counts_match}")

In [None]:
# Per-label row counts, sorted by label value.
label_counts = df_check.groupBy("label").count().orderBy("label").collect()

print("\nClass distribution:")
# Map each label value to its row count; this dict is reported in the result.
class_distribution = {
    label_row["label"]: label_row["count"] for label_row in label_counts
}
for label_value, label_total in class_distribution.items():
    share_pct = label_total / row_count * 100
    if label_value == 1:
        class_name = "Minority"
    else:
        class_name = "Majority"
    print(f"  Label {label_value} ({class_name}): {label_total:,} ({share_pct:.2f}%)")

# With exactly two labels, report the first label's count relative to the
# second (rows are ordered by label).
if len(label_counts) == 2:
    first_row, second_row = label_counts
    imbalance = first_row["count"] / second_row["count"]
    print(f"\nImbalance ratio: {imbalance:.1f}:1")

In [None]:
# Show a small preview: up to five leading feature columns plus the label.
print("Sample data (first 5 rows, first 5 features + label):")
n_sample_features = min(5, config.n_features)
sample_cols = [*(f"f{i}" for i in range(n_sample_features)), "label"]
df_check.select(sample_cols).show(5, truncate=False)

## Exit with Result

In [None]:
# Assemble the payload that this notebook hands back to the calling job.
exit_fields = {
    "config": config,
    "status": "ok",
    "row_count": row_count,
    "duration_seconds": total_time,
    "class_distribution": class_distribution,
}
result_json = build_exit_result(**exit_fields)

print("\nNotebook result:", result_json, sep="\n")

# Exit with the result as the notebook's exit value (fetchable via the
# Databricks API).
dbutils.notebook.exit(result_json)