# Generate Large-Scale Imbalanced Dataset for XGBoost Scaling

Generates a synthetic imbalanced classification dataset:
- **64,050,659 rows**
- **500 features**
- **~2% minority class**

Uses PySpark for distributed generation and writes the result to a Unity Catalog Delta table.

## Configuration

In [None]:
# Shape and class balance of the synthetic dataset
TOTAL_ROWS = 64_050_659
N_FEATURES, N_INFORMATIVE = 500, 80
MINORITY_RATIO = 0.02
SEED = 42

# Destination table, addressed as catalog.schema.table
CATALOG, SCHEMA = "your_catalog", "your_schema"
TABLE_NAME = "imbalanced_64m"

OUTPUT_TABLE = ".".join((CATALOG, SCHEMA, TABLE_NAME))

## Generate Dataset

In [None]:
from pyspark.sql.functions import rand, randn, when, col, lit
from pyspark.sql.types import FloatType, IntegerType


def generate_imbalanced_dataset(
    total_rows: int,
    n_features: int,
    n_informative: int,
    minority_ratio: float,
    seed: int,
):
    """
    Generate a large imbalanced binary-classification dataset.

    The result has columns ``f0`` .. ``f{n_features - 1}`` (float) followed
    by ``label`` (integer 0/1):

    - ``label`` is 1 where a uniform draw seeded with ``seed`` is below
      ``minority_ratio``, otherwise 0.
    - The first ``n_informative`` features are standard-normal noise shifted
      by ``label * weight``, with ``weight = 0.5 + (i % 10) * 0.15``.
    - The remaining features are standard-normal noise only.

    Relies on a ``spark`` session being available in the enclosing scope.

    Args:
        total_rows: Number of rows to generate.
        n_features: Total number of feature columns.
        n_informative: How many leading features are shifted by the label.
        minority_ratio: Probability threshold for assigning label 1.
        seed: Base seed; the label uses ``seed`` and feature ``i`` uses
            ``seed + i + 1``.

    Returns:
        A Spark DataFrame with the feature columns first, then ``label``.
    """
    # The label gets its own projection so the feature expressions below can
    # reference it by name.
    labeled = spark.range(0, total_rows).withColumn(
        "label",
        when(rand(seed) < minority_ratio, lit(1)).otherwise(lit(0)).cast(IntegerType()),
    )

    # Build every feature expression up front and apply them in ONE select.
    # Calling withColumn once per feature adds a projection each time, which
    # produces a very large logical plan for hundreds of columns.
    feature_exprs = []
    for i in range(n_features):
        feature_seed = seed + i + 1  # offset by 1 so no feature reuses the label's seed
        value = randn(feature_seed)

        if i < n_informative:
            # Informative: shift the noise by a label-dependent amount
            weight = 0.5 + (i % 10) * 0.15
            value = value + col("label") * lit(weight)

        feature_exprs.append(value.cast(FloatType()).alias(f"f{i}"))

        if (i + 1) % 100 == 0:
            print(f"  Added {i + 1}/{n_features} features...")

    # Features first, then label; the helper id column from range() is dropped.
    return labeled.select(feature_exprs + [col("label")])


# Report the run parameters, followed by a blank line
print(
    f"Generating: {TOTAL_ROWS:,} rows x {N_FEATURES} features",
    f"Minority class: {MINORITY_RATIO:.1%}",
    "",
    sep="\n",
)

# Build the dataset from the configuration constants
df = generate_imbalanced_dataset(
    seed=SEED,
    minority_ratio=MINORITY_RATIO,
    n_informative=N_INFORMATIVE,
    n_features=N_FEATURES,
    total_rows=TOTAL_ROWS,
)

## Write to Delta Table

In [None]:
# Save the generated DataFrame as a Delta table in overwrite mode
print(f"Writing to: {OUTPUT_TABLE}")

(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(OUTPUT_TABLE)
)

print("Done!")

## Validate

In [None]:
# Read the table back and report its row and column counts
df_check = spark.table(OUTPUT_TABLE)

print(
    f"Rows: {df_check.count():,}",
    f"Columns: {len(df_check.columns)}",
    sep="\n",
)

In [None]:
# Row count per class, ordered by label value
(
    df_check
    .groupBy("label")
    .count()
    .orderBy("label")
    .show()
)

In [None]:
# Show five rows of the first five features together with the label
sample_cols = [*(f"f{i}" for i in range(5)), "label"]
df_check.select(*sample_cols).show(5)

In [None]:
# Verify informative vs noise features
from pyspark.sql.functions import mean

print("Mean by label (informative f0 should differ, noise f400 should not):")
# Per-label mean of one feature below the informative cutoff and one above it
(
    df_check
    .groupBy("label")
    .agg(
        *(
            mean(source).alias(output)
            for source, output in (("f0", "f0_mean"), ("f400", "f400_mean"))
        )
    )
    .show()
)