In [0]:
# Databricks notebook source
# ============================================================================
# DATA GENERATION SCRIPT FOR ETL PIPELINE TESTING
# ============================================================================
# This script generates synthetic sales data in CSV and Parquet formats
# Run this notebook before executing the main ETL pipeline

import random
from datetime import datetime, timedelta
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, 
    DoubleType, DateType
)
import pandas as pd

print("=" * 80)
print("SYNTHETIC DATA GENERATOR")
print("=" * 80)

# COMMAND ----------

# ============================================================================
# CONFIGURATION
# ============================================================================

# Number of records to generate
NUM_RECORDS = 280000

# Output paths (matching the ETL notebook)
OUTPUT_CSV_PATH = f"/Volumes/workspace/portfolio_projects/volume_portfolio_projects/simple_etl_project_raw_data_csv/raw_sales_data_{datetime.now()}.csv"
OUTPUT_PARQUET_PATH = f"/Volumes/workspace/portfolio_projects/volume_portfolio_projects/simple_etl_project_raw_data_parquet/raw_sales_data_{datetime.now()}.parquet"

# Seed for reproducibility (optional)
random.seed(42)

print(f"Configuration:")
print(f"  - Records to generate: {NUM_RECORDS:,}")
print(f"  - CSV output: {OUTPUT_CSV_PATH}")
print(f"  - Parquet output: {OUTPUT_PARQUET_PATH}")

# COMMAND ----------

# ============================================================================
# DATA GENERATION PARAMETERS
# ============================================================================

# Define possible values for categorical fields
CATEGORIES = [
    "Electronics", "Clothing", "Home & Garden", "Sports", 
    "Books", "Toys", "Automotive", "Health & Beauty"
]

REGIONS = [
    "North America", "Europe", "Asia Pacific", "Latin America", 
    "Middle East", "Africa"
]

FIRST_NAMES = [
    "John", "Mary", "David", "Sarah", "Michael", "Emma", "James", 
    "Linda", "Robert", "Patricia", "William", "Jennifer", "Richard",
    "Elizabeth", "Thomas", "Maria", "Charles", "Susan", "Daniel", "Jessica"
]

LAST_NAMES = [
    "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", 
    "Miller", "Davis", "Rodriguez", "Martinez", "Hernandez", 
    "Lopez", "Gonzalez", "Wilson", "Anderson", "Thomas", "Taylor"
]

SALES_PERSONS = [
    "Alice Johnson", "Bob Smith", "Carol Davis", "Dan Brown",
    "Eve Wilson", "Frank Miller", "Grace Lee", "Henry Taylor"
]

PRODUCT_PREFIXES = {
    "Electronics": ["Laptop", "Smartphone", "Tablet", "Headphones", "Camera"],
    "Clothing": ["T-Shirt", "Jeans", "Jacket", "Shoes", "Dress"],
    "Home & Garden": ["Lamp", "Chair", "Table", "Curtains", "Plant"],
    "Sports": ["Basketball", "Soccer Ball", "Tennis Racket", "Yoga Mat", "Dumbbell"],
    "Books": ["Novel", "Textbook", "Cookbook", "Biography", "Guide"],
    "Toys": ["Action Figure", "Board Game", "Puzzle", "Doll", "Building Set"],
    "Automotive": ["Car Part", "Oil Filter", "Tire", "Battery", "Wiper"],
    "Health & Beauty": ["Shampoo", "Lotion", "Vitamin", "Perfume", "Soap"]
}

# Date range for transactions (last 2 years)
END_DATE = datetime(2026, 1, 31)
START_DATE = END_DATE - timedelta(days=730)

# COMMAND ----------

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def generate_transaction_id(index):
    """Generate unique transaction ID"""
    return f"TXN{str(index).zfill(8)}"

def generate_random_date(start_date, end_date):
    """Generate random date between start and end"""
    time_delta = end_date - start_date
    random_days = random.randint(0, time_delta.days)
    return (start_date + timedelta(days=random_days)).strftime("%Y-%m-%d")

def generate_customer_name():
    """Generate random customer name"""
    first = random.choice(FIRST_NAMES)
    last = random.choice(LAST_NAMES)
    return f"{first} {last}"

def generate_product_name(category):
    """Generate product name based on category"""
    prefix = random.choice(PRODUCT_PREFIXES[category])
    model = random.choice(["Pro", "Plus", "Standard", "Elite", "Basic"])
    number = random.randint(100, 999)
    return f"{prefix} {model} {number}"

def generate_unit_price(category):
    """Generate realistic price based on category"""
    price_ranges = {
        "Electronics": (50, 2000),
        "Clothing": (15, 200),
        "Home & Garden": (20, 500),
        "Sports": (10, 300),
        "Books": (5, 50),
        "Toys": (10, 100),
        "Automotive": (25, 500),
        "Health & Beauty": (5, 150)
    }
    min_price, max_price = price_ranges[category]
    return round(random.uniform(min_price, max_price), 2)

def introduce_data_quality_issues(data, issue_rate=0.05):
    """
    Introduce realistic data quality issues for testing
    - Missing values
    - Duplicates
    - Inconsistent formatting
    - Invalid values
    """
    # Introduce some null values (5% chance)
    if random.random() < issue_rate:
        field = random.choice(['customer_name', 'category', 'region', 'sales_person'])
        data[field] = None
    
    # Introduce inconsistent spacing/casing (3% chance)
    if random.random() < (issue_rate * 0.6):
        if data['customer_name']:
            data['customer_name'] = "  " + data['customer_name'].lower() + "  "
    
    # Introduce negative quantity (1% chance)
    if random.random() < (issue_rate * 0.2):
        data['quantity'] = -1 * random.randint(1, 5)
    
    return data

# COMMAND ----------

# ============================================================================
# GENERATE SYNTHETIC DATA
# ============================================================================

print("\nGenerating synthetic sales data...")

# List to store all records
sales_data = []

# Generate records
for i in range(1, NUM_RECORDS + 1):
    # Select random category
    category = random.choice(CATEGORIES)
    
    # Generate base record
    record = {
        "transaction_id": generate_transaction_id(i),
        "transaction_date": generate_random_date(START_DATE, END_DATE),
        "customer_id": f"CUST{str(random.randint(1, 5000)).zfill(6)}",
        "customer_name": generate_customer_name(),
        "product_id": f"PROD{str(random.randint(1, 1000)).zfill(5)}",
        "product_name": generate_product_name(category),
        "category": category,
        "quantity": random.randint(1, 10),
        "unit_price": generate_unit_price(category),
        "discount": round(random.choice([0, 0, 0, 0.05, 0.1, 0.15, 0.2, 0.25]), 2),
        "region": random.choice(REGIONS),
        "sales_person": random.choice(SALES_PERSONS)
    }
    
    # Introduce data quality issues in some records
    record = introduce_data_quality_issues(record, issue_rate=0.05)
    
    sales_data.append(record)
    
    # Progress indicator
    if i % 1000 == 0:
        print(f"  Generated {i:,} records...")

print(f"âœ“ Generated {NUM_RECORDS:,} records")

# Add some duplicate records (2% of total)
num_duplicates = int(NUM_RECORDS * 0.02)
duplicates = random.sample(sales_data, num_duplicates)
sales_data.extend(duplicates)
print(f"âœ“ Added {num_duplicates} duplicate records for quality testing")

# Shuffle the data
random.shuffle(sales_data)

# COMMAND ----------

# ============================================================================
# SAVE AS CSV
# ============================================================================

print(f"\nSaving data as CSV...")

# Convert to pandas DataFrame for easy CSV export
df_pandas = pd.DataFrame(sales_data)

# Reorder columns to match schema
column_order = [
    "transaction_id", "transaction_date", "customer_id", "customer_name",
    "product_id", "product_name", "category", "quantity", 
    "unit_price", "discount", "region", "sales_person"
]
df_pandas = df_pandas[column_order]

# Save to CSV
df_pandas.to_csv(OUTPUT_CSV_PATH, index=False)

print(f"âœ“ CSV file saved successfully")
print(f"  - Path: {OUTPUT_CSV_PATH}")
print(f"  - Size: {len(sales_data):,} rows x {len(column_order)} columns")

# Display sample
print("\nSample of generated CSV data:")
print(df_pandas.head(10))

# COMMAND ----------

# ============================================================================
# SAVE AS PARQUET
# ============================================================================

print(f"\nSaving data as Parquet...")

# Define schema for Spark DataFrame
schema = StructType([
    StructField("transaction_id", StringType(), True),
    StructField("transaction_date", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("customer_name", StringType(), True),
    StructField("product_id", StringType(), True),
    StructField("product_name", StringType(), True),
    StructField("category", StringType(), True),
    StructField("quantity", IntegerType(), True),
    StructField("unit_price", DoubleType(), True),
    StructField("discount", DoubleType(), True),
    StructField("region", StringType(), True),
    StructField("sales_person", StringType(), True)
])

# Create Spark DataFrame
df_spark = spark.createDataFrame(sales_data, schema=schema)

# Save as Parquet
df_spark.write \
    .format("parquet") \
    .mode("overwrite") \
    .save(OUTPUT_PARQUET_PATH)

print(f"âœ“ Parquet file saved successfully")
print(f"  - Path: {OUTPUT_PARQUET_PATH}")

# Display schema
print("\nParquet schema:")
df_spark.printSchema()

# COMMAND ----------

# ============================================================================
# DATA VALIDATION
# ============================================================================

print("\n" + "=" * 80)
print("DATA GENERATION SUMMARY")
print("=" * 80)

# Read back and validate CSV
df_csv_validate = spark.read \
    .format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(OUTPUT_CSV_PATH)

# Read back and validate Parquet
df_parquet_validate = spark.read \
    .format("parquet") \
    .load(OUTPUT_PARQUET_PATH)

print(f"\nâœ“ CSV Validation:")
print(f"  - Rows: {df_csv_validate.count():,}")
print(f"  - Columns: {len(df_csv_validate.columns)}")

print(f"\nâœ“ Parquet Validation:")
print(f"  - Rows: {df_parquet_validate.count():,}")
print(f"  - Columns: {len(df_parquet_validate.columns)}")

# Data quality summary
print(f"\nâœ“ Data Quality Characteristics:")
print(f"  - Clean records: ~{int(NUM_RECORDS * 0.95):,}")
print(f"  - Records with issues: ~{int(NUM_RECORDS * 0.05):,}")
print(f"  - Duplicate records: ~{num_duplicates:,}")
print(f"  - Date range: {START_DATE.date()} to {END_DATE.date()}")
print(f"  - Categories: {len(CATEGORIES)}")
print(f"  - Regions: {len(REGIONS)}")

# Category distribution
print(f"\nâœ“ Category Distribution:")
category_counts = df_parquet_validate.groupBy("category").count().orderBy("count", ascending=False)
display(category_counts)

# COMMAND ----------

# ============================================================================
# FINAL MESSAGE
# ============================================================================

print("\n" + "=" * 80)
print("âœ… DATA GENERATION COMPLETED SUCCESSFULLY")
print("=" * 80)
print(f"\nGenerated files are ready for ETL pipeline:")
print(f"  ðŸ“„ CSV: {OUTPUT_CSV_PATH}")
print(f"  ðŸ“¦ Parquet: {OUTPUT_PARQUET_PATH}")
print(f"\nNext steps:")
print(f"  1. Run the main ETL notebook")
print(f"  2. The pipeline will read these files as source data")
print(f"  3. Data quality issues will be cleaned during transformation")


SYNTHETIC DATA GENERATOR
Configuration:
  - Records to generate: 280,000
  - CSV output: /Volumes/workspace/portfolio_projects/volume_portfolio_projects/simple_etl_project_raw_data_csv/raw_sales_data_2026-02-02 18:42:54.147842.csv
  - Parquet output: /Volumes/workspace/portfolio_projects/volume_portfolio_projects/simple_etl_project_raw_data_parquet/raw_sales_data_2026-02-02 18:42:54.147881.parquet

Generating synthetic sales data...
  Generated 1,000 records...
  Generated 2,000 records...
  Generated 3,000 records...
  Generated 4,000 records...
  Generated 5,000 records...
  Generated 6,000 records...
  Generated 7,000 records...
  Generated 8,000 records...
  Generated 9,000 records...
  Generated 10,000 records...
  Generated 11,000 records...
  Generated 12,000 records...
  Generated 13,000 records...
  Generated 14,000 records...
  Generated 15,000 records...
  Generated 16,000 records...
  Generated 17,000 records...
  Generated 18,000 records...
  Generated 19,000 records...
  

category,count
Automotive,35481
Health & Beauty,35472
Clothing,35357
Home & Garden,35279
Electronics,35210
Toys,35151
Sports,35115
Books,34991
,3544



âœ… DATA GENERATION COMPLETED SUCCESSFULLY

Generated files are ready for ETL pipeline:
  ðŸ“„ CSV: /Volumes/workspace/portfolio_projects/volume_portfolio_projects/simple_etl_project_raw_data_csv/raw_sales_data_2026-02-02 18:42:54.147842.csv
  ðŸ“¦ Parquet: /Volumes/workspace/portfolio_projects/volume_portfolio_projects/simple_etl_project_raw_data_parquet/raw_sales_data_2026-02-02 18:42:54.147881.parquet

Next steps:
  1. Run the main ETL notebook
  2. The pipeline will read these files as source data
  3. Data quality issues will be cleaned during transformation
