# Gerador de Dados de Teste PySpark (IoT)
Este notebook gera três volumes de dados (**Small, Medium, Large**) e é otimizado para clusters com pouca memória RAM (ex.: nós de 8 GB): a escrita é feita em chunks para evitar erros de OOM (Out of Memory).

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, lit, rand, randn, expr, 
    current_timestamp,
    array, when,
    monotonically_increasing_id
)

# 1. Spark initialization
# Build the session step by step: application name first, then the SQL
# settings (128 shuffle partitions, adaptive query execution enabled).
_builder = SparkSession.builder.appName("DataGenerator-TCC-Parquet")
_builder = _builder.config("spark.sql.shuffle.partitions", "128")
_builder = _builder.config("spark.sql.adaptive.enabled", "true")

# Reuses an already-running session if one exists, otherwise creates it.
spark = _builder.getOrCreate()

In [None]:
# 2. Storage configuration
# Storage account name interpolated into the abfss:// URI below.
storage_account = "tccprojectdlstorage"
# Root URI under which every generated dataset is written
# (the "source" part precedes the account name in the abfss URI).
base_path = f"abfss://source@{storage_account}.dfs.core.windows.net"
# Scratch directory under base_path that receives the calibration sample.
temp_path = f"{base_path}/_temp_calibration"

In [None]:
# 3. Definição do Schema e Gerador
def generate_data(num_rows):
    """Build a DataFrame of ``num_rows`` synthetic IoT events.

    The frame starts from ``spark.range(num_rows)`` (renamed to ``event_id``)
    and then receives one randomly generated column per entry below, in this
    order: device_id, device_type, location_id, event_ts, temperature,
    pressure, energy_consumption, battery_level, status_code, tags, payload.

    Args:
        num_rows: Number of rows to generate.

    Returns:
        A lazily evaluated Spark DataFrame; nothing is computed until an
        action is triggered on it.
    """
    # Pick one of three device types using a random index in {0, 1, 2}.
    type_choices = array(lit('type-A'), lit('type-B'), lit('type-C'))
    type_index = (rand() * 3).cast('int')

    # Timestamp between now and up to 100,000,000 seconds in the past.
    now_epoch = current_timestamp().cast('long')
    seconds_back = rand() * 100000000

    generated_columns = [
        ("device_id", expr("uuid()")),
        ("device_type", type_choices[type_index]),
        # Integer in [0, 100) stored as a string.
        ("location_id", (rand() * 100).cast('int').cast('string')),
        ("event_ts", (now_epoch - seconds_back).cast('timestamp')),
        # Normally distributed readings: mean 25 / std 15 and mean 101325 / std 50.
        ("temperature", (randn() * 15) + 25),
        ("pressure", (randn() * 50) + 101325),
        # Uniform readings in [0, 10) and [0, 100).
        ("energy_consumption", rand() * 10),
        ("battery_level", rand() * 100),
        # Roughly 90% "OK" and 10% "ERROR".
        ("status_code", when(rand() < 0.9, "OK").otherwise("ERROR")),
        # Roughly 10% of rows are tagged as critical.
        ("tags", when(rand() < 0.1, "critical,high").otherwise("normal,low")),
        # Two payload variants, each chosen about half of the time.
        ("payload",
         when(rand() < 0.5, "fw=1.2.3;location_group=A;device_group=1")
         .otherwise("fw=1.2.4;location_group=B;device_group=2")),
    ]

    events = spark.range(num_rows).withColumnRenamed("id", "event_id")
    for column_name, column_expr in generated_columns:
        events = events.withColumn(column_name, column_expr)

    return events

In [None]:
# 4. Funções Utilitárias de Sistema de Arquivos
def get_fs_path_size(path_str):
    """Return the total length in bytes of the content under ``path_str``.

    The Hadoop ``FileSystem`` matching the path's URI is obtained through the
    Spark JVM gateway and queried for its content summary.

    Args:
        path_str: Path or URI understood by the Hadoop FileSystem API.

    Returns:
        The size in bytes, or ``0`` when anything goes wrong (the error is
        printed rather than raised).
    """
    try:
        jvm = spark._jvm
        hadoop_fs = jvm.org.apache.hadoop.fs
        uri = jvm.java.net.URI.create(path_str)
        filesystem = hadoop_fs.FileSystem.get(uri, spark._jsc.hadoopConfiguration())
        summary = filesystem.getContentSummary(hadoop_fs.Path(path_str))
        return summary.getLength()
    except Exception as e:
        # Best effort: report the problem and fall back to a size of zero.
        print(f"Erro ao obter tamanho do {path_str}: {e}")
        return 0

def cleanup(path_str):
    """Recursively delete ``path_str`` if it exists.

    The deletion goes through the Hadoop ``FileSystem`` resolved for the
    path's URI. Failures are printed and swallowed so a cleanup problem never
    interrupts the caller.

    Args:
        path_str: Path or URI understood by the Hadoop FileSystem API.
    """
    print(f"Limpando diretório: {path_str}")
    try:
        jvm = spark._jvm
        hadoop_fs = jvm.org.apache.hadoop.fs
        uri = jvm.java.net.URI.create(path_str)
        filesystem = hadoop_fs.FileSystem.get(uri, spark._jsc.hadoopConfiguration())
        target = hadoop_fs.Path(path_str)
        if not filesystem.exists(target):
            return
        # Second argument True requests a recursive delete.
        filesystem.delete(target, True)
    except Exception as e:
        print(f"Falha ao limpar {path_str}: {e}")

In [None]:
# 5. Calibration
# Writes a fixed-size sample as a single Parquet file, measures how many bytes
# it occupies on storage, and derives how many rows and how many output files
# each chunk should contain.
print("Iniciando calibração para estimar o tamanho dos dados...")
CALIBRATION_ROWS = 1_000_000
GB = 1024 * 1024 * 1024
MB = 1024 * 1024

TARGET_FILE_SIZE_MB = 128  # desired size of each Parquet file
TARGET_CHUNK_GB = 1        # amount of data written per append operation

df_sample = generate_data(CALIBRATION_ROWS)
# repartition(1) so the sample lands in one file and the measurement is not
# skewed by per-file overhead.
df_sample.repartition(1).write.mode("overwrite").parquet(temp_path)

size_bytes = get_fs_path_size(temp_path)
# get_fs_path_size reports failures by returning 0; without this check the
# division below would fail with an unexplained ZeroDivisionError.
if size_bytes <= 0:
    raise RuntimeError(
        f"Calibração falhou: não foi possível medir o tamanho de {temp_path} "
        f"(tamanho obtido: {size_bytes} bytes)."
    )
size_per_row = size_bytes / CALIBRATION_ROWS

# At least one row and one partition per chunk, whatever the targets are.
rows_per_chunk = max(1, int((TARGET_CHUNK_GB * GB) / size_per_row))
partitions_per_chunk = max(1, int((TARGET_CHUNK_GB * GB) / (TARGET_FILE_SIZE_MB * MB)))

print(f"Calibração concluída: {size_per_row:.8f} bytes/linha.")
print(f"Cada chunk de {TARGET_CHUNK_GB}GB terá ~{rows_per_chunk:,} linhas.")

In [None]:
# 6. Generation run
# Each scale is written as a sequence of chunks appended to the same Parquet
# directory, so only one chunk's worth of data is processed at a time.
scales = [
    ("small", 10),
    ("medium", 40),
    ("large", 100),
]

try:
    for scale_name, total_gb in scales:
        output_path = f"{base_path}/{scale_name}"
        # Start from an empty directory because chunks are written in append mode.
        cleanup(output_path)

        print(f"\n--- Iniciando Geração para '{scale_name}' ({total_gb} GB) ---")

        # NOTE(review): one chunk per GB assumes TARGET_CHUNK_GB == 1 — confirm
        # before changing the chunk size in the calibration step.
        for i in range(total_gb):
            print(f"Gerando chunk {i+1} de {total_gb}...")

            # generate_data numbers rows from 0 on every call; shift each chunk
            # by its position so event_id stays unique within the dataset
            # instead of repeating once per chunk.
            id_offset = i * rows_per_chunk
            df_chunk = generate_data(rows_per_chunk).withColumn(
                "event_id", col("event_id") + lit(id_offset)
            )

            df_chunk = df_chunk.repartition(partitions_per_chunk)
            df_chunk.write.mode("append").parquet(output_path)
finally:
    # Remove the calibration sample even when generation fails midway.
    cleanup(temp_path)

print("--- GERAÇÃO DE DADOS CONCLUÍDA ---")