### Reading and checking the Parquet data

In [2]:
# read_refined_data.py
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, countDistinct, mean
from pyspark.sql.types import IntegerType, DoubleType, FloatType, LongType
import os

def get_spark_session(app_name):
    """Return a SparkSession after wiring up the local Hadoop install.

    Sets ``HADOOP_HOME`` to the hard-coded Hadoop directory and makes sure
    its ``bin`` folder is on ``PATH`` before building (or reusing) the
    session.

    Args:
        app_name: Application name passed to ``SparkSession.builder.appName``.

    Returns:
        The session produced by ``getOrCreate()``, configured with
        ``spark.sql.parquet.compression.codec`` set to ``snappy``.
    """
    hadoop_path = "C:\\hadoop"
    hadoop_bin = f"{hadoop_path}\\bin"
    os.environ['HADOOP_HOME'] = hadoop_path

    # Append the bin directory only when it is missing: repeated calls
    # (e.g. re-running the cell) would otherwise grow PATH every time.
    # os.environ.get avoids a KeyError when PATH is not defined at all.
    current_path = os.environ.get('PATH', '')
    if hadoop_bin not in current_path.split(os.pathsep):
        os.environ['PATH'] = (
            f"{current_path}{os.pathsep}{hadoop_bin}" if current_path else hadoop_bin
        )

    return SparkSession.builder \
        .appName(app_name) \
        .config("spark.sql.parquet.compression.codec", "snappy") \
        .getOrCreate()

def show_basic_stats(df, dataset_name):
    """Print basic statistics for a DataFrame in an organized layout.

    First shows the distinct count of each known ID column that exists in
    ``df``; then shows the mean and count of every column whose type is
    integer, long, float or double.

    Args:
        df: Spark DataFrame to summarize.
        dataset_name: Label for the dataset; printed upper-cased in the header.

    Returns:
        None. All results are printed via ``print`` and ``DataFrame.show``.
    """
    print(f"\n🔍 Análise para {dataset_name.upper()}")

    # Distinct counts, one table per ID column that is actually present.
    for col_name in ['order_id', 'customer_id']:
        if col_name in df.columns:
            print(f"\n📊 IDs únicos de {col_name.replace('_', ' ')}:")
            df.agg(countDistinct(col_name).alias(f"unique_{col_name}")).show()

    # Numeric metrics: only these four Spark types are treated as numeric.
    numeric_types = (IntegerType, DoubleType, FloatType, LongType)
    numeric_cols = [
        field.name
        for field in df.schema.fields
        if isinstance(field.dataType, numeric_types)
    ]

    if not numeric_cols:
        return

    print("\n🧮 Métricas Numéricas:")
    # Keep avg/count adjacent per column so the output table reads in pairs.
    stats_expr = [
        expr
        for name in numeric_cols
        for expr in (
            mean(name).alias(f"avg_{name}"),
            count(name).alias(f"count_{name}"),
        )
    ]

    df.agg(*stats_expr).show(truncate=False)

def analyze_dataset(spark, dataset_name):
    """Load one refined dataset and print its schema, a sample and basic stats.

    The dataset is read as Parquet from
    ``<parent of the current working directory>/data_lake/refined/<dataset_name>``.

    Args:
        spark: SparkSession used to read the Parquet data.
        dataset_name: Name of the dataset folder under ``data_lake/refined``.

    Returns:
        The loaded DataFrame, or ``None`` if reading or any of the reporting
        steps raised; in that case the error is printed instead of propagated.
    """
    # The data lake lives next to (not inside) the directory we run from.
    BASE_DIR = os.path.dirname(os.path.abspath(os.getcwd()))
    INPUT_PATH = os.path.join(BASE_DIR, "data_lake", "refined", dataset_name)

    try:
        print(f"\n📦 Lendo {dataset_name}...")
        df = spark.read.parquet(INPUT_PATH)

        print("\n📋 Schema:")
        df.printSchema()

        print("\n📄 Amostra (5 linhas):")
        df.show(5, truncate=False)

        show_basic_stats(df, dataset_name)
        return df

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller
        # continue with the remaining datasets.
        print(f"❌ Erro em {dataset_name}: {e}")
        return None

if __name__ == "__main__":
    spark = get_spark_session("AnalysisRefinedData")

    datasets = ["orders", "order_items", "order_payments"]
    failed = []

    try:
        for dataset in datasets:
            # analyze_dataset returns None after printing an error.
            if analyze_dataset(spark, dataset) is None:
                failed.append(dataset)
    finally:
        # Always release the session, even if the loop is interrupted.
        spark.stop()

    # Only claim success for all datasets when none of them failed.
    if failed:
        print(f"\n⚠️ Análise concluída com erros em: {', '.join(failed)}")
    else:
        print("\n✅ Análise concluída para todos os datasets!")


📦 Lendo orders...

📋 Schema:
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- approval_time_hours: double (nullable = true)
 |-- processing_time_hours: double (nullable = true)
 |-- shipping_time_hours: double (nullable = true)
 |-- total_delivery_time_hours: double (nullable = true)
 |-- delivery_delay_hours: double (nullable = true)
 |-- purchase_time_period: string (nullable = true)
 |-- status_category: string (nullable = true)
 |-- delivery_performance: string (nullable = true)


📄 Amostra (5 linhas):
+--------------------------------+--------------------------------+------------+--------

+---------------+
|unique_order_id|
+---------------+
|          99441|
+---------------+


📊 IDs únicos de customer id:
+------------------+
|unique_customer_id|
+------------------+
|             99441|
+------------------+


🧮 Métricas Numéricas:
+-----------------------+-------------------------+-------------------------+---------------------------+-----------------------+-------------------------+-----------------------------+-------------------------------+------------------------+--------------------------+
|avg_approval_time_hours|count_approval_time_hours|avg_processing_time_hours|count_processing_time_hours|avg_shipping_time_hours|count_shipping_time_hours|avg_total_delivery_time_hours|count_total_delivery_time_hours|avg_delivery_delay_hours|count_delivery_delay_hours|
+-----------------------+-------------------------+-------------------------+---------------------------+-----------------------+-------------------------+-----------------------------+----------------