In [0]:
%python
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructType

# Function to process Parquet files with schema unification
def process_parquet_files(spark, path: str, target_schema: StructType) -> DataFrame:
    """Read every Parquet file under ``path`` and union them under one schema.

    Each file is read on its own and projected onto ``target_schema``:
    columns present in the file are cast to the target type, columns missing
    from the file are added as typed nulls, and columns that are not part of
    ``target_schema`` are dropped. The per-file results are then combined
    with ``unionByName``.

    Args:
        spark: Spark session used for all reads.
        path: Location passed to the Parquet reader; files are looked up
            recursively (``recursiveFileLookup=true``).
        target_schema: Schema every file is conformed to. Its field order
            determines the column order of each per-file projection.

    Returns:
        A DataFrame with exactly the columns of ``target_schema`` containing
        the rows of all discovered files, or an empty DataFrame with that
        schema when no input files are reported.
    """
    # Read the location once only to discover which files back it.
    discovered = spark.read.option("recursiveFileLookup", "true").parquet(path)
    files = discovered.inputFiles()  # list of full file paths

    if not files:
        # Defensive: without this guard the union below would fail with an
        # IndexError on an empty list of per-file DataFrames.
        return spark.createDataFrame([], target_schema)

    # Conform each file to the target schema individually, so files with
    # differing column sets or types can still be combined.
    dataframes = []
    for file in files:
        print(f"Processing file: {file}")

        df = spark.read.parquet(file)
        present = set(df.columns)

        # Build the whole projection and apply it in a single select.
        select_expressions = []
        for field in target_schema:
            if field.name in present:
                # Column exists in this file: cast it to the target type.
                source = col(field.name)
            else:
                # Column is absent from this file: fill it with nulls.
                source = lit(None)
            select_expressions.append(
                source.cast(field.dataType).alias(field.name)
            )

        dataframes.append(df.select(*select_expressions))

    # Combine all per-file DataFrames, aligning columns by name.
    final_df = dataframes[0]
    for df in dataframes[1:]:
        final_df = final_df.unionByName(df)

    return final_df