In [None]:
import os
import pandas as pd

def _find_files(directory, suffixes=None):
    """
    Recursively collects file paths below a directory.

    Parameters:
    directory (str): Root directory to walk.
    suffixes (str | tuple[str, ...] | None): File-name endings to keep
        (compared case-insensitively). None keeps every file.

    Returns:
    list[str]: Matching file paths, sorted so the order is reproducible
        (os.walk itself yields entries in arbitrary order).
    """
    if isinstance(suffixes, str):
        suffixes = (suffixes,)
    if suffixes is not None:
        suffixes = tuple(suffix.lower() for suffix in suffixes)

    matches = []
    for root, _, files in os.walk(directory):
        for name in files:
            if suffixes is None or name.lower().endswith(suffixes):
                matches.append(os.path.join(root, name))
    matches.sort()
    return matches


def read_and_merge_parquet_files(directory, *, suffixes=(".parquet", ".parq"), batch_size=10):
    """
    Reads all parquet files from the specified directory and its subdirectories,
    merges them into a single DataFrame, and returns it.

    Files that cannot be read are reported and skipped; the remaining files
    are still merged.

    Parameters:
    directory (str): Path to the directory containing parquet files.
    suffixes (str | tuple[str, ...] | None): File-name endings that identify
        parquet files (case-insensitive). Pass None to attempt to read every
        file found, regardless of its name.
    batch_size (int): Number of files per progress message. Must be >= 1.

    Returns:
    DataFrame or None: A single DataFrame with the concatenated data from all
        readable parquet files, or None when the directory does not exist,
        contains no matching files, or none of the files could be read.

    Raises:
    ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    # os.walk silently yields nothing for a missing path, so report that case
    # explicitly instead of letting it look like an empty directory.
    if not os.path.isdir(directory):
        print(f"\nDirectory does not exist or is not a directory: {directory}")
        return None

    # Find all Parquet files in the directory structure
    all_files = _find_files(directory, suffixes)

    print(f"\nTotal number of files found: {len(all_files)}")

    if not all_files:
        print("\nNo files found in any directory.")
        return None

    print(f"\nFound {len(all_files)} files in {directory}")

    # Read the files batch by batch. Errors are handled per file so that one
    # unreadable file does not discard the other files of its batch.
    dataframes = []
    for start in range(0, len(all_files), batch_size):
        for filepath in all_files[start:start + batch_size]:
            try:
                dataframes.append(pd.read_parquet(filepath))
            except Exception as e:  # best effort: report and keep going
                print(f"Error reading {filepath}: {e}")
        print(f"Batch {start // batch_size + 1} processed.")

    # pd.concat raises ValueError on an empty list, so bail out first.
    if not dataframes:
        print("\nNone of the files could be read.")
        return None

    # Concatenate all DataFrames
    result_df = pd.concat(dataframes, ignore_index=True)

    # Display information about the resulting DataFrame
    print("\nDataFrame Information:")
    # info() prints its report itself and returns None, so it is not wrapped
    # in print() (that would emit a stray "None").
    result_df.info()
    print(result_df.head())
    print(result_df.tail())

    # Print some statistics
    print("\nData Statistics:")
    print(result_df.describe())

    return result_df

# Usage: read and merge the files stored below the data directory.
directory = '/mnt/f/parquet/ZZ/FWU1/HHE.D'
merged_dataframe = read_and_merge_parquet_files(directory)

# The reader signals failure by returning None instead of a DataFrame.
if merged_dataframe is None:
    print("Failed to merge DataFrames.")
else:
    print("Merged DataFrame shape:", merged_dataframe.shape)


Total number of files found: 0

No files found in any directory.
Failed to merge DataFrames.
