In [None]:
import os
import pandas as pd

def read_and_merge_parquet_files(directory, output_csv='merged_seismic_data2.csv'):
    """
    Reads every file found under the specified directory (recursively) as
    parquet, concatenates the results into a single DataFrame, writes that
    DataFrame to a CSV file, and returns it.

    Parameters:
    directory (str): Path to the directory that is walked recursively. Every
        file found is passed to ``pd.read_parquet``; files that cannot be read
        are reported and skipped.
    output_csv (str): Path of the CSV file the merged DataFrame is written to.

    Returns:
    DataFrame or None: The concatenated DataFrame, or None when the directory
        contains no files or none of the files could be read.
    """

    # Function to find all files recursively
    def find_all_files(root_dir):
        # Sorted because os.walk yields entries in arbitrary order; sorting
        # keeps the row order of the merged DataFrame reproducible.
        return sorted(
            os.path.join(root, name)
            for root, _, names in os.walk(root_dir)
            for name in names
        )

    all_files = find_all_files(directory)

    print(f"\nTotal number of files found: {len(all_files)}")

    if not all_files:
        print("\nNo files found in any directory.")
        return None

    print(f"\nFound {len(all_files)} files in {directory}")

    # Read files in batches; the batch only groups the progress messages.
    dataframes = []
    batch_size = 10
    for i in range(0, len(all_files), batch_size):
        for filepath in all_files[i:i + batch_size]:
            # Handle errors per file so one unreadable file does not discard
            # the other files of its batch.
            try:
                dataframes.append(pd.read_parquet(filepath))
            except Exception as e:
                print(f"Error processing {filepath}: {e}")
        print(f"Batch {i//batch_size + 1} processed.")

    # pd.concat raises ValueError on an empty list, so bail out explicitly.
    if not dataframes:
        print("\nNo files could be read as parquet.")
        return None

    # Concatenate all DataFrames
    result_df = pd.concat(dataframes, ignore_index=True)

    # Display information about the resulting DataFrame
    print("\nDataFrame Information:")
    # info() prints its report itself and returns None, so it is not wrapped
    # in print().
    result_df.info()
    print(result_df.head())
    print(result_df.tail())

    # Print some statistics
    print("\nData Statistics:")
    print(result_df.describe())

    # to_csv returns None when given a path, so write first and return the
    # DataFrame itself.
    # NOTE(review): object-dtype columns are stored as their string
    # representation in CSV - confirm this is acceptable for the payload column.
    result_df.to_csv(output_csv)
    return result_df

# Usage: run the merge on the chosen directory and report the outcome
directory = '/notebooks/Mine-folder/output/FWU3/HHE.D_Partial'
merged_dataframe = read_and_merge_parquet_files(directory)

# Handle the "nothing returned" case first, otherwise show the shape
if merged_dataframe is None:
    print("Failed to merge DataFrames.")
else:
    print("Merged DataFrame shape:", merged_dataframe.shape)

In [3]:
 import os
import pandas as pd

def read_and_merge_parquet_files(directory, output_csv='merged_seismic_data1.csv'):
    """
    Reads every file found under the specified directory (recursively) as
    parquet, concatenates the results into a single DataFrame, writes that
    DataFrame to a CSV file, and returns it.

    Parameters:
    directory (str): Path to the directory that is walked recursively. Every
        file found is passed to ``pd.read_parquet``; files that cannot be read
        are reported and skipped.
    output_csv (str): Path of the CSV file the merged DataFrame is written to.

    Returns:
    DataFrame or None: The concatenated DataFrame, or None when the directory
        contains no files or none of the files could be read.
    """

    # Function to find all files recursively
    def find_all_files(root_dir):
        # Sorted because os.walk yields entries in arbitrary order; sorting
        # keeps the row order of the merged DataFrame reproducible.
        return sorted(
            os.path.join(root, name)
            for root, _, names in os.walk(root_dir)
            for name in names
        )

    all_files = find_all_files(directory)

    print(f"\nTotal number of files found: {len(all_files)}")

    if not all_files:
        print("\nNo files found in any directory.")
        return None

    print(f"\nFound {len(all_files)} files in {directory}")

    # Read files in batches; the batch only groups the progress messages.
    dataframes = []
    batch_size = 10
    for i in range(0, len(all_files), batch_size):
        for filepath in all_files[i:i + batch_size]:
            # Handle errors per file so one unreadable file does not discard
            # the other files of its batch.
            try:
                dataframes.append(pd.read_parquet(filepath))
            except Exception as e:
                print(f"Error processing {filepath}: {e}")
        print(f"Batch {i//batch_size + 1} processed.")

    # pd.concat raises ValueError on an empty list, so bail out explicitly.
    if not dataframes:
        print("\nNo files could be read as parquet.")
        return None

    # Concatenate all DataFrames
    result_df = pd.concat(dataframes, ignore_index=True)

    # Display information about the resulting DataFrame
    print("\nDataFrame Information:")
    # info() prints its report itself and returns None, so it is not wrapped
    # in print().
    result_df.info()
    print(result_df.head())
    print(result_df.tail())

    # Print some statistics
    print("\nData Statistics:")
    print(result_df.describe())

    # to_csv returns None when given a path, so write first and return the
    # DataFrame itself.
    # NOTE(review): object-dtype columns are stored as their string
    # representation in CSV - confirm this is acceptable for the payload column.
    result_df.to_csv(output_csv)
    return result_df

# Usage: run the merge on the chosen directory and report the outcome
directory = '/notebooks/Mine-folder/output/FWU3/HHE.D'
merged_dataframe = read_and_merge_parquet_files(directory)

# Handle the "nothing returned" case first, otherwise show the shape
if merged_dataframe is None:
    print("Failed to merge DataFrames.")
else:
    print("Merged DataFrame shape:", merged_dataframe.shape)


Total number of files found: 33

Found 33 files in /notebooks/Mine-folder/output/FWU3/HHE.D
Batch 1 processed.
Batch 2 processed.
Batch 3 processed.
Batch 4 processed.

DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   network        33 non-null     object 
 1   station        33 non-null     object 
 2   location       33 non-null     object 
 3   channel        33 non-null     object 
 4   starttime      33 non-null     object 
 5   endtime        33 non-null     object 
 6   sampling_rate  33 non-null     float64
 7   data           33 non-null     object 
dtypes: float64(1), object(7)
memory usage: 2.2+ KB
None
  network station location channel            starttime  \
0      ZZ    FWU3       10     HHE  2019-12-26T00:00:00   
1      ZZ    FWU3       10     HHE  2019-11-02T00:00:00   
2      ZZ    FWU3       10     

In [4]:
 import os
import pandas as pd

def read_and_merge_parquet_files(directory, output_csv='merged_seismic_data2.csv'):
    """
    Reads every file found under the specified directory (recursively) as
    parquet, concatenates the results into a single DataFrame, writes that
    DataFrame to a CSV file, and returns it.

    Parameters:
    directory (str): Path to the directory that is walked recursively. Every
        file found is passed to ``pd.read_parquet``; files that cannot be read
        are reported and skipped.
    output_csv (str): Path of the CSV file the merged DataFrame is written to.

    Returns:
    DataFrame or None: The concatenated DataFrame, or None when the directory
        contains no files or none of the files could be read.
    """

    # Function to find all files recursively
    def find_all_files(root_dir):
        # Sorted because os.walk yields entries in arbitrary order; sorting
        # keeps the row order of the merged DataFrame reproducible.
        return sorted(
            os.path.join(root, name)
            for root, _, names in os.walk(root_dir)
            for name in names
        )

    all_files = find_all_files(directory)

    print(f"\nTotal number of files found: {len(all_files)}")

    if not all_files:
        print("\nNo files found in any directory.")
        return None

    print(f"\nFound {len(all_files)} files in {directory}")

    # Read files in batches; the batch only groups the progress messages.
    dataframes = []
    batch_size = 10
    for i in range(0, len(all_files), batch_size):
        for filepath in all_files[i:i + batch_size]:
            # Handle errors per file so one unreadable file does not discard
            # the other files of its batch.
            try:
                dataframes.append(pd.read_parquet(filepath))
            except Exception as e:
                print(f"Error processing {filepath}: {e}")
        print(f"Batch {i//batch_size + 1} processed.")

    # pd.concat raises ValueError on an empty list, so bail out explicitly.
    if not dataframes:
        print("\nNo files could be read as parquet.")
        return None

    # Concatenate all DataFrames
    result_df = pd.concat(dataframes, ignore_index=True)

    # Display information about the resulting DataFrame
    print("\nDataFrame Information:")
    # info() prints its report itself and returns None, so it is not wrapped
    # in print().
    result_df.info()
    print(result_df.head())
    print(result_df.tail())

    # Print some statistics
    print("\nData Statistics:")
    print(result_df.describe())

    # to_csv returns None when given a path, so write first and return the
    # DataFrame itself.
    # NOTE(review): object-dtype columns are stored as their string
    # representation in CSV - confirm this is acceptable for the payload column.
    result_df.to_csv(output_csv)
    return result_df

# Usage: run the merge on the chosen directory and report the outcome
directory = '/notebooks/Mine-folder/output/FWU3/HHE.D_Partial'
merged_dataframe = read_and_merge_parquet_files(directory)

# Handle the "nothing returned" case first, otherwise show the shape
if merged_dataframe is None:
    print("Failed to merge DataFrames.")
else:
    print("Merged DataFrame shape:", merged_dataframe.shape)


Total number of files found: 25

Found 25 files in /notebooks/Mine-folder/output/FWU3/HHE.D_Partial
Batch 1 processed.
Batch 2 processed.
Batch 3 processed.

DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   network        25 non-null     object 
 1   station        25 non-null     object 
 2   location       25 non-null     object 
 3   channel        25 non-null     object 
 4   starttime      25 non-null     object 
 5   endtime        25 non-null     object 
 6   sampling_rate  25 non-null     float64
 7   data           25 non-null     object 
dtypes: float64(1), object(7)
memory usage: 1.7+ KB
None
  network station location channel            starttime  \
0      ZZ    FWU3       10     HHE  2019-12-04T00:00:00   
1      ZZ    FWU3       10     HHE  2019-12-18T00:00:00   
2      ZZ    FWU3       10     HHE  2019-1