In [1]:
import os
import pandas as pd
import pyarrow.parquet as pq

# Root directory that is searched recursively for Parquet files.
# NOTE(review): absolute, environment-specific path — adjust when running elsewhere.
base_directory = '/notebooks/output/ZZ/'  # every *.parquet file below this path is loaded

# Function to recursively find all Parquet files
def find_parquet_files(directory):
    """Recursively collect the paths of all ``.parquet`` files below ``directory``.

    Args:
        directory: Root directory to walk (sub-directories are included).

    Returns:
        list[str]: Paths built as ``os.path.join(root, filename)`` for every
        file whose name ends with ``.parquet``, in sorted order. The list is
        empty when nothing matches or when ``directory`` cannot be listed
        (``os.walk`` ignores listing errors by default).
    """
    parquet_files = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.endswith('.parquet')
    ]
    # os.walk yields entries in an arbitrary, filesystem-dependent order;
    # sorting makes the result (and anything built from it) reproducible.
    parquet_files.sort()
    return parquet_files

# Find all Parquet files in the directory structure
parquet_files = find_parquet_files(base_directory)

if not parquet_files:
    print(f"No Parquet files found in {base_directory}")
else:
    print(f"Found {len(parquet_files)} Parquet files in {base_directory}")

    # Read every file into its own DataFrame; a file that fails to load is
    # reported and skipped so one bad file does not abort the whole run.
    dfs = []
    for file in parquet_files:
        try:
            df = pd.read_parquet(file)
            dfs.append(df)
        except Exception as e:
            print(f"Error reading {file}: {str(e)}")

    if not dfs:
        # pd.concat raises "ValueError: No objects to concatenate" on an empty
        # list, so report the situation explicitly instead.
        print("None of the Parquet files could be read; nothing to combine.")
    else:
        # Combine all DataFrames into one, renumbering the rows 0..n-1
        combined_df = pd.concat(dfs, ignore_index=True)

        # Display information about the combined DataFrame
        print("\nCombined DataFrame:")
        # info() writes its report to stdout itself and returns None, so it
        # must not be wrapped in print() (that would emit a stray "None").
        combined_df.info()
        print(combined_df.head())
        print(combined_df.tail())

        # Print some statistics
        # NOTE(review): 'data' is an object column; describe() on it presumably
        # requires hashable values — confirm it works for the stored payloads.
        print("\nData statistics:")
        print(combined_df['data'].describe())


Found 60 Parquet files in /notebooks/output/ZZ/

Combined DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   network        60 non-null     object        
 1   station        60 non-null     object        
 2   location       60 non-null     object        
 3   channel        60 non-null     object        
 4   starttime      60 non-null     datetime64[ns]
 5   endtime        60 non-null     datetime64[ns]
 6   sampling_rate  60 non-null     float64       
 7   data           60 non-null     object        
dtypes: datetime64[ns](2), float64(1), object(5)
memory usage: 3.9+ KB
None
  network station location channel  starttime                 endtime  \
0      ZZ    FWU3       10     HHE 2019-12-27 2019-12-27 01:55:33.216   
1      ZZ    FWU3       10     HHN 2019-12-27 2019-12-27 01:55:33.468   
2      ZZ    FWU3       

In [2]:
# Summary statistics of the combined frame. Relies on `combined_df` having
# been created by the previous cell. In the captured output only the
# datetime and float columns are summarised; the object columns are omitted.
combined_df.describe()

Unnamed: 0,starttime,endtime,sampling_rate
count,60,60,60.0
mean,2019-12-29 13:12:00,2019-12-30 06:47:43.274866432,250.0
min,2019-12-17 00:00:00,2019-12-17 15:38:55.272000,250.0
25%,2019-12-30 12:00:00,2019-12-30 17:13:37.558000128,250.0
50%,2019-12-31 00:00:00,2019-12-31 23:59:59.996000,250.0
75%,2019-12-31 00:00:00,2019-12-31 23:59:59.996000,250.0
max,2019-12-31 00:00:00,2019-12-31 23:59:59.996000,250.0
std,,,0.0
