In [None]:
def find_miniseed_files(directory):
    """Walk ``directory`` recursively, log what is found, and return the files.

    Every directory visited is printed together with its subdirectories and
    files. No filtering by name or extension is applied: every file that
    ``os.walk`` reports and that passes ``os.path.exists`` is returned.

    Args:
        directory: Root directory to search.

    Returns:
        list[str]: Paths (root joined with file name) of all existing files
        below ``directory``, in ``os.walk`` order. Empty if nothing is found
        or the directory does not exist.
    """
    print(f"\nSearching in directory: {directory}")

    found_files = []

    # Print the full directory structure
    for root, dirs, files in os.walk(directory):
        print(f"\nCurrent directory: {root}")
        print("Subdirectories:", dirs)
        print("Files:", files)

        # Print full paths of all files
        for file in files:
            file_path = os.path.join(root, file)
            print(f"File found: {file_path}")

            # os.walk also lists broken symlinks, for which exists() is False;
            # only paths that really resolve are handed back to the caller.
            if os.path.exists(file_path):
                print(f"File exists: {file_path}")
                found_files.append(file_path)
            else:
                print(f"File does not exist: {file_path}")

    # Previously the function fell off the end and returned None, so the
    # caller's `miniseed_files = find_miniseed_files(...)` was always None.
    return found_files

# Find all miniSEED files in the directory structure.
# NOTE(review): this cell neither imports `os` nor defines `base_directory`;
# both are only assigned in other cells of this notebook, so this cell fails
# on a fresh kernel unless one of those cells has been run first. Which
# directory is searched depends on which cell last assigned `base_directory`.
miniseed_files = find_miniseed_files(base_directory)

In [None]:
import os
import pandas as pd
import pyarrow.parquet as pq

# Root directory that is searched recursively for .parquet files by this cell.
# NOTE(review): hardcoded absolute path, and the name `base_directory` is
# rebound to a miniSEED directory by another cell; re-run this cell before
# re-running the Parquet code below.
base_directory = '/notebooks/parquet1'  # All output files will be in this path

# Function to recursively find all Parquet files
def find_parquet_files(directory):
    """Return the paths of every ``.parquet`` file below ``directory``.

    The tree is traversed with ``os.walk``; a file qualifies when its name
    ends with the (case-sensitive) suffix ``.parquet``.

    Args:
        directory: Root directory to search.

    Returns:
        list[str]: Matching paths in traversal order; empty if none match.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith('.parquet')
    ]

# Find all Parquet files in the directory structure
parquet_files = find_parquet_files(base_directory)

if not parquet_files:
    print(f"No Parquet files found in {base_directory}")
else:
    print(f"Found {len(parquet_files)} Parquet files in {base_directory}")

    # Read every Parquet file; unreadable files are reported and skipped so
    # that one bad file does not abort the whole load.
    dfs = []
    for file in parquet_files:
        try:
            df = pd.read_parquet(file)
            dfs.append(df)
        except Exception as e:
            print(f"Error reading {file}: {str(e)}")

    if not dfs:
        # pd.concat([]) raises ValueError("No objects to concatenate"), so the
        # case where every read failed is handled explicitly.
        print("None of the Parquet files could be read; nothing to combine.")
    else:
        # Combine all DataFrames into one, renumbering the rows
        combined_df = pd.concat(dfs, ignore_index=True)

        # Display information about the combined DataFrame.
        # DataFrame.info() prints its report itself and returns None, so it
        # must not be wrapped in print() (that emitted a stray "None" line).
        print("\nCombined DataFrame:")
        combined_df.info()
        print(combined_df.head())
        print(combined_df.tail())

        # Print some statistics.
        # 'data' is an object column holding one sequence of samples per row;
        # such values are unhashable, so describe() cannot summarise the
        # column directly. Flatten to one sample per row and describe the
        # numeric values instead.
        print("\nData statistics:")
        samples = pd.to_numeric(combined_df['data'].explode(), errors='coerce')
        print(samples.describe())


Found 1 Parquet files in /notebooks/parquet1

Combined DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   network        1 non-null      object        
 1   station        1 non-null      object        
 2   location       1 non-null      object        
 3   channel        1 non-null      object        
 4   starttime      1 non-null      datetime64[ns]
 5   endtime        1 non-null      datetime64[ns]
 6   sampling_rate  1 non-null      float64       
 7   data           1 non-null      object        
dtypes: datetime64[ns](2), float64(1), object(5)
memory usage: 196.0+ bytes
None
  network station location channel  starttime                 endtime  \
0      ZB       1       10     EH2 2019-12-20 2019-12-20 02:07:59.999   

   sampling_rate                                               data  
0         1000.0  [-0.0005

In [6]:
import os
import obspy
import pandas as pd

# Specify the base directory for your miniSEED files.
# NOTE(review): this rebinds `base_directory`, which the Parquet cell sets to
# '/notebooks/parquet1'; cells that share this name therefore depend on the
# order in which they are executed.
base_directory = '/notebooks/output/1/EH1.D'

def find_all_files(directory):
    """Return the path of every file found anywhere below ``directory``.

    Args:
        directory: Root directory to traverse with ``os.walk``.

    Returns:
        list[str]: One path per file (containing folder joined with the file
        name), in traversal order; empty if there are no files.
    """
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(directory)
        for name in names
    ]

# Find all files in the directory structure
all_files = find_all_files(base_directory)

print(f"\nTotal number of files found: {len(all_files)}")

if not all_files:
    print("\nNo files found in any directory.")
else:
    print(f"\nFound {len(all_files)} files in {base_directory}")

    # Column order of the resulting DataFrame. Passed explicitly so the frame
    # keeps these columns even when no trace could be read.
    trace_columns = [
        'network', 'station', 'location', 'channel',
        'starttime', 'endtime', 'sampling_rate', 'npts', 'data',
    ]

    # One dict per trace. Each record is built completely before it is
    # appended, so an exception part-way through a trace can no longer leave
    # the columns with different lengths (which made pd.DataFrame fail when
    # nine separate lists were appended to one after another).
    trace_records = []

    # Process files one at a time; unreadable files are reported and skipped
    for file in all_files:
        try:
            stream = obspy.read(file)
            print(f"Processing: {file}")

            for trace in stream:
                stats = trace.stats
                trace_records.append({
                    'network': stats.network,
                    'station': stats.station,
                    'location': stats.location,
                    'channel': stats.channel,
                    'starttime': stats.starttime.datetime,
                    'endtime': stats.endtime.datetime,
                    'sampling_rate': stats.sampling_rate,
                    'npts': stats.npts,
                    # NOTE(review): tolist() creates one Python object per
                    # sample, which is memory-hungry for long traces; kept so
                    # the 'data' column still holds plain lists.
                    'data': trace.data.tolist(),
                })

        except Exception as e:
            print(f"Error processing {file}: {str(e)}")

    # Create DataFrame
    df = pd.DataFrame(trace_records, columns=trace_columns)

    # Display information about the DataFrame.
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that emitted a stray "None" line).
    print("\nDataFrame Information:")
    df.info()
    print(df.head())
    print(df.tail())

    # Print some statistics
    print("\nData Statistics:")
    print(df.describe())


Total number of files found: 2

Found 2 files in /notebooks/output/1/EH1.D
Processing: /notebooks/output/1/EH1.D/ZB.1.10.EH1.D.2019.305


: 