This notebook will collect waveform data and format them into a seisbench compatible hdf5 format in order to disseminate the data sets.


by Marine Denolle (mdenolle@uw.edu)



In [1]:
from obspy.clients.fdsn import Client
import numpy as np
import obspy
import matplotlib.pyplot as plt
import datetime
from datetime import timedelta
import pandas as pd

# from pnwstore.mseed import WaveformClient

In [21]:

# Define clients
client_iris = Client('IRIS')
# client_waveform = WaveformClient()
client_ncedc = Client('NCEDC')


In [9]:
# Read the association table
import os

# Define the file path
file_path = '~/arrival_2010_2015_reloc_cog_ver3.csv'

# Expand the home directory
expanded_path = os.path.expanduser(file_path)

# Check if the file exists
print(f"Checking if file exists at path: {expanded_path}")
if os.path.exists(expanded_path):
    print(f"File exists. Size: {os.path.getsize(expanded_path)} bytes")
    # Try to read the file
    try:
        assoc_df = pd.read_csv(expanded_path)
        print("Successfully read the file!")
    except Exception as e:
        print(f"Error reading the file: {e}")
else:
    print("File does not exist at the specified path!")
    # Look for similar files in the directory
    dir_path = os.path.dirname(expanded_path)
    print(f"Files in {dir_path}:")
    similar_files = [f for f in os.listdir(dir_path) if 'arrival' in f.lower()]
    for file in similar_files:
        print(f" - {file}")

Checking if file exists at path: /Users/marinedenolle/arrival_2010_2015_reloc_cog_ver3.csv
File exists. Size: 45602750 bytes
Successfully read the file!
Successfully read the file!


In [11]:
# Remove the "Unnamed: 0" column which contains only indexes
cleaned_df = assoc_df.drop(columns=["Unnamed: 0"])

# Define the new file path
new_file_path = os.path.join(os.path.dirname(expanded_path), 'arrival_2010_2015_reloc_cog_ver3_cleaned.csv')

# Save the cleaned dataframe to a new CSV file without the index
cleaned_df.to_csv(new_file_path, index=False)

# Verify the new file was created and print its size
if os.path.exists(new_file_path):
    print(f"Successfully saved cleaned data to: {new_file_path}")
    print(f"File size: {os.path.getsize(new_file_path) / (1024 * 1024):.2f} MB")
    
# Display the first few rows of the cleaned dataframe
cleaned_df.head()

Successfully saved cleaned data to: /Users/marinedenolle/arrival_2010_2015_reloc_cog_ver3_cleaned.csv
File size: 65.62 MB


Unnamed: 0,sta,time,arid,iphase,prob,datetime
0,UW.PCMD,1262305000.0,0,P,0.68,2010-01-01 00:15:27.180000000
1,UW.RVW,1262305000.0,1,P,0.68,2010-01-01 00:15:37.840399872
2,UW.PCMD,1262305000.0,2,S,0.68,2010-01-01 00:15:33.280000000
3,UW.GNW,1262305000.0,3,S,0.68,2010-01-01 00:15:42.002000128
4,PB.B013,1262305000.0,4,S,0.68,2010-01-01 00:15:43.618400000


In [22]:

# Display basic information about the dataset
print(f"Number of records: {len(assoc_df)}")
print("First few rows:")
print(assoc_df.head())
print("Columns:", assoc_df.columns.tolist())

# Assuming the epoch time column is named 'time' - adjust if needed
time_column = 'time'  # Change this if your column has a different name
if time_column in assoc_df.columns:
    # Convert epoch time to datetime
    assoc_df['datetime'] = pd.to_datetime(assoc_df[time_column], unit='s')
    
    # Create a function to convert to UTCDateTime when needed
    def to_utc_datetime(dt):
        return obspy.UTCDateTime(dt)
    
    # Example conversion
    print("\nExample time conversion:")
    example = assoc_df.iloc[0]
    print(f"Epoch: {example[time_column]}")
    print(f"Datetime: {example['datetime']}")
    print(f"UTCDateTime: {to_utc_datetime(example['datetime'])}")

# Extract unique station information
station_col = 'station'  # Change if needed
network_col = 'network'  # Change if needed

if station_col in assoc_df.columns and network_col in assoc_df.columns:
    station_list = assoc_df[[network_col, station_col]].drop_duplicates().reset_index(drop=True)
    print(f"\nFound {len(station_list)} unique stations:")
    print(station_list.head(10))

Number of records: 1004335
First few rows:
   Unnamed: 0      sta          time  arid iphase  prob  \
0           0  UW.PCMD  1.262305e+09     0      P  0.68   
1           1   UW.RVW  1.262305e+09     1      P  0.68   
2           2  UW.PCMD  1.262305e+09     2      S  0.68   
3           3   UW.GNW  1.262305e+09     3      S  0.68   
4           4  PB.B013  1.262305e+09     4      S  0.68   

                       datetime  
0 2010-01-01 00:15:27.180000000  
1 2010-01-01 00:15:37.840399872  
2 2010-01-01 00:15:33.280000000  
3 2010-01-01 00:15:42.002000128  
4 2010-01-01 00:15:43.618400000  
Columns: ['Unnamed: 0', 'sta', 'time', 'arid', 'iphase', 'prob', 'datetime']

Example time conversion:
Epoch: 1262304927.18
Datetime: 2010-01-01 00:15:27.180000
UTCDateTime: 2010-01-01T00:15:27.180000Z


In [None]:
# Extract network and station from sta column
df = cleaned_df.copy()
df[['network', 'station']] = df['sta'].str.split('.', expand=True)

# Set constants
sampling_rate = 100  # Hz
pre_arrival_time = 3  # seconds before first arrival
post_arrival_time = 60  # seconds before first arrival

# Create a unique event ID for each unique time
# This assumes different events have different timestamps
df['event_id'] = 'ev' + df['time'].astype(str).str.replace('.', '_')

# Group by event_id, network and station to combine P and S arrivals
rows = []
for (event_id, network, station), group in df.groupby(['event_id', 'network', 'station']):
    # Find P and S arrivals
    p_arrival = group[group['iphase'] == 'P']
    s_arrival = group[group['iphase'] == 'S']
    if s_arrival.empty:
        print(f"No S arrival for event {event_id} at station {station}")
        
    
    # Calculate trace start time (30 seconds before the first arrival)
    first_arrival = group['time'].min()
    trace_start = first_arrival - pre_arrival_time
    trace_end = first_arrival + post_arrival_time  # 30 seconds after the first arrival

    # Convert to UTCDateTime
    trace_start1 = obspy.UTCDateTime(trace_start).strftime('%Y-%m-%dT%H:%M:%S')
    trace_end1 = obspy.UTCDateTime(trace_end).strftime('%Y-%m-%dT%H:%M:%S')
    print(f"Trace start: {trace_start}, Trace end: {trace_end}")
    # download the waveform data
    print(network, station, trace_start)
    sta = client_iris.get_stations(network=network, station=station,location="*",channel="*", \
                                   starttime=trace_start1 , endtime=trace_end1)
    print(sta)
    waveform = client_iris.get_waveforms(network=network, station=station, location='*',channel= '*',\
                                          starttime=trace_start1 , endtime= trace_end1)
    print(waveform.data.shape)
    #save the waveform data to a seisbench format
    # h5_file = f"{network}.{station}..HHZ.{trace_start.strftime('%Y%m%dT%H%M%S')}.h5"
    # waveform.write(h5_file, format='HDF5')

    
    # Calculate P and S samples
    p_sample = None
    if not p_arrival.empty:
        p_time = p_arrival['time'].iloc[0]
        p_sample = int((p_time - trace_start) * sampling_rate)
    
    s_sample = None
    if not s_arrival.empty:
        s_time = s_arrival['time'].iloc[0]
        s_sample = int((s_time - trace_start) * sampling_rate)
    
    # Create row
    row = {
        "event_id": event_id,
        "source_type": "earthquake",
        "station_network_code": network,
        "station_channel_code": "?H",  # Default channel code
        "station_code": station,
        "station_location_code": "",    # Default location code
        "station_latitude_deg": None,   # Station metadata not available
        "station_longitude_deg": None,
        "station_elevation_m": None,
        "trace_name": ,  # Network.Station.Location.Channel
        "trace_sampling_rate_hz": sampling_rate,
        "trace_start_time": trace_start,
        "trace_S_arrival_sample": s_sample,
        "trace_S_onset": "impulsive" if s_sample is not None else None,
        "trace_P_arrival_sample": p_sample,
        "trace_P_onset": "impulsive" if p_sample is not None else None,
        "trace_snr_db": None  # No SNR information available
    }
    

    rows.append(row)

# Create the seisbench metadata dataframe
seisbench_df = pd.DataFrame(rows)

# Display the first few rows
print(f"Created {len(seisbench_df)} metadata entries")
seisbench_df.head()

  df['event_id'] = 'ev' + df['time'].astype(str).str.replace('.', '_')


No S arrival for event ev1262304927_18 at station PCMD
Trace start: 1262304924.18, Trace end: 1262304987.18
UW PCMD 1262304924.18
Inventory created at 2025-05-23T14:13:47.966400Z
	Created by: IRIS WEB SERVICE: fdsnws-station | version: 1.1.52
		    http://service.iris.edu/fdsnws/station/1/query?starttime=2010-01-...
	Sending institution: IRIS-DMC (IRIS-DMC)
	Contains:
		Networks (1):
			UW
		Stations (1):
			UW.PCMD (PC Mountain Detachment ANSS-SMO)
		Channels (0):

Inventory created at 2025-05-23T14:13:47.966400Z
	Created by: IRIS WEB SERVICE: fdsnws-station | version: 1.1.52
		    http://service.iris.edu/fdsnws/station/1/query?starttime=2010-01-...
	Sending institution: IRIS-DMC (IRIS-DMC)
	Contains:
		Networks (1):
			UW
		Stations (1):
			UW.PCMD (PC Mountain Detachment ANSS-SMO)
		Channels (0):



TypeError: starttime and endtime must be UTCDateTime objects or None for this call to Stream.trim()

In [None]:
# Verify the saved H5 file
import h5py
import pandas as pd
output_dir = os.path.expanduser('~/seisbench_data')
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
# Path to the generated H5 file
h5_path = os.path.join(output_dir, 'cascadia_waveforms.h5')

if os.path.exists(h5_path):
    with h5py.File(h5_path, 'r') as f:
        # Print basic file information
        print(f"SeisBench version: {f.attrs.get('seisbench_version')}")
        
        # List groups
        print("\nGroups in file:")
        for group_name in f.keys():
            print(f" - {group_name}")
        
        # Print metadata statistics
        print("\nMetadata statistics:")
        if 'metadata' in f:
            metadata_group = f['metadata']
            for key in metadata_group.keys():
                data = metadata_group[key][:]
                print(f" - {key}: {len(data)} entries")
                
            # Convert metadata to DataFrame for viewing
            metadata_dict = {key: metadata_group[key][:] for key in metadata_group.keys()}
            metadata_df = pd.DataFrame(metadata_dict)
            print("\nSample of metadata:")
            display(metadata_df.head())
        
        # Print waveform statistics
        print("\nWaveform statistics:")
        if 'waveforms' in f:
            waveforms_group = f['waveforms']
            print(f" - Number of waveforms: {len(waveforms_group.keys())}")
            
            # Show information for first few waveforms
            print("\nSample of waveforms:")
            for i, key in enumerate(list(waveforms_group.keys())[:5]):
                waveform = waveforms_group[key][:]
                print(f" - {key}: shape={waveform.shape}, min={waveform.min():.2f}, max={waveform.max():.2f}")
                
            # Plot a sample waveform if matplotlib is available
            if list(waveforms_group.keys()):
                sample_key = list(waveforms_group.keys())[0]
                sample_waveform = waveforms_group[sample_key][:]
                
                plt.figure(figsize=(12, 4))
                plt.plot(sample_waveform)
                plt.title(f"Sample Waveform: {sample_key}")
                plt.xlabel("Samples")
                plt.ylabel("Amplitude")
                plt.tight_layout()
                plt.show()
else:
    print(f"H5 file not found at {h5_path}")
</VSCode.Cell>