In [1]:
from obspy import read
from obspy import UTCDateTime
import pandas as pd
from pathlib import Path
import numpy as np
from scipy.signal import find_peaks
import matplotlib.pyplot as plt

# Load the pick data
pick_data = pd.read_csv(
    "dataset_earthquakes/metadata.csv",
    usecols=['trace_name_original_1', 'trace_p_pick_time', 'trace_s_pick_time', 'source_sensor_distance']
)

def sanitize_filename(name):
    """Sanitize file names for consistent matching"""
    return str(name).replace(".", "").replace(":", "").replace("-", "").replace("_", "").strip().lower()

def calculate_snr(trace, time_point, pre_window=2, post_window=2):
    """
    Calculate SNR for a specific time point using pre and post windows
    """
    sampling_rate = trace.stats.sampling_rate
    
    # Convert time point to sample index
    point_index = int((time_point - trace.stats.starttime) * sampling_rate)
    
    # Calculate window indices
    noise_start = max(0, point_index - int(pre_window * sampling_rate))
    noise_end = point_index
    signal_start = point_index
    signal_end = min(len(trace.data), point_index + int(post_window * sampling_rate))
    
    # Extract windows
    noise_window = trace.data[noise_start:noise_end]
    signal_window = trace.data[signal_start:signal_end]
    
    # Calculate RMS values
    noise_rms = np.sqrt(np.mean(noise_window**2)) if len(noise_window) > 0 else 1e-10
    signal_rms = np.sqrt(np.mean(signal_window**2)) if len(signal_window) > 0 else 0
    
    return signal_rms / noise_rms

def calculate_snr_series(trace, pre_window=2, post_window=2):
    """
    Calculate SNR for entire trace using sliding windows
    """
    sampling_rate = trace.stats.sampling_rate
    window_samples = int((pre_window + post_window) * sampling_rate)
    
    # Calculate SNR at fewer points to improve performance
    step = max(1, int(sampling_rate / 10))  # Calculate SNR every 0.1 seconds
    times = []
    snr_values = []
    
    for i in range(0, len(trace.data) - window_samples, step):
        current_time = trace.stats.starttime + i/sampling_rate
        snr = calculate_snr(trace, current_time, pre_window, post_window)
        times.append(current_time)
        snr_values.append(snr)
    
    return np.array(times), np.array(snr_values)

def find_snr_peaks(times, snr_values, min_snr=1.7, min_distance_samples=10):
    """Find peaks in SNR values above threshold"""
    if len(snr_values) == 0:
        return np.array([]), np.array([])
    
    peaks, properties = find_peaks(snr_values, height=min_snr, distance=min_distance_samples)
    if len(peaks) == 0:
        return np.array([]), np.array([])
    
    return times[peaks], snr_values[peaks]

def validate_p_picks(trace, p_pick_time, time_tolerance=1.0, snr_threshold=1.7):
    """
    Validate P-picks by comparing with SNR peaks
    """
    # Calculate SNR series
    times, snr_values = calculate_snr_series(trace)
    
    # Find SNR peaks
    peak_times, peak_snrs = find_snr_peaks(times, snr_values, snr_threshold)
    
    # Calculate SNR at P-pick time
    p_pick_snr = calculate_snr(trace, p_pick_time)
    
    # Find nearest peak to P-pick
    if len(peak_times) > 0:
        time_diffs = np.abs([t.timestamp - p_pick_time.timestamp for t in peak_times])
        nearest_peak_idx = np.argmin(time_diffs)
        nearest_peak_time = peak_times[nearest_peak_idx]
        nearest_peak_snr = peak_snrs[nearest_peak_idx]
        
        # Check if within tolerance
        is_valid = time_diffs[nearest_peak_idx] <= time_tolerance
    else:
        is_valid = False
        nearest_peak_time = None
        nearest_peak_snr = None
    
    return is_valid, p_pick_snr, nearest_peak_time, nearest_peak_snr, times, snr_values, peak_times, peak_snrs

# Define the range of SNR thresholds to test
snr_thresholds = np.arange(1.0, 2.1, 0.1)

# Store results for each threshold
threshold_results = []

# Main processing loop for each SNR threshold
for snr_threshold in snr_thresholds:
    print(f"\nTesting SNR threshold: {snr_threshold:.1f}")
    
    # Reset counters for each threshold
    correct_matches = 0
    incorrect_matches = 0
    
    # Directory containing miniSEED files
    mseed_dir = "miniSEED_files"
    
    # Loop through each file in the directory
    for file_path in Path(mseed_dir).glob("*.MSEED"):
        mseed_name = sanitize_filename(file_path.name)
        matched_rows = pick_data[pick_data['trace_name_original_1'].apply(sanitize_filename) == mseed_name]
        
        if len(matched_rows) == 0:
            continue
        
        matched_row = matched_rows.iloc[0]
        p_pick = UTCDateTime(matched_row['trace_p_pick_time'])
        stream = read(file_path)
        
        for trace in stream:
            # Validate P-pick with the current SNR threshold
            is_valid, _, _, _, _, _, _, _ = validate_p_picks(trace, p_pick, snr_threshold=snr_threshold)
            
            if is_valid:
                correct_matches += 1
            else:
                incorrect_matches += 1
    
    # Store results for this threshold
    threshold_results.append({
        'snr_threshold': snr_threshold,
        'correct_matches': correct_matches,
        'incorrect_matches': incorrect_matches
    })

# Convert results to a DataFrame
threshold_results_df = pd.DataFrame(threshold_results)

# Print the results
print("\nResults for each SNR threshold:")
print(threshold_results_df)

# Find the threshold with the most correct matches
best_threshold_row = threshold_results_df.loc[threshold_results_df['correct_matches'].idxmax()]
print(f"\nBest SNR threshold: {best_threshold_row['snr_threshold']:.1f}")
print(f"Correct matches: {best_threshold_row['correct_matches']}")
print(f"Incorrect matches: {best_threshold_row['incorrect_matches']}")


Testing SNR threshold: 1.0


TypeError: 'float' object cannot be interpreted as an integer

In [5]:
#gpt version 


from obspy import read
from obspy import UTCDateTime
import pandas as pd
from pathlib import Path
import numpy as np
from scipy.signal import find_peaks
import matplotlib.pyplot as plt

# Load the pick data and ensure timestamps are read as strings
pick_data = pd.read_csv(
    "dataset_earthquakes/metadata.csv",
    usecols=['trace_name_original_1', 'trace_p_pick_time', 'trace_s_pick_time', 'source_sensor_distance'],
    dtype={'trace_p_pick_time': str, 'trace_s_pick_time': str}  # Ensure datetime columns are read as strings
)

def sanitize_filename(name):
    """Sanitize file names for consistent matching"""
    return str(name).replace(".", "").replace(":", "").replace("-", "").replace("_", "").strip().lower()

def calculate_snr(trace, time_point, pre_window=2, post_window=2):
    """
    Calculate SNR for a specific time point using pre and post windows
    """
    sampling_rate = trace.stats.sampling_rate
    
    # Convert time point to sample index
    point_index = int((time_point - trace.stats.starttime) * sampling_rate)
    
    # Calculate window indices
    noise_start = max(0, point_index - int(pre_window * sampling_rate))
    noise_end = point_index
    signal_start = point_index
    signal_end = min(len(trace.data), point_index + int(post_window * sampling_rate))
    
    # Extract windows
    noise_window = trace.data[noise_start:noise_end]
    signal_window = trace.data[signal_start:signal_end]
    
    # Calculate RMS values
    noise_rms = np.sqrt(np.mean(noise_window**2)) if len(noise_window) > 0 else 1e-10
    signal_rms = np.sqrt(np.mean(signal_window**2)) if len(signal_window) > 0 else 0
    
    return signal_rms / noise_rms

def calculate_snr_series(trace, pre_window=2, post_window=2):
    """
    Calculate SNR for entire trace using sliding windows
    """
    sampling_rate = trace.stats.sampling_rate
    window_samples = int((pre_window + post_window) * sampling_rate)
    
    # Calculate SNR at fewer points to improve performance
    step = max(1, int(sampling_rate / 10))  # Calculate SNR every 0.1 seconds
    times = []
    snr_values = []
    
    for i in range(0, len(trace.data) - window_samples, step):
        current_time = trace.stats.starttime + i/sampling_rate
        snr = calculate_snr(trace, current_time, pre_window, post_window)
        times.append(current_time)
        snr_values.append(snr)
    
    return np.array(times), np.array(snr_values)

def find_snr_peaks(times, snr_values, min_snr=1.7, min_distance_samples=10):
    """Find peaks in SNR values above threshold"""
    if len(snr_values) == 0:
        return np.array([]), np.array([])
    
    peaks, properties = find_peaks(snr_values, height=min_snr, distance=min_distance_samples)
    if len(peaks) == 0:
        return np.array([]), np.array([])
    
    return times[peaks], snr_values[peaks]

def validate_p_picks(trace, p_pick_time, time_tolerance=1.0, snr_threshold=1.7):
    """
    Validate P-picks by comparing with SNR peaks
    """
    # Calculate SNR series
    times, snr_values = calculate_snr_series(trace)
    
    # Find SNR peaks
    peak_times, peak_snrs = find_snr_peaks(times, snr_values, snr_threshold)
    
    # Calculate SNR at P-pick time
    p_pick_snr = calculate_snr(trace, p_pick_time)
    
    # Find nearest peak to P-pick
    if len(peak_times) > 0:
        time_diffs = np.abs([t.timestamp - p_pick_time.timestamp for t in peak_times])
        nearest_peak_idx = np.argmin(time_diffs)
        nearest_peak_time = peak_times[nearest_peak_idx]
        nearest_peak_snr = peak_snrs[nearest_peak_idx]
        
        # Check if within tolerance
        is_valid = time_diffs[nearest_peak_idx] <= time_tolerance
    else:
        is_valid = False
        nearest_peak_time = None
        nearest_peak_snr = None
    
    return is_valid, p_pick_snr, nearest_peak_time, nearest_peak_snr, times, snr_values, peak_times, peak_snrs

# Define the range of SNR thresholds to test
snr_thresholds = np.arange(1.0, 2.1, 0.1)

# Store results for each threshold
threshold_results = []

# Main processing loop for each SNR threshold
for snr_threshold in snr_thresholds:
    print(f"\nTesting SNR threshold: {snr_threshold:.1f}")
    
    # Reset counters for each threshold
    correct_matches = 0
    incorrect_matches = 0
    
    # Directory containing miniSEED files
    mseed_dir = "miniSEED_files"
    
    # Loop through each file in the directory
    for file_path in Path(mseed_dir).glob("*.MSEED"):
        mseed_name = sanitize_filename(file_path.name)
        matched_rows = pick_data[pick_data['trace_name_original_1'].apply(sanitize_filename) == mseed_name]
        
        if len(matched_rows) == 0:
            continue
        
        matched_row = matched_rows.iloc[0]
        
        # Fix: Ensure p_pick time is properly formatted
        try:
            p_pick = UTCDateTime(str(matched_row['trace_p_pick_time']))  # Convert to string first
        except Exception as e:
            print(f"Error parsing UTCDateTime: {e}, value: {matched_row['trace_p_pick_time']}")
            continue
        
        stream = read(file_path)
        
        for trace in stream:
            # Validate P-pick with the current SNR threshold
            is_valid, _, _, _, _, _, _, _ = validate_p_picks(trace, p_pick, snr_threshold=snr_threshold)
            
            if is_valid:
                correct_matches += 1
            else:
                incorrect_matches += 1
    
    # Store results for this threshold
    threshold_results.append({
        'snr_threshold': snr_threshold,
        'correct_matches': correct_matches,
        'incorrect_matches': incorrect_matches
    })

# Convert results to a DataFrame
threshold_results_df = pd.DataFrame(threshold_results)

# Print the results
print("\nResults for each SNR threshold:")
print(threshold_results_df)

# Find the threshold with the most correct matches
best_threshold_row = threshold_results_df.loc[threshold_results_df['correct_matches'].idxmax()]
print(f"\nBest SNR threshold: {best_threshold_row['snr_threshold']:.1f}")
print(f"Correct matches: {best_threshold_row['correct_matches']}")
print(f"Incorrect matches: {best_threshold_row['incorrect_matches']}")



Testing SNR threshold: 1.0
Error parsing UTCDateTime: 'str' object cannot be interpreted as an integer, value: nan
Error parsing UTCDateTime: 'str' object cannot be interpreted as an integer, value: nan
Error parsing UTCDateTime: 'str' object cannot be interpreted as an integer, value: nan
Error parsing UTCDateTime: 'str' object cannot be interpreted as an integer, value: nan
Error parsing UTCDateTime: 'str' object cannot be interpreted as an integer, value: nan
Error parsing UTCDateTime: 'str' object cannot be interpreted as an integer, value: nan
Error parsing UTCDateTime: 'str' object cannot be interpreted as an integer, value: nan
Error parsing UTCDateTime: 'str' object cannot be interpreted as an integer, value: nan
Error parsing UTCDateTime: 'str' object cannot be interpreted as an integer, value: nan
Error parsing UTCDateTime: 'str' object cannot be interpreted as an integer, value: nan
Error parsing UTCDateTime: 'str' object cannot be interpreted as an integer, value: nan
Erro

In [None]:
from obspy import read
from obspy import UTCDateTime
import pandas as pd
from pathlib import Path
import numpy as np
from scipy.signal import find_peaks
import matplotlib.pyplot as plt

# Load the pick data
pick_data = pd.read_csv(
    "dataset_earthquakes/metadata.csv",
    usecols=['trace_name_original_1', 'trace_p_pick_time', 'trace_s_pick_time', 'source_sensor_distance']
)

def sanitize_filename(name):
    """Sanitize file names for consistent matching"""
    return str(name).replace(".", "").replace(":", "").replace("-", "").replace("_", "").strip().lower()

def convert_to_utc(value):
    """Ensure timestamps are converted properly to UTCDateTime"""
    try:
        if isinstance(value, (float, int)):  # UNIX timestamp
            return UTCDateTime(value)
        else:  # String timestamp
            return UTCDateTime(str(value))
    except Exception as e:
        print(f"Error converting {value} to UTCDateTime: {e}")
        return None

def calculate_snr(trace, time_point, pre_window=2, post_window=2):
    """Calculate SNR for a specific time point"""
    sampling_rate = trace.stats.sampling_rate
    point_index = int((time_point - trace.stats.starttime) * sampling_rate)
    
    # Calculate window indices
    noise_start = max(0, point_index - int(pre_window * sampling_rate))
    noise_end = point_index
    signal_start = point_index
    signal_end = min(len(trace.data), point_index + int(post_window * sampling_rate))
    
    # Extract windows
    noise_window = trace.data[noise_start:noise_end]
    signal_window = trace.data[signal_start:signal_end]
    
    # Calculate RMS values
    noise_rms = np.sqrt(np.mean(noise_window**2)) if len(noise_window) > 0 else 1e-10
    signal_rms = np.sqrt(np.mean(signal_window**2)) if len(signal_window) > 0 else 0
    
    return signal_rms / noise_rms

def calculate_snr_series(trace, pre_window=2, post_window=2):
    """Calculate SNR for entire trace using sliding windows"""
    sampling_rate = trace.stats.sampling_rate
    step = max(1, int(sampling_rate / 10))  # Calculate SNR every 0.1 seconds
    times = []
    snr_values = []
    
    for i in range(0, len(trace.data) - int((pre_window + post_window) * sampling_rate), step):
        current_time = trace.stats.starttime + i / sampling_rate
        snr = calculate_snr(trace, current_time, pre_window, post_window)
        times.append(current_time)
        snr_values.append(snr)
    
    return np.array(times), np.array(snr_values)

def find_snr_peaks(times, snr_values, min_snr=1.7, min_distance_samples=10):
    """Find peaks in SNR values above threshold"""
    if len(snr_values) == 0:
        return np.array([]), np.array([])
    
    peaks, properties = find_peaks(snr_values, height=min_snr, distance=min_distance_samples)
    return times[peaks], snr_values[peaks]

def validate_p_picks(trace, p_pick_time, time_tolerance=1.0, snr_threshold=1.7):
    """Validate P-picks by comparing with SNR peaks"""
    times, snr_values = calculate_snr_series(trace)
    peak_times, peak_snrs = find_snr_peaks(times, snr_values, snr_threshold)
    p_pick_snr = calculate_snr(trace, p_pick_time)
    
    if len(peak_times) > 0:
        time_diffs = np.abs([t.timestamp - p_pick_time.timestamp for t in peak_times])
        nearest_peak_idx = np.argmin(time_diffs)
        nearest_peak_time = peak_times[nearest_peak_idx]
        nearest_peak_snr = peak_snrs[nearest_peak_idx]
        is_valid = time_diffs[nearest_peak_idx] <= time_tolerance
    else:
        is_valid = False
        nearest_peak_time = None
        nearest_peak_snr = None
    
    return is_valid, p_pick_snr, nearest_peak_time, nearest_peak_snr, times, snr_values, peak_times, peak_snrs

def plot_snr(trace_name, times, snr_values, peak_times, p_pick_time):
    """Plot SNR with detected peaks and P-pick time"""
    plt.figure(figsize=(10, 5))
    plt.plot(times, snr_values, label="SNR", color="green")
    plt.scatter(peak_times, np.full_like(peak_times, np.max(snr_values)), color="purple", label="SNR Peaks")
    plt.axvline(x=p_pick_time.timestamp, color='red', linestyle='dashed', label="P-Pick Time")
    plt.xlabel("Time (UTC)")
    plt.ylabel("SNR")
    plt.legend()
    plt.title(f"SNR Analysis for {trace_name}")
    plt.show()

def process_mseed_files(mseed_dir, snr_threshold=1.7):
    """Process miniSEED files and validate P-pick times using SNR peaks"""
    results = []
    
    for file_path in Path(mseed_dir).glob("*.MSEED"):
        mseed_name = sanitize_filename(file_path.name)
        matched_rows = pick_data[pick_data['trace_name_original_1'].apply(sanitize_filename) == mseed_name]
        
        if matched_rows.empty:
            continue

        p_pick_time = convert_to_utc(matched_rows.iloc[0]['trace_p_pick_time'])
        if p_pick_time is None:
            print(f"Skipping {file_path.name} due to timestamp conversion error.")
            continue
        
        stream = read(file_path)
        for trace in stream:
            is_valid, p_pick_snr, nearest_peak_time, nearest_peak_snr, times, snr_values, peak_times, peak_snrs = validate_p_picks(trace, p_pick_time, snr_threshold=snr_threshold)
            results.append((file_path.name, is_valid, times, snr_values, peak_times, p_pick_time))
    
    return results

# Run processing
mseed_directory = "miniSEED_files"
results = process_mseed_files(mseed_directory)

# Plot results for first valid file
for result in results:
    trace_name, is_valid, times, snr_values, peak_times, p_pick_time = result
    plot_snr(trace_name, times, snr_values, peak_times, p_pick_time)
    break  # Only plot the first one


In [None]:
from obspy import read
from obspy import UTCDateTime
import pandas as pd
from pathlib import Path
import numpy as np
from scipy.signal import find_peaks
import matplotlib.pyplot as plt

# Load pick data
pick_data = pd.read_csv(
    "dataset_earthquakes/metadata.csv",
    usecols=['trace_name_original_1', 'trace_p_pick_time', 'trace_s_pick_time', 'source_sensor_distance'],
    dtype={'trace_p_pick_time': str, 'trace_s_pick_time': str}
)

def sanitize_filename(name):
    """Sanitize file names for consistent matching"""
    return str(name).replace(".", "").replace(":", "").replace("-", "").replace("_", "").strip().lower()

def calculate_snr(trace, time_point, pre_window=2, post_window=2):
    """Calculate SNR for a specific time point"""
    sampling_rate = trace.stats.sampling_rate
    point_index = int((time_point - trace.stats.starttime) * sampling_rate)

    # Define noise and signal windows
    noise_start = max(0, point_index - int(pre_window * sampling_rate))
    noise_end = point_index
    signal_start = point_index
    signal_end = min(len(trace.data), point_index + int(post_window * sampling_rate))

    # Extract windows
    noise_window = trace.data[noise_start:noise_end]
    signal_window = trace.data[signal_start:signal_end]

    # Compute RMS
    noise_rms = np.sqrt(np.mean(noise_window**2)) if len(noise_window) > 0 else 1e-10
    signal_rms = np.sqrt(np.mean(signal_window**2)) if len(signal_window) > 0 else 0

    return signal_rms / noise_rms

def calculate_snr_series(trace, pre_window=2, post_window=2):
    """Calculate SNR for entire trace"""
    sampling_rate = trace.stats.sampling_rate
    step = max(1, int(sampling_rate / 10))  # Compute SNR every 0.1 sec

    times = []
    snr_values = []
    for i in range(0, len(trace.data) - int((pre_window + post_window) * sampling_rate), step):
        current_time = trace.stats.starttime + i / sampling_rate
        snr = calculate_snr(trace, current_time, pre_window, post_window)
        times.append(current_time)
        snr_values.append(snr)

    return np.array(times), np.array(snr_values)

def find_snr_peaks(times, snr_values, min_snr=1.7, min_distance_samples=10):
    """Find SNR peaks above threshold"""
    peaks, _ = find_peaks(snr_values, height=min_snr, distance=min_distance_samples)
    return times[peaks], snr_values[peaks]

def plot_snr(trace_name, times, snr_values, peak_times, p_pick_time, is_valid):
    """Plot SNR with peaks and P-pick time"""
    plt.figure(figsize=(10, 5))
    plt.plot(times, snr_values, color="green", label="SNR")
    plt.scatter(peak_times, np.full_like(peak_times, np.max(snr_values)), color="purple", label="SNR Peaks")
    plt.axvline(x=p_pick_time.timestamp, color='red', linestyle='dashed', label="P-Pick Time")
    
    title = f"SNR Analysis for {trace_name} - {'VALID' if is_valid else 'INVALID'}"
    plt.title(title)
    plt.xlabel("Time (UTC)")
    plt.ylabel("SNR")
    plt.legend()
    plt.show()

# Define SNR threshold range
snr_thresholds = np.arange(1.0, 2.1, 0.1)
threshold_results = []

for snr_threshold in snr_thresholds:
    print(f"\nTesting SNR threshold: {snr_threshold:.1f}")
    
    correct_matches = 0
    incorrect_matches = 0
    mseed_dir = "miniSEED_files"
    
    for file_path in Path(mseed_dir).glob("*.MSEED"):
        mseed_name = sanitize_filename(file_path.name)
        matched_rows = pick_data[pick_data['trace_name_original_1'].apply(sanitize_filename) == mseed_name]

        if len(matched_rows) == 0:
            continue
        
        matched_row = matched_rows.iloc[0]
        
        try:
            p_pick = UTCDateTime(str(matched_row['trace_p_pick_time']))
        except Exception as e:
            print(f"Error parsing UTCDateTime: {e}, value: {matched_row['trace_p_pick_time']}")
            continue
        
        stream = read(file_path)
        for trace in stream:
            is_valid, _, _, _, times, snr_values, peak_times, _ = validate_p_picks(trace, p_pick, snr_threshold=snr_threshold)
            
            if is_valid:
                correct_matches += 1
            else:
                incorrect_matches += 1
            
            # Visualize the match
            plot_snr(file_path.name, times, snr_values, peak_times, p_pick, is_valid)
    
    threshold_results.append({'snr_threshold': snr_threshold, 'correct_matches': correct_matches, 'incorrect_matches': incorrect_matches})

# Convert results to DataFrame
threshold_results_df = pd.DataFrame(threshold_results)

# Bar chart of correct/incorrect matches
plt.figure(figsize=(10, 5))
plt.bar(threshold_results_df["snr_threshold"], threshold_results_df["correct_matches"], width=0.05, label="Correct Matches", color="blue")
plt.bar(threshold_results_df["snr_threshold"], threshold_results_df["incorrect_matches"], width=0.05, label="Incorrect Matches", color="red", alpha=0.6)
plt.xlabel("SNR Threshold")
plt.ylabel("Number of Matches")
plt.legend()
plt.title("SNR Threshold Performance")
plt.show()

# Best SNR threshold
best_threshold_row = threshold_results_df.loc[threshold_results_df['correct_matches'].idxmax()]
print(f"\nBest SNR threshold: {best_threshold_row['snr_threshold']:.1f}")
print(f"Correct matches: {best_threshold_row['correct_matches']}")
print(f"Incorrect matches: {best_threshold_row['incorrect_matches']}")
