# Wavelet Transform



In [1]:
import os
import pandas as pd
import numpy as np
from obspy import read
from tqdm import tqdm
import scipy.stats as stats
import matplotlib.pyplot as plt
import pywt

# Paths
augmented_data_path = '/mnt/c/Users/Usuario/Documents/Studies/GicoProject/SeismicWaves/data/procesed/used_data/training_augmented'
features_path = '/mnt/c/Users/Usuario/Documents/Studies/GicoProject/SeismicWaves/data/procesed/features'

# Create features directory if it doesn't exist
if not os.path.exists(features_path):
    os.makedirs(features_path)

In [2]:
def extract_wavelet_features(signal, wavelet='db4', level=4):
    """Extract statistical features from wavelet decomposition of a signal.
    Args:
        signal: Input signal array
        wavelet: Wavelet type to use
        level: Decomposition level
    Returns:
        array: Feature vector containing statistical measures"""
    # Perform wavelet decomposition
    coeffs = pywt.wavedec(signal, wavelet, level=level)
    
    # Initialize feature list
    features = []
    
    # Extract features from each coefficient level
    for coef in coeffs:
        # Statistical features
        features.extend([
            np.mean(coef),           # Mean
            np.std(coef),            # Standard deviation
            stats.skew(coef),        # Skewness
            stats.kurtosis(coef),    # Kurtosis
            np.percentile(coef, 75), # 75th percentile
            np.percentile(coef, 25), # 25th percentile
            np.max(coef),            # Maximum
            np.min(coef),            # Minimum
            np.sum(np.abs(coef)),    # L1 norm
            np.sqrt(np.sum(coef**2)),# L2 norm
            stats.entropy(np.abs(coef)), # Signal entropy
            np.median(np.abs(coef))  # Median absolute deviation
        ])
        
    return np.array(features)

def process_seismic_files(data_path, arrival_times_csv):
    """Process all seismic files and extract wavelet features.
    Args:
        data_path: Path to directory containing MSEED files
        arrival_times_csv: Path to CSV with arrival times
    Returns:
        tuple: (features array, arrival times array, file names)"""
    # Read arrival times
    arrivals_df = pd.read_csv(arrival_times_csv)
    
    features_list = []
    arrival_times = []
    file_names = []
    
    print('Extracting wavelet features...')
    for _, row in tqdm(arrivals_df.iterrows(), total=len(arrivals_df)):
        file_path = os.path.join(data_path, row['augmented_file'])
        
        try:
            # Read seismic signal
            st = read(file_path)
            signal = st[0].data
            
            # Extract features
            features = extract_wavelet_features(signal)
            features_list.append(features)
            arrival_times.append(row['arrival_time'])
            file_names.append(row['augmented_file'])
            
        except Exception as e:
            print(f'Error processing {file_path}: {str(e)}')
            continue
    
    return np.array(features_list), np.array(arrival_times), file_names

def process_seismic_files_no_times(data_path):
    """Process seismic files and extract wavelet features without arrival times.
    Args:
        data_path: Path to directory containing MSEED files
    Returns:
        tuple: (features array, file names)"""
    features_list = []
    file_names = []
    
    # Get all MSEED files in directory
    mseed_files = [f for f in os.listdir(data_path) if f.endswith('.mseed')]
    
    print('Extracting wavelet features...')
    for filename in tqdm(mseed_files):
        file_path = os.path.join(data_path, filename)
        
        try:
            # Read seismic signal
            st = read(file_path)
            signal = st[0].data
            
            # Extract features
            features = extract_wavelet_features(signal)
            features_list.append(features)
            file_names.append(filename)
            
        except Exception as e:
            print(f'Error processing {file_path}: {str(e)}')
            continue
    
    return np.array(features_list), file_names



In [None]:
# Example usage:
data_path = '/mnt/c/Users/Usuario/Documents/Studies/GicoProject/SeismicWaves/data/raw/test'
X, files = process_seismic_files_no_times(data_path)

# Save features and filenames
output_path = '/mnt/c/Users/Usuario/Documents/Studies/GicoProject/SeismicWaves/data/procesed/features'
np.save(os.path.join(output_path, 'wavelet_features_only_testing.npy'), X)
pd.DataFrame({'file': files}).to_csv(
    os.path.join(output_path, 'feature_files_only_testing.csv'), index=False)

print(f'Extracted features shape: {X.shape}')
print(f'Number of processed files: {len(files)}')

Extracting wavelet features...


100%|██████████| 790/790 [00:40<00:00, 19.63it/s]

Extracted features shape: (790, 60)
Number of processed files: 790





In [None]:


# Process all files
augmented_path = os.path.join(augmented_data_path, 'augmented')
arrival_times_csv = os.path.join(augmented_data_path, 'arrival_times.csv')

X, y, files = process_seismic_files(augmented_path, arrival_times_csv)

# Save features and metadata
np.save(os.path.join(features_path, 'wavelet_features.npy'), X)
np.save(os.path.join(features_path, 'arrival_times.npy'), y)
pd.DataFrame({'file': files}).to_csv(
    os.path.join(features_path, 'feature_files.csv'), index=False)

print(f'Extracted features shape: {X.shape}')
print(f'Number of samples: {len(files)}')

# Update the feature extraction info
print('Features extracted per coefficient level:', 12)
print('Total features for 4 levels:', 12 * (4 + 1))  # 4 detail + 1 approximation

Extracting wavelet features...


  0%|          | 0/4989 [00:00<?, ?it/s]

100%|██████████| 4989/4989 [03:34<00:00, 23.21it/s] 


Extracted features shape: (4989, 60)
Number of samples: 4989
Features extracted per coefficient level: 12
Total features for 4 levels: 60


In [11]:
# Show example of features for one file
example_idx = 0
print(f'Features for file {files[example_idx]}:')
print('Feature vector length:', len(X[example_idx]))
print('\nFirst 10 features:')
print(X[example_idx][:10])
print(f'\nArrival time: {y[example_idx]:.2f}s')

Features for file 01010056.mseed:
Feature vector length: 60

First 10 features:
[-2.26887150e-03  1.36387755e-01  2.11445763e+00  9.02215459e+01
  1.74969649e-02 -1.97850397e-02  1.76067501e+00 -1.31033293e+00
  1.88985862e+01  2.85151844e+00]

Arrival time: 30.60s


In [10]:
import numpy as np

# Cargar los archivos
features_path = '/mnt/c/Users/Usuario/Documents/Studies/GicoProject/SeismicWaves/data/procesed/features'
X = np.load(os.path.join(features_path, 'wavelet_features.npy'))
y = np.load(os.path.join(features_path, 'arrival_times.npy'))

# Verificar las dimensiones
print(f"Forma de X: {X.shape}")
print(f"Forma de y: {y.shape}")

# Ver los primeros elementos
print("\nPrimeros 5 elementos de X:")
print(X[:5])
print("\nPrimeros 5 elementos de y:")
print(y[:5])

Forma de X: (4989, 60)
Forma de y: (4989,)

Primeros 5 elementos de X:
[[-2.26887150e-03  1.36387755e-01  2.11445763e+00  9.02215459e+01
   1.74969649e-02 -1.97850397e-02  1.76067501e+00 -1.31033293e+00
   1.88985862e+01  2.85151844e+00  4.90902773e+00  1.87466164e-02
  -4.68282065e-03  4.36041154e-01 -1.00966551e+00  3.50142234e+01
   7.43629529e-02 -8.48785630e-02  3.57025540e+00 -3.56212458e+00
   7.44136070e+01  9.11576754e+00  5.07205141e+00  8.28264720e-02
   5.64923997e-02  1.62441522e+00  4.88074603e+00  1.10670185e+02
   2.23716437e-01 -2.32970680e-01  2.68300654e+01 -1.51858868e+01
   4.54496807e+02  4.78871929e+01  5.62304109e+00  2.28371860e-01
   3.33488387e-05  1.16491400e+00 -5.91348512e-01  7.28446100e+01
   1.35390393e-01 -1.25288355e-01  1.26437588e+01 -1.69246391e+01
   6.01567335e+02  4.84525808e+01  6.10403926e+00  1.28995578e-01
  -1.18978216e-05  1.56823783e-01 -9.33989454e-01  7.02097351e+01
   2.04853454e-02 -1.97404723e-02  1.95495840e+00 -2.39828782e+00
   1.

## Procesamiento del conjunto de prueba

Procesamos el conjunto de prueba usando las mismas funciones de extracción de características que usamos para el conjunto de entrenamiento.

In [None]:
# Definir rutas
testing_data_path = '/mnt/c/Users/Usuario/Documents/Studies/GicoProject/SeismicWaves/data/procesed/used_data/testing'
test_csv_path = '/mnt/c/Users/Usuario/Documents/Studies/GicoProject/SeismicWaves/data/raw/VT_P_training.csv'

def process_arrival_times(data_path, data_name):
    """Procesa los tiempos de llegada para el conjunto de prueba."""
    # Leer CSV con tiempos de llegada de prueba
    test_df = pd.read_csv(test_csv_path)
    
    # Preparar DataFrame para almacenar tiempos de llegada
    test_times_df = pd.DataFrame(columns=['file', 'arrival_time'])
    
    print('Procesando tiempos de llegada del conjunto de prueba...')
    for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
        file_id = row['archivo']
        mseed_file = f"{file_id:08d}.mseed"
        file_path = os.path.join(data_path, mseed_file)
        
        try:
            # Leer señal y obtener tiempo relativo
            st = read(file_path)
            absolute_p_time = row['lec_p']
            relative_p_time = absolute_p_time - st[0].stats.starttime.timestamp
            
            # Agregar al DataFrame
            test_times_df = pd.concat([test_times_df, pd.DataFrame([{
                'file': mseed_file,
                'arrival_time': relative_p_time
            }])], ignore_index=True)
            
        except Exception as e:
            continue
    
    # Guardar tiempos de llegada
    test_times_df.to_csv(os.path.join(features_path, f'{data_name}.csv'), index=False)
    np.save(os.path.join(features_path, data_name), test_times_df['arrival_time'].values)
    
    print(f'Tiempos de llegada guardados en {data_path}/{data_name}.csv')
    print(f'y {features_path}/test_arrival_times.npy')
    
    return test_times_df

# Procesar tiempos de llegada
test_times_df = process_arrival_times(testing_data_path, 'test_arrival_times')

# Mostrar algunos ejemplos
#print('\nPrimeros 5 tiempos de llegada:')
# print(test_times_df.head())


val_data_path = '/mnt/c/Users/Usuario/Documents/Studies/GicoProject/SeismicWaves/data/procesed/used_data/training_augmented/val'
val_times_df = process_arrival_times(val_data_path, 'val_arrival_times')
val_times_df.head()

Procesando tiempos de llegada del conjunto de prueba...


  0%|          | 0/2500 [00:00<?, ?it/s]

  test_times_df = pd.concat([test_times_df, pd.DataFrame([{
100%|██████████| 2500/2500 [00:21<00:00, 115.43it/s]

Tiempos de llegada guardados en /mnt/c/Users/Usuario/Documents/Studies/GicoProject/SeismicWaves/data/procesed/used_data/training_augmented/val/val_arrival_times.csv
y /mnt/c/Users/Usuario/Documents/Studies/GicoProject/SeismicWaves/data/procesed/features/test_arrival_times.npy





Unnamed: 0,file,arrival_time
0,04010919.mseed,14.84
1,04020130.mseed,11.45
2,04021826.mseed,30.93
3,04031203.mseed,30.62
4,04040354.mseed,30.26


In [14]:
val_times_df.info()
test_times_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 317 entries, 0 to 316
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   file          317 non-null    object 
 1   arrival_time  317 non-null    float64
dtypes: float64(1), object(1)
memory usage: 5.1+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   file          496 non-null    object 
 1   arrival_time  496 non-null    float64
dtypes: float64(1), object(1)
memory usage: 7.9+ KB
