In [3]:
from pathlib import Path
import yaml
import obspy
import pandas as pd
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt

# Project root is taken as the parent of the current working directory.
# NOTE(review): presumably the kernel is started from a sub-folder such as
# notebooks/ — confirm, otherwise config.yaml will not be found.
PROJECT_ROOT = Path.cwd().parent

# Explicit encoding so the config is read the same way on every platform
# (the platform default on Windows is not UTF-8).
with open(PROJECT_ROOT / "config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Data locations are used exactly as written in the config.
DATA_ROOT = Path(cfg["paths"]["data_root"])
LABELED_DATA = Path(cfg["paths"]["labeled_data"])
UNLABELED_DATA = Path(cfg["paths"]["unlabeled_data"])

# Output locations are resolved relative to the project root.
FIGURES_DIR = PROJECT_ROOT / cfg["paths"]["figures_output"]
TABLES_DIR = PROJECT_ROOT / cfg["paths"]["tables_output"]
NOTEBOOKS_DIR = PROJECT_ROOT / "notebooks"

# parents=True: the configured output paths may be nested (e.g. "out/figures"),
# in which case a plain mkdir() would raise FileNotFoundError.
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
TABLES_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT :", PROJECT_ROOT)
print("FIGURES_DIR  :", FIGURES_DIR.resolve())
print("TABLES_DIR   :", TABLES_DIR.resolve())


PROJECT_ROOT : c:\Users\Disc\Desktop\pj
FIGURES_DIR  : C:\Users\Disc\Desktop\pj\figures
TABLES_DIR   : C:\Users\Disc\Desktop\pj\tables


In [4]:
# Count every regular file, at any depth, below the unlabeled-data directory.
n_files = len([entry for entry in UNLABELED_DATA.rglob("*") if entry.is_file()])
print(f"Archivos en UNLABELED_DATA: {n_files}")


Archivos en UNLABELED_DATA: 13761


In [5]:
from pathlib import Path
import pandas as pd

# Build an index of the unlabeled files from the fields encoded in each file
# name. Expected pattern: NET.STA.LOC.CHAN.YEAR.DOY (six dot-separated parts).
# NOTE(review): only the base name is kept although rglob() is recursive; the
# export cell rebuilds paths as UNLABELED_DATA / filename, which assumes the
# files sit directly under UNLABELED_DATA — confirm.
RECORD_COLUMNS = ["network", "station", "location", "channel", "year", "doy", "filename"]

files = [p.name for p in UNLABELED_DATA.rglob("*") if p.is_file()]

records = []
skipped_files = []  # names that do not follow the expected pattern
for f in files:
    parts = f.split(".")
    if len(parts) != 6:
        skipped_files.append(f)
        continue

    net, sta, loc, chan, year, doy = parts
    try:
        year_num, doy_num = int(year), int(doy)
    except ValueError:
        # Six parts but non-numeric year/day (e.g. "a.b.c.d.tar.gz"):
        # skip it instead of aborting the whole scan.
        skipped_files.append(f)
        continue

    records.append({
        "network": net,
        "station": sta,
        "location": loc,
        "channel": chan,
        "year": year_num,
        "doy": doy_num,
        "filename": f
    })

# Fixing the columns keeps the frame usable (df["station"], groupby, ...)
# even when no file matched the pattern.
df = pd.DataFrame(records, columns=RECORD_COLUMNS)
if skipped_files:
    print(f"Archivos omitidos (nombre no reconocido): {len(skipped_files)}")
print(df.head())


  network station location channel  year  doy                 filename
0      CL    LC03       01     HHZ  2021  141  CL.LC03.01.HHZ.2021.141
1      CL    LC03       01     HHZ  2021  151  CL.LC03.01.HHZ.2021.151
2      CL    LC03       01     HHN  2021  141  CL.LC03.01.HHN.2021.141
3      CL    LC03       01     HHN  2021  151  CL.LC03.01.HHN.2021.151
4      CL    LC03       01     HHE  2021  141  CL.LC03.01.HHE.2021.141


In [6]:
# Alphabetical list of the distinct station codes found in the file index.
unique_station_codes = df["station"].unique()
stations = sorted(unique_station_codes)

print("Estaciones:")
for station_code in stations:
    print(" -", station_code)


Estaciones:
 - LC01
 - LC02
 - LC03
 - LC04
 - LC05
 - LC06
 - LC07
 - LC08
 - LC09
 - LC10
 - LC11


In [7]:
# Number of indexed files per station, largest first.
files_by_station = df.groupby("station").size()
events_per_station = files_by_station.sort_values(ascending=False)
print(events_per_station)


station
LC03    3321
LC01    3180
LC02    2661
LC04    1914
LC06     450
LC08     450
LC09     447
LC10     447
LC07     429
LC11     279
LC05     183
dtype: int64


In [8]:
# Number of indexed files per channel code.
channel_groups = df.groupby("channel")
events_per_channel = channel_groups.size()
print(events_per_channel)


channel
HHE    4587
HHN    4587
HHZ    4587
dtype: int64


In [9]:
# Station x channel table of file counts; missing combinations become 0.
pair_counts = df.groupby(["station", "channel"]).size()
station_channel = pair_counts.unstack(fill_value=0)
print(station_channel)


channel   HHE   HHN   HHZ
station                  
LC01     1060  1060  1060
LC02      887   887   887
LC03     1107  1107  1107
LC04      638   638   638
LC05       61    61    61
LC06      150   150   150
LC07      143   143   143
LC08      150   150   150
LC09      149   149   149
LC10      149   149   149
LC11       93    93    93


In [10]:
# Number of indexed files per year, in chronological order.
files_by_year = df.groupby("year").size()
events_per_year = files_by_year.sort_index()
print(events_per_year)


year
2018    4200
2021    1956
2022    3447
2023    3363
2024     795
dtype: int64


In [11]:
# Year x station table of file counts (0 where a station has no files that
# year), with the years in ascending order.
year_station_counts = df.groupby(["year", "station"]).size()
year_station_table = year_station_counts.unstack(fill_value=0)
events_station_year = year_station_table.sort_index()

print(events_station_year)


station  LC01  LC02  LC03  LC04  LC05  LC06  LC07  LC08  LC09  LC10  LC11
year                                                                     
2018      333     6   483   693   183   450   429   450   447   447   279
2021      654   654   648     0     0     0     0     0     0     0     0
2022      948   945   945   609     0     0     0     0     0     0     0
2023     1095   810  1095   363     0     0     0     0     0     0     0
2024      150   246   150   249     0     0     0     0     0     0     0


In [12]:
# File counts per (year, station) broken down by component channel.
# Only the three HH* component channels are kept before counting.
is_component_channel = df["channel"].isin(["HHE", "HHN", "HHZ"])
component_counts = (
    df[is_component_channel]
    .groupby(["year", "station", "channel"])
    .size()
)
events_full = component_counts.unstack(fill_value=0).sort_index()

print(events_full)


channel       HHE  HHN  HHZ
year station               
2018 LC01     111  111  111
     LC02       2    2    2
     LC03     161  161  161
     LC04     231  231  231
     LC05      61   61   61
     LC06     150  150  150
     LC07     143  143  143
     LC08     150  150  150
     LC09     149  149  149
     LC10     149  149  149
     LC11      93   93   93
2021 LC01     218  218  218
     LC02     218  218  218
     LC03     216  216  216
2022 LC01     316  316  316
     LC02     315  315  315
     LC03     315  315  315
     LC04     203  203  203
2023 LC01     365  365  365
     LC02     270  270  270
     LC03     365  365  365
     LC04     121  121  121
2024 LC01      50   50   50
     LC02      82   82   82
     LC03      50   50   50
     LC04      83   83   83


In [13]:
# Working subset: station LC01, years 2022-2023, three component channels.
is_lc01 = df["station"] == "LC01"
in_target_years = df["year"].isin([2022, 2023])
is_component_channel = df["channel"].isin(["HHE", "HHN", "HHZ"])

# .copy() so later edits to the subset never touch the full index.
subset_df = df[is_lc01 & in_target_years & is_component_channel].copy()

year_channel_counts = subset_df.groupby(["year", "channel"]).size()
print(year_channel_counts)


year  channel
2022  HHE        316
      HHN        316
      HHZ        316
2023  HHE        365
      HHN        365
      HHZ        365
dtype: int64


In [None]:
# Export one station/channel/year of waveform data to a single CSV file.
# Data is appended to disk in chunks so the whole year never sits in memory.

import gc
import os

from obspy import read
import pandas as pd
from tqdm import tqdm
import numpy as np

# Configuration
year = 2022
station = "LC01"
channel = "HHZ"
output_file = f"seismic_{station}_{channel}_{year}.csv"

# Select the matching files and process them in day-of-year order
files_2022 = subset_df[
    (subset_df["year"] == year) &
    (subset_df["station"] == station) &
    (subset_df["channel"] == channel)
].sort_values("doy")

print(f"Archivos a procesar: {len(files_2022)}")
print(f"Escribiendo en: {output_file}")
print("Modo: Escritura incremental (baja RAM)\n")

# Control variables
total_samples = 0
files_processed = 0
chunk_size = 100  # flush the buffer to disk every N successfully read files
buffer = []

# The first write creates/truncates the file and emits the header;
# every later write appends without a header.
write_header = True
mode = 'w'


def _trace_to_frame(tr):
    """Convert one trace into a DataFrame with 'datetime' and 'value' columns.

    Sample times are built in a single array operation (start time plus
    integer-nanosecond offsets) instead of one UTCDateTime object per sample,
    which is what made the per-sample list comprehension slow.
    """
    start_time = tr.stats.starttime
    sampling_rate = tr.stats.sampling_rate
    n_samples = tr.stats.npts

    # Offset of every sample from the trace start, rounded to whole nanoseconds
    offsets_ns = np.rint(np.arange(n_samples) * (1e9 / sampling_rate)).astype(np.int64)
    timestamps_dt = pd.Timestamp(start_time.datetime) + pd.to_timedelta(offsets_ns, unit="ns")

    return pd.DataFrame({
        'datetime': timestamps_dt,
        'value': tr.data.astype(np.float32)  # float32 halves memory versus float64
    })


def _flush_buffer(frames, path, write_mode, header):
    """Concatenate the buffered frames, write them to the CSV, empty the buffer."""
    df_chunk = pd.concat(frames, ignore_index=True)
    df_chunk.to_csv(
        path,
        mode=write_mode,
        header=header,
        index=False,
        date_format='%Y-%m-%d %H:%M:%S.%f'
    )
    # Release the chunk right away; it can be large
    frames.clear()
    del df_chunk
    gc.collect()


for idx, row in tqdm(files_2022.iterrows(), total=len(files_2022), desc="Procesando"):
    file_path = UNLABELED_DATA / row["filename"]

    # Only reading/converting is best-effort: a bad input file is reported and
    # skipped. Errors while writing the CSV are deliberately NOT swallowed.
    try:
        st = read(file_path)
        # Decimate by 10 with the anti-alias filter enabled.
        # NOTE(review): original comment said "200 Hz -> 20 Hz"; the input
        # sampling rate is not checked here — confirm.
        st.decimate(factor=10, no_filter=False)
        file_frames = [_trace_to_frame(tr) for tr in st]
    except Exception as e:
        print(f"✗ Error en {row['filename']}: {e}")
        continue

    # Commit the file only once all of its traces converted, so a failure
    # half-way through never leaves partial data in the buffer or the counters.
    buffer.extend(file_frames)
    total_samples += sum(len(frame) for frame in file_frames)
    files_processed += 1

    # Write to disk every chunk_size files
    if files_processed % chunk_size == 0 and buffer:
        _flush_buffer(buffer, output_file, mode, write_header)

        # After the first write, switch to append mode
        mode = 'a'
        write_header = False

        print(f"✓ Escritos {total_samples:,} samples ({files_processed}/{len(files_2022)} archivos)")

# Write the last (partial) chunk
if buffer:
    _flush_buffer(buffer, output_file, mode, write_header)
    mode = 'a'
    write_header = False

# Final summary. write_header is still True only if nothing was ever written,
# in which case the output file may not exist and getsize() would raise.
wrote_output = not write_header
file_size_mb = os.path.getsize(output_file) / (1024**2) if wrote_output else 0.0

print(f"\n{'='*60}")
if wrote_output:
    print(f"✓ CONVERSIÓN COMPLETADA")
else:
    print("✗ SIN DATOS: no se escribió ningún archivo")
print(f"{'='*60}")
print(f"  Archivos procesados: {files_processed}")
print(f"  Muestras totales: {total_samples:,}")
print(f"  Archivo: {output_file}")
print(f"  Tamaño: {file_size_mb:.2f} MB")
print(f"{'='*60}")

Archivos a procesar: 316
Escribiendo en: seismic_LC01_HHZ_2022.csv
Modo: Escritura incremental (baja RAM)



Procesando:   2%|▏         | 7/316 [00:45<32:16,  6.27s/it]