In [1]:
import os
import h5py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# Base directory: the current working directory of the kernel process.
# All input and output paths in this notebook are built relative to it.
DIR = os.getcwd()
# Folder the chunk CSV / HDF5 files are read from.
data_path = os.path.join(DIR, 'data/STEAD')

In [2]:
# Length of the per-trace p/s time-label vectors; preprocessing() also divides
# 6000 by this value to get the window size.
window_count = 6
# Passed to preprocessing() and used to pick the '_balance' / '_imbalance'
# dump file name when the result is pickled.
is_balance = True

def _window_one_hot(arrival_sample, window_size, window_count):
    """Return a one-hot vector marking the window that contains ``arrival_sample``.

    Parameters
    ----------
    arrival_sample : number
        Sample index of the arrival within the trace.
    window_size : int
        Number of samples per window.
    window_count : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``window_count`` with a single 1.
    """
    label = np.zeros(window_count)
    # Clamp to the last window: when window_count does not divide the trace
    # length evenly (e.g. 6000 // 7 == 857), samples in the remainder would
    # otherwise index one past the end of the label vector.
    window_idx = min(int(arrival_sample // window_size), window_count - 1)
    label[window_idx] = 1
    return label


def _load_labeled_traces(csv_file, hdf5_file, row_filter, sample_count,
                         is_event, window_count, window_size, rng):
    """Sample traces from one chunk and return them as labelled records.

    Parameters
    ----------
    csv_file, hdf5_file : str
        File names (relative to ``data_path``) of the chunk metadata and waveforms.
    row_filter : callable
        Takes the metadata DataFrame and returns a boolean mask of rows to keep.
    sample_count : int
        Number of traces to draw from the filtered rows.
    is_event : bool
        True -> p/s labels are 1 and the arrival windows are marked;
        False -> all labels are 0.
    window_count, window_size : int
        Label vector length and number of samples per window.
    rng : object with a ``choice(a, size, replace=...)`` method
        Source of randomness (``np.random`` or a ``RandomState``).

    Returns
    -------
    list of dict
        Records with keys 'data', 'p_label', 's_label', 'p_time_label',
        's_time_label'.

    Raises
    ------
    KeyError
        If a sampled trace name is not present under ``data/`` in the HDF5 file.
    """
    # reading the csv file into a dataframe:
    df = pd.read_csv(os.path.join(data_path, csv_file))
    print(f'total events in csv file: {len(df)}')

    # filtering the dataframe
    df = df[row_filter(df)]
    print(f'total events selected: {len(df)}')

    # making a list of trace names for the selected data.
    # Draw without replacement whenever enough rows exist so that no trace is
    # duplicated in the output; fall back to replacement only if the request
    # exceeds the number of available rows.
    with_replacement = sample_count > len(df)
    random_idx = rng.choice(len(df), sample_count, replace=with_replacement)
    ev_list = df['trace_name'].to_numpy()[random_idx]
    print('random choice length :', len(ev_list))

    records = []
    label_value = 1 if is_event else 0

    # retrieving selected waveforms from the hdf5 file; the context manager
    # guarantees the handle is closed even if a trace lookup fails.
    with h5py.File(os.path.join(data_path, hdf5_file), 'r') as dtfl:
        # Iterating the array directly (not enumerate(...)) lets tqdm see the
        # length and show a real progress total.
        for evi in tqdm(ev_list):
            # Index access raises KeyError for a missing trace instead of
            # silently yielding None.
            dataset = dtfl['data/' + str(evi)]

            # Copy into memory now; the dataset is unusable once the file closes.
            data = np.array(dataset)

            if is_event:
                p_time_label = _window_one_hot(
                    dataset.attrs['p_arrival_sample'], window_size, window_count)
                s_time_label = _window_one_hot(
                    dataset.attrs['s_arrival_sample'], window_size, window_count)
            else:
                p_time_label = np.zeros(window_count)
                s_time_label = np.zeros(window_count)

            records.append({
                'data': data,
                'p_label': label_value,
                's_label': label_value,
                'p_time_label': p_time_label,
                's_time_label': s_time_label,
            })

    return records


def preprocessing(window_count, is_balance, seed=None):
    """Build a labelled list of event and noise traces from the STEAD chunks.

    Events are drawn from ``chunk2`` (rows with ``source_magnitude > 3``) and
    noise traces from ``chunk1`` (rows with ``trace_category == 'noise'``).

    Parameters
    ----------
    window_count : int
        Number of windows each trace is divided into; also the length of the
        'p_time_label' / 's_time_label' vectors.
    is_balance : bool
        True  -> 10000 event traces and 10000 noise traces.
        False -> 1000 event traces and 9000 noise traces.
    seed : int, optional
        If given, sampling uses a private ``np.random.RandomState(seed)`` so the
        selection is reproducible. If None (default), the global ``np.random``
        state is used, as before.

    Returns
    -------
    list of dict
        Event records first, then noise records. Each record has keys
        'data', 'p_label', 's_label', 'p_time_label', 's_time_label'.
    """
    if is_balance:
        t_count = 10000
        f_count = 10000
    else:
        t_count = 1000
        f_count = 9000

    # 6000 is presumably the number of samples per STEAD trace -- TODO confirm.
    window_size = 6000 // window_count

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Events. A stricter filter considered earlier:
    # (trace_category == 'earthquake_local') & (source_distance_km <= 20) & (source_magnitude > 3)
    outputs = _load_labeled_traces(
        "chunk2.csv", "chunk2.hdf5",
        lambda frame: frame.source_magnitude > 3,
        t_count, True, window_count, window_size, rng)

    # Noise.
    outputs.extend(_load_labeled_traces(
        "chunk1.csv", "chunk1.hdf5",
        lambda frame: frame.trace_category == 'noise',
        f_count, False, window_count, window_size, rng))

    return outputs

# Build the list of labelled trace records; this reads the chunk CSV/HDF5
# files under data_path and prints row counts plus progress bars.
outputs = preprocessing(window_count, is_balance)

  if (await self.run_code(code, result,  async_=asy)):


total events in csv file: 200000
total events selected: 10148
random choice length : 10000


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


total events in csv file: 235426
total events selected: 235426
random choice length : 10000


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [3]:
import pickle

# The two dump files differ only in this suffix.
balance_tag = 'balance' if is_balance else 'imbalance'

dump_dir = os.path.join(DIR, 'labeled_dump_STEAD')
# open(..., 'wb') does not create missing directories, so make sure the
# target folder exists before writing.
os.makedirs(dump_dir, exist_ok=True)

dump_path = os.path.join(
    dump_dir, 'STEAD_win' + str(window_count) + '_' + balance_tag + '_dump.pkl')

# Serialize the full list of labelled records to a single pickle file.
with open(dump_path, 'wb') as f:
    pickle.dump(outputs, f)
    