# Process Data

This notebook goes through the steps to process raw .pkl files into a usable format by
1. Extracting the data and metadata for each earthquake
2. Selecting a subset of the stations that have data to use in the analysis
3. Compressing the data via subsampling

In [7]:
## Imports

import datetime
import json
import os
import pickle
from pathlib import Path
from pprint import pprint

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

matplotlib.rc('font', **{'size': 18})

In [16]:
# Input and output locations for the whole pipeline (relative paths).
data_dir = "../data/"
load_path = '../data/Ridgecrest/'  # directory scanned for the raw *.pkl files

# One sub-directory of data_dir per kind of output written by the cells below.
raw_path, meta_path, compressed_path, relevant_path = (
    os.path.join(data_dir, stage)
    for stage in ("raw", "metadata", "compressed", "relevant_stations")
)

### Step-1: Extract data for earthquakes

#### Part a: Extract accelerations

In [1]:
def get_indices(station_start_time, earthquake_start_time, period = 6000, step=100, offset=15):
    """Return the (start, end) sample indices of an earthquake window in a station record.

    The time elapsed between the station's start and the earthquake is
    multiplied by 100 and truncated to an integer; the window then begins
    ``offset * step`` samples earlier and spans ``period`` samples.

    Args:
        station_start_time: Start time of the station's record.
        earthquake_start_time: Time of the earthquake. Subtracting
            ``station_start_time`` from it must yield a number (presumably
            seconds, with 100 samples per second; TODO confirm).
        period: Length of the window in samples.
        step: Number of samples per unit of ``offset``.
        offset: How far before the earthquake the window starts, in units of
            ``step`` samples.

    Returns:
        Tuple ``(start_idx, end_idx)``. The indices are not clamped, so the
        start may be negative and the end may exceed the record length;
        callers are expected to check.
    """
    elapsed = earthquake_start_time - station_start_time
    lead_in = offset * step
    window_start = int(100 * elapsed) - lead_in
    return (window_start, window_start + period)


In [None]:
input_dir = Path(load_path)

# Create the output directory before listing it. makedirs(exist_ok=True)
# replaces the check-then-mkdir pair and also creates any missing parents.
output_dir = Path(raw_path)
os.makedirs(output_dir, exist_ok=True)

# Day ids (file name up to the first '.') that already have a saved .npy file.
output_files = [f.name.split('.')[0] for f in sorted(output_dir.glob("*.npy"))]
print(output_files)

def np_from_pkl(data_path, save_path, period = 6000):
    """Cut a fixed-length window around each event for each station; save one .npy per pickle.

    Every ``*.pkl`` file in ``data_path`` is processed in sorted order. The
    pickle must hold a mapping with a ``'stations'`` mapping (each value
    providing ``'starttime'`` and ``'data'``) and an iterable ``'events'``
    whose items provide ``'time'``. For every station/event pair the slice
    ``data[start:end]`` given by ``get_indices`` is transposed and collected;
    events whose window does not fit inside the station's data are skipped.
    The result is saved to ``<save_path>/<day_id>.npy``, where ``day_id`` is
    the pickle's file name up to the first '.'.

    Args:
        data_path: Directory containing the input ``*.pkl`` files.
        save_path: Existing directory that receives the ``.npy`` files.
        period: Window length in samples, forwarded to ``get_indices``.

    Pickles that fail to load are reported and skipped.
    """
    for f in tqdm(sorted(Path(data_path).glob("*.pkl"))):
        day_id = f.name.split('.')[0]
        with open(f, 'rb') as fp:
            try:
                # NOTE(review): pickle.load can execute arbitrary code; only
                # run this on files from a trusted source.
                data = pickle.load(fp)
            except Exception:
                # Best effort: report the unreadable file and move on.
                # (A bare `except:` would also swallow KeyboardInterrupt.)
                print("Pickle Corrupted")
                print("Corrupted path: {}".format(f))
                continue

        stations = []
        for record in data['stations'].values():
            samples = record['data']
            events = []
            for event in data['events']:
                # `period` was previously ignored here, so a caller-supplied
                # window length had no effect.
                start_idx, end_idx = get_indices(record['starttime'], event['time'], period=period)
                if (end_idx > len(samples) - 1) or (start_idx < 0):
                    continue  # window does not fit inside this record: skip event
                # assumes each slice is 2-D (time, channel) -- TODO confirm
                events.append(np.transpose(samples[start_idx: end_idx], (1, 0)))
            stations.append(np.array(events))

        # NOTE(review): this only yields a regular array when every station
        # keeps the same number of events -- confirm that holds for the data.
        save_arr = np.array(stations)
        np.save(os.path.join(save_path, day_id + ".npy"), save_arr)
        
# Convert every pickle found in load_path into a per-day .npy file under raw_path.
np_from_pkl(data_path=load_path, save_path=raw_path)


#### Part b: Extract metadata

In [None]:
## Reads through the pickles and saves the station and event metadata
## (without the bulky per-station 'data' arrays).

# makedirs(exist_ok=True) replaces the check-then-mkdir pair and also creates
# any missing parent directories.
os.makedirs(meta_path, exist_ok=True)
    
def save_station_and_events(data_path, save_path):
    """Save per-day station names and event metadata, without the data arrays.

    Every ``*.pkl`` file in ``data_path`` is processed in sorted order. Events
    whose window (as computed by ``get_indices``) does not fit inside the data
    of the first station are dropped. The ``'starttime'``, ``'endtime'`` and
    ``'data'`` entries of each station and the ``'time'`` entry of each
    remaining event are then removed. Three kinds of files are written to
    ``save_path``:

    * ``<day_id>_stations.npy`` -- array of that day's station keys,
    * ``<day_id>_meta.json``    -- the stripped-down pickle contents,
    * ``dropped_events.txt``    -- total number of dropped events on the first
      line, followed by one per-day count per line.

    Args:
        data_path: Directory containing the input ``*.pkl`` files.
        save_path: Existing directory that receives the output files.

    Pickles that fail to load are reported and skipped.
    """
    dropped = []
    for f in tqdm(sorted(Path(data_path).glob("*.pkl"))):
        print(f)
        day_id = f.name.split('.')[0]
        with open(f, 'rb') as fp:
            try:
                # NOTE(review): pickle.load can execute arbitrary code; only
                # run this on files from a trusted source.
                data = pickle.load(fp)
            except Exception:
                # Best effort: report the unreadable file and move on.
                # (A bare `except:` would also swallow KeyboardInterrupt.)
                print("Pickle Corrupted")
                print("Corrupted path: {}".format(f))
                continue

        # The first station decides which events are kept.
        # NOTE(review): presumably all stations of a day share the same start
        # time and data length -- confirm.
        first_stat = list(data['stations'].keys())[0]
        reference = data['stations'][first_stat]
        num_samples = len(reference['data'])

        kept_events = []
        for event in data['events']:
            start_idx, end_idx = get_indices(reference['starttime'], event['time'])
            if (end_idx > num_samples - 1) or (start_idx < 0):
                continue  # window does not fit inside the record: drop event
            kept_events.append(event)

        num_dropped = len(data['events']) - len(kept_events)
        data['events'] = kept_events
        print("Num events dropped from date {}: {}".format(day_id, num_dropped))
        dropped.append(num_dropped)

        # Strip the entries that are not wanted in the JSON metadata.
        for record in data['stations'].values():
            for key in ['starttime', 'endtime', 'data']:
                del record[key]
        for event in data['events']:
            del event['time']

        stations = np.array(list(data['stations'].keys()))
        print(stations)
        np.save(os.path.join(save_path, "{}_stations.npy".format(day_id)), stations)
        with open(os.path.join(save_path, day_id + "_meta.json"), 'w') as fp:
            json.dump(data, fp)

    with open(os.path.join(save_path, "dropped_events.txt"), 'w') as fp:
        fp.write(str(sum(dropped)) + '\n')
        for d in dropped:
            fp.write(str(d)+'\n')
        
# Write the per-day station lists and event metadata for every pickle in load_path.
save_station_and_events(data_path=load_path, save_path=meta_path)

#### Part c: Count number of quakes per station

In [24]:
# File names (without directory) of every .npy file saved under meta_path.
station_files = [os.path.basename(npy_file) for npy_file in Path(meta_path).glob("*.npy")]

['2019-09-18_stations.npy',
 '2019-09-23_stations.npy',
 '2019-08-03_stations.npy',
 '2019-08-31_stations.npy',
 '2019-07-17_stations.npy',
 '2019-06-03_stations.npy',
 '2019-06-26_stations.npy',
 '2019-07-16_stations.npy',
 '2019-09-29_stations.npy',
 '2019-06-17_stations.npy',
 '2019-06-04_stations.npy',
 '2019-09-28_stations.npy',
 '2019-07-02_stations.npy',
 '2019-09-25_stations.npy',
 '2019-06-29_stations.npy',
 '2019-06-08_stations.npy',
 '2019-09-06_stations.npy',
 '2019-09-14_stations.npy',
 '2019-06-01_stations.npy',
 '2019-07-05_stations.npy',
 '2019-09-16_stations.npy',
 '2019-07-14_stations.npy',
 '2019-06-13_stations.npy',
 '2019-08-14_stations.npy',
 '2019-09-17_stations.npy',
 '2019-09-27_stations.npy',
 '2019-08-04_stations.npy',
 '2019-09-22_stations.npy',
 '2019-07-18_stations.npy',
 '2019-07-30_stations.npy',
 '2019-08-27_stations.npy',
 '2019-09-15_stations.npy',
 '2019-09-05_stations.npy',
 '2019-06-06_stations.npy',
 '2019-09-12_stations.npy',
 '2019-09-21_station

In [None]:
# Load each day's array of station names, in the same order as station_files.
stations = [np.load(os.path.join(meta_path, file)) for file in station_files]

In [None]:
# Give every station seen on any day a zero counter; keys keep first-seen order.
counts = {stat: 0 for day_stations in stations for stat in day_stations}
print("Stations: ", counts)
print(len(counts))

In [None]:
# Add each day's number of kept events to every station recorded on that day.
#
# This cell previously read `shapes` and `dates`, which no earlier cell
# defines, and indexed the list `stations` by date, so it could not run on a
# fresh kernel. The shape is now read from the day's raw array instead.
# NOTE(review): assumes the raw array saved for a day has shape
# (num_stations, num_events, ...) -- confirm against np_from_pkl's output.
counts = dict.fromkeys(counts, 0)  # reset so re-running the cell does not double count
for file, day_stations in zip(station_files, stations):
    date = file.replace("_stations.npy", "")
    # mmap_mode avoids reading the whole array just to learn its shape.
    s = np.load(os.path.join(raw_path, date + ".npy"), mmap_mode="r").shape
    num_events = s[1]
    assert s[0] == len(day_stations)
    for stat in day_stations:
        counts[stat] += num_events
print(counts)

In [None]:
# Rank stations from most to fewest events; ties fall back to descending station name.
sorted_dict = sorted((num, stat) for stat, num in counts.items())
sorted_counts, sorted_stats = (ranked[::-1] for ranked in zip(*sorted_dict))

# Wide figure so that every station label fits on the x axis.
plt.figure(figsize=(40,10))
plt.bar(sorted_stats, sorted_counts)
plt.xlabel('Stations')
plt.ylabel("Num events")
plt.show()

### Step-2: Filter relevant stations

In [None]:
# Number of top-ranked stations to keep for the analysis.
num_stations = 15

# NOTE(review): nothing in the visible notebook writes all_station_counts.json;
# presumably it holds the per-station event counts computed above -- confirm.
# The encoding is explicit so the read does not depend on the platform default.
with open('./all_station_counts.json', encoding='utf-8') as json_file:
    stat_counts = json.load(json_file)

# pprint is the standard-library pretty printer (pprint.pprint); calling it
# without that import raised a NameError.
pprint(stat_counts)

In [None]:
# Order stations by event count, largest first (ties: descending station name).
sorted_dict = sorted((num, name) for name, num in stat_counts.items())
sorted_counts, sorted_stats = (column[::-1] for column in zip(*sorted_dict))

plt.bar(sorted_stats, sorted_counts)
plt.title("Station event counts")
plt.show()

# The num_stations stations with the most events are the ones kept downstream.
master_stations = sorted_stats[:num_stations]
print(master_stations)
print(sorted_counts[:num_stations])

In [None]:
## Create array of dates
# A date is the part of a raw file's name before the first '.'; only entries
# whose name contains "npy" are considered.
dates = [entry.name.split(".")[0] for entry in os.scandir(raw_path) if "npy" in entry.name]
dates.sort()
print(dates)
print(len(dates))

In [None]:
# makedirs(exist_ok=True) replaces the check-then-mkdir pair and also creates
# any missing parent directories.
os.makedirs(relevant_path, exist_ok=True)

print("Extracting data for stations: {}".format(master_stations))
np.save(os.path.join(relevant_path, "stations.npy"), master_stations)

# For every day, keep only the master stations (in master order) and reduce
# axis 2 to its 2-norm. Days missing any master station are left out.
for date in tqdm(dates):
    date_arr = np.load(os.path.join(raw_path, date+".npy"))

    ## look for stations for that day (metadata)
    date_stations = np.load(os.path.join(meta_path, date+"_stations.npy"))

    # A day on which no event window was kept is not saved as a 4-D array,
    # so the transpose below would raise; skip such days instead.
    if date_arr.ndim != 4:
        print("\tSkipping date {} (no event windows)".format(date))
        continue

    if not all(master in date_stations for master in master_stations):
        print("\tDeleting date {}".format(date))
        continue

    # The boolean mask selects the row of date_arr that belongs to each master station.
    stats_for_date = np.concatenate(
        [date_arr[date_stations == master] for master in master_stations], 0)
    assert stats_for_date.shape[0] == len(master_stations)

    stats_for_date = np.transpose(stats_for_date, [1, 0, 2, 3]) #(num_evs, stats, dim, time)
    stats_for_date = np.linalg.norm(stats_for_date, ord=2, axis=2)  # -> (num_evs, stats, time)
    np.save(os.path.join(relevant_path, "{}.npy".format(date)), stats_for_date)


### Step-3: Compress via subsampling

In [None]:
# makedirs(exist_ok=True) replaces the check-then-mkdir pair and also creates
# any missing parent directories.
os.makedirs(compressed_path, exist_ok=True)

# Only names containing "2019" are processed, which leaves out stations.npy.
files = [entry.name for entry in os.scandir(relevant_path) if "2019" in entry.name]
print(files)
print(len(files))

# Number of consecutive samples averaged into one output sample.
WIDTH = 100
for file in files:
    print("Processing file: {}".format(file))
    # Fixed: the files listed above live in relevant_path, not in load_path
    # (the directory of raw pickles), so loading from load_path failed.
    large = np.load(os.path.join(relevant_path, file))
    print("\tBefore: ", large.shape)
    # Trailing samples that do not fill a whole WIDTH-sized bin are discarded.
    comp_shape = (large.shape[0], large.shape[1], large.shape[2] // WIDTH)
    compressed = np.zeros(comp_shape)
    for i in range(comp_shape[2]):
        # Each output sample is the mean of one WIDTH-long bin along the last axis.
        compressed[:, :, i] = np.mean(large[:, :, WIDTH*i: WIDTH*(i+1)], 2)
    print("\tAfter: ", compressed.shape)
    np.save(os.path.join(compressed_path, file), compressed)
    

In [None]:
# Visual sanity check: plot the first trace of one day before and after compression.
test_date = "2019-07-06.npy"
test_original = np.load(os.path.join(relevant_path, test_date))
test_compressed = np.load(os.path.join(compressed_path, test_date))

# Fixed: the plots used the undefined name `test` and the leftover loop
# variable `compressed` instead of the two arrays loaded above.
plt.plot(test_original[0, 0, :], alpha=0.25)
plt.show()
plt.plot(test_compressed[0, 0, :])
plt.show()
plt.show()