# Earthquake Detection Workflow

## Outline

Here we show an example of the current modules in QuakeFlow

1. Download data using ObsPy:

    [FDSN web service client for ObsPy](https://docs.obspy.org/packages/obspy.clients.fdsn.html#module-obspy.clients.fdsn)
    
    [Mass Downloader for FDSN Compliant Web Services](https://docs.obspy.org/packages/autogen/obspy.clients.fdsn.mass_downloader.html#module-obspy.clients.fdsn.mass_downloader)

2. PhaseNet for picking P/S phases

    Find more details in [PhaseNet github page](https://wayneweiqiang.github.io/PhaseNet/)

3. GaMMA for associating picks and estimating approximate locations and magnitudes

    Find more details in [GaMMA github page](https://wayneweiqiang.github.io/GMMA/)

4. Earthquake location, magnitude estimation, etc. (to be continued)


## 1. Install [miniconda](https://docs.conda.io/en/latest/miniconda.html) and download packages

<!-- # %%capture -->
```bash
git clone https://github.com/wayneweiqiang/PhaseNet.git
git clone https://github.com/wayneweiqiang/GaMMA.git
conda env update -f=env.yml -n base
```

**Second option: install into a separate `quakeflow` environment; you then need to select `quakeflow` as the Jupyter notebook kernel**
```bash
conda env create -f=env.yml -n quakeflow
python -m ipykernel install --user --name=quakeflow
```

In [816]:
import warnings

import kfp
import kfp.dsl as dsl
import kfp.components as comp
from kfp.components import InputPath, OutputPath

warnings.filterwarnings("ignore")


## 2. Set configurations

In [817]:
import os
import matplotlib

# matplotlib.use("agg")
import matplotlib.pyplot as plt

# Active region preset; the commented alternatives below are other available presets.
region_name = "Ridgecrest_demo"
# region_name = "Ridgecrest_oneweek"
# region_name = "SaltonSea"
# region_name = "Ridgecrest"
# region_name = "SanSimeon"
# region_name = "Italy"
# region_name = "PNSN"
# region_name = "Hawaii"
# region_name = "Hawaii_202111_to_202205"
# region_name = "PuertoRico"
# region_name = "SmithValley"
# region_name = "Antilles"
# region_name = "Test"

# All local artifacts are written below a directory named after the region.
dir_name = region_name
if not os.path.exists(dir_name):
    os.mkdir(dir_name)


def root_dir(x):
    """Return *x* joined onto the per-region working directory."""
    return os.path.join(dir_name, x)


# When True, the notebook cells call each step directly instead of only defining pipeline ops.
run_local = False


In [818]:
def set_config(
    index_json: OutputPath("json"),
    config_json: OutputPath("json"),
    datetime_json: OutputPath("json"),
    num_parallel: int = 1,
) -> list:
    """Write the region configuration, the hourly time windows and the worker index.

    Three JSON files are produced:

    * ``config_json``: region name, center, longitude/latitude limits, the
      degree-to-km factor, start/end time (ISO strings), networks, channels
      and the FDSN client name.
    * ``datetime_json``: ``{"starttimes": [...], "interval": seconds}`` with one
      ISO start time per hour in ``[starttime, endtime)``.
    * ``index_json``: a list of ``num_parallel`` lists; window ``i`` is assigned
      to list ``i % num_parallel`` (round-robin).

    Args:
        index_json: Output path of the worker index file.
        config_json: Output path of the configuration file.
        datetime_json: Output path of the time-window file.
        num_parallel: Number of partitions. ``0`` selects
            ``min(60, number of hourly windows)``.

    Returns:
        ``list(range(num_parallel))`` using the resolved ``num_parallel``.
    """
    # Imports are function-local because this function is packaged with
    # comp.func_to_container_op (presumably only the function source is
    # shipped to the container -- confirm against the kfp documentation).
    import datetime
    import json

    import obspy

    pi = 3.1415926
    degree2km = pi * 6371 / 180

    region_name = "Ridgecrest_demo"
    center = (-117.504, 35.705)
    horizontal_degree = 1.0
    vertical_degree = 1.0
    starttime = obspy.UTCDateTime("2019-07-04T17")
    endtime = obspy.UTCDateTime("2019-07-04T19")
    client = "SCEDC"
    network_list = ["CI"]
    channel_list = "HH*,BH*,EH*,HN*"

    # region_name = "Ridgecrest_oneweek"
    # center = (-117.504, 35.705)
    # horizontal_degree = 1.0
    # vertical_degree = 1.0
    # starttime = obspy.UTCDateTime("2019-07-04T00")
    # endtime = obspy.UTCDateTime("2019-07-10T00")
    # client = "SCEDC"
    # network_list = ["CI"]
    # channel_list = "HH*,BH*,EH*,HN*"

    # region_name = "Test"
    # center = (-119.296, 34.443)
    # horizontal_degree = 1.0
    # vertical_degree = 1.0
    # starttime = obspy.UTCDateTime("2022-02-25T00")
    # endtime = obspy.UTCDateTime("2022-03-02T22")
    # client = "SCEDC"
    # network_list = ["CI"]
    # channel_list = "HH*,BH*,EH*,HN*"

    # region_name = "Ridgecrest"
    # center = (-117.504, 35.705)
    # horizontal_degree = 1.0
    # vertical_degree = 1.0
    # starttime = obspy.UTCDateTime("2019-07-04T17")
    # endtime = obspy.UTCDateTime("2019-07-05T00")
    # # starttime = obspy.UTCDateTime("2019-07-04T00")
    # # endtime = obspy.UTCDateTime("2019-07-11T00")
    # client = "SCEDC"
    # network_list = ["CI"]
    # channel_list = "HH*,BH*,EH*,HN*"

    # region_name = "Hawaii"
    # center = (-155.32, 19.39)
    # horizontal_degree = 2.0
    # vertical_degree = 2.0
    # starttime = obspy.UTCDateTime("2021-04-01T00")
    # endtime = obspy.UTCDateTime("2021-11-01T00")
    # client = "IRIS"
    # network_list = ["HV", "PT"]
    # channel_list = "HH*,BH*,EH*,HN*"

    # region_name = "Hawaii_202111_to_202205"
    # center = (-155.32, 19.39)
    # horizontal_degree = 2.0
    # vertical_degree = 2.0
    # starttime = obspy.UTCDateTime("2021-11-01T00")
    # endtime = obspy.UTCDateTime("2022-05-01T00")
    # client = "IRIS"
    # network_list = ["HV", "PT"]
    # channel_list = "HH*,BH*,EH*,HN*"

    # region_name = "PuertoRico"
    # center = (-66.5, 18)
    # horizontal_degree = 3.0
    # vertical_degree = 2.0
    # # starttime = obspy.UTCDateTime("2020-01-07T00")
    # # endtime = obspy.UTCDateTime("2020-01-14T00")
    # starttime = obspy.UTCDateTime("2018-05-01T00")
    # endtime = obspy.UTCDateTime("2021-11-01T00")
    # client = "IRIS"
    # network_list = ["*"]
    # # channel_list = "HH*,BH*,EH*,HN*"
    # # channel_list = "HH*,BH*,HN*"
    # channel_list = "HH*,BH*,HN*"

    # region_name = "SaltonSea"
    # center = (-115.53, 32.98)
    # horizontal_degree = 1.0
    # vertical_degree = 1.0
    # starttime = obspy.UTCDateTime("2020-10-01T00")
    # endtime = obspy.UTCDateTime("2020-10-01T02")
    # client = "SCEDC"
    # network_list = ["CI"]
    # channel_list = "HH*,BH*,EH*,HN*"

    # region_name = "2003SanSimeon"
    # center = (-121.101, 35.701)
    # horizontal_degree = 1.0
    # vertical_degree = 1.0
    # starttime = obspy.UTCDateTime("2003-12-22T00")
    # endtime = obspy.UTCDateTime("2003-12-24T00")
    # client = "NCEDC"
    # network_list = ["*"]
    # channel_list = "HH*,BH*,EH*,HN*"

    # region_name = "Italy"
    # center = (13.188, 42.723)
    # horizontal_degree = 1.0
    # vertical_degree = 1.0
    # starttime = obspy.UTCDateTime("2016-08-24T00")
    # endtime = obspy.UTCDateTime("2016-08-26T00")
    # client = "INGV"
    # network_list = ["*"]
    # channel_list = "HH*,BH*,EH*,HN*"

    # region_name = "SmithValley"
    # center = (-119.5, 38.51)
    # horizontal_degree = 1.0
    # vertical_degree = 1.0
    # starttime = obspy.UTCDateTime("2021-07-08T00:00")
    # endtime = obspy.UTCDateTime("2021-07-16T00:00")
    # client = "NCEDC"
    # network_list = ["*"]
    # channel_list = "HH*,BH*,EH*,HN*"

    # region_name = "Antilles"
    # center = (-61.14867, 14.79683)
    # horizontal_degree = 0.2
    # vertical_degree = 0.2
    # starttime = obspy.UTCDateTime("2021-04-10T00")
    # endtime = obspy.UTCDateTime("2021-04-15T00")
    # client = "RESIF"
    # network_list = ["*"]
    # channel_list = "HH*,BH*,EH*,HN*"

    ####### save config ########
    config = {
        "region": region_name,
        "center": center,
        "xlim_degree": [
            center[0] - horizontal_degree / 2,
            center[0] + horizontal_degree / 2,
        ],
        "ylim_degree": [
            center[1] - vertical_degree / 2,
            center[1] + vertical_degree / 2,
        ],
        "degree2km": degree2km,
        "starttime": starttime.datetime.isoformat(),
        "endtime": endtime.datetime.isoformat(),
        "networks": network_list,
        "channels": channel_list,
        "client": client,
    }

    with open(config_json, "w") as fp:
        json.dump(config, fp)

    ####### save hourly time windows ########
    one_hour = datetime.timedelta(hours=1)
    starttimes = []
    tmp_start = starttime
    while tmp_start < endtime:
        starttimes.append(tmp_start.datetime.isoformat())
        tmp_start += one_hour

    with open(datetime_json, "w") as fp:
        json.dump({"starttimes": starttimes, "interval": one_hour.total_seconds()}, fp)

    ####### save round-robin worker index ########
    if num_parallel == 0:
        # num_parallel = min(60, len(starttimes)//6)
        num_parallel = min(60, len(starttimes))
        # num_parallel = 24

    idx = [[] for _ in range(num_parallel)]
    for i in range(len(starttimes)):
        idx[i % num_parallel].append(i)

    with open(index_json, "w") as fp:
        json.dump(idx, fp)

    return list(range(num_parallel))


In [819]:
# Local run: write the index/config/datetime JSON files into the region directory.
if run_local:
    idx = set_config(
        index_json=root_dir("index.json"),
        config_json=root_dir("config.json"),
        datetime_json=root_dir("datetimes.json"),
        num_parallel=1,
    )

In [820]:
# Pipeline op for set_config, built on python:3.8 with the pip packages listed below.
config_op = comp.func_to_container_op(
    set_config,
    # base_image="zhuwq0/quakeflow-env:latest",
    base_image="python:3.8",
    packages_to_install=["numpy", "obspy"],
)


## 3. Download events in the routine catalog

This catalog is not used by QuakeFlow. It is only used for comparing detection results.

In [821]:
def download_events(config_json: InputPath("json"), event_csv: OutputPath(str)):
    """Download the routine event catalog for the configured region and save it.

    The catalog is requested from the FDSN client named in the config. If
    that request fails for any reason, the same query is retried against the
    "iris" client. Events with at least one magnitude are written to
    ``event_csv`` as a tab-separated table with columns ``time``,
    ``magnitude``, ``longitude``, ``latitude`` and ``depth(m)``, sorted by time.
    Two figures are shown; the magnitude/time figure is also saved to
    ``events_mag_time.png`` in the working directory.

    Args:
        config_json: Path of the JSON config written by ``set_config``.
        event_csv: Output path of the tab-separated catalog.
    """
    import json

    import matplotlib

    #     matplotlib.use("agg")
    import matplotlib.pyplot as plt
    import pandas as pd
    from obspy.clients.fdsn import Client

    with open(config_json, "r") as fp:
        config = json.load(fp)

    ####### Download catalog (configured client, IRIS as fallback) ########
    query = dict(
        starttime=config["starttime"],
        endtime=config["endtime"],
        minlongitude=config["xlim_degree"][0],
        maxlongitude=config["xlim_degree"][1],
        minlatitude=config["ylim_degree"][0],
        maxlatitude=config["ylim_degree"][1],
        # filename='events.xml',
    )
    try:
        events = Client(config["client"]).get_events(**query)
    except Exception as err:
        # Catch Exception, not everything, so Ctrl-C / SystemExit still propagate.
        print(f"Catalog request to {config['client']} failed ({err}); falling back to IRIS")
        events = Client("iris").get_events(**query)

    #     events = obspy.read_events('events.xml')
    print(f"Number of events: {len(events)}")
    #     events.plot('local', outfile="events.png")
    #     events.plot('local')

    ####### Save catalog ########
    columns = ["time", "magnitude", "longitude", "latitude", "depth(m)"]
    rows = []
    for event in events:
        if len(event.magnitudes) > 0:
            origin = event.origins[0]
            rows.append(
                (origin.time.datetime, event.magnitudes[0].mag, origin.longitude, origin.latitude, origin.depth)
            )
    # Explicit columns keep an empty catalog sortable and writable (header-only CSV).
    catalog = pd.DataFrame(rows, columns=columns).sort_values(["time"])
    catalog.to_csv(
        event_csv,
        sep="\t",
        index=False,
        float_format="%.3f",
        date_format="%Y-%m-%dT%H:%M:%S.%f",
        columns=columns,
    )

    ####### Plot catalog ########
    plt.figure()
    plt.plot(catalog["longitude"], catalog["latitude"], ".", markersize=1)
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.axis("scaled")
    plt.xlim(config["xlim_degree"])
    plt.ylim(config["ylim_degree"])
    #     plt.savefig(os.path.join(data_path, "events_loc.png"))
    plt.show()

    plt.figure()
    # plt.plot handles datetime x-values directly; plot_date is discouraged upstream.
    plt.plot(catalog["time"], catalog["magnitude"], ".", markersize=1)
    plt.gcf().autofmt_xdate()
    plt.ylabel("Magnitude")
    plt.title(f"Number of events: {len(events)}")
    plt.savefig("events_mag_time.png")
    plt.show()


In [822]:
# Local run: fetch the reference catalog into the region directory.
if run_local:
    download_events(
        config_json=root_dir("config.json"),
        event_csv=root_dir("events.csv"),
    )


In [823]:
# Pipeline op for download_events, built on python:3.8 with the pip packages listed below.
download_events_op = comp.func_to_container_op(
    download_events,
    # base_image="zhuwq0/quakeflow-env:latest",
    base_image="python:3.8",
    packages_to_install=["obspy", "pandas", "matplotlib"],
)


## 4. Download stations

In [824]:
def download_stations(
    config_json: InputPath("json"), station_csv: OutputPath(str), station_pkl: OutputPath("pickle"),
):
    """Download station metadata for the configured region and save it.

    Stations are requested at ``level="response"`` from the FDSN client named
    in the config. Channels are grouped under the id
    ``network.station.location.channel-prefix`` (the channel code without its
    last character); the last character and the instrument sensitivity of
    each channel are accumulated as comma-separated ``component`` and
    ``response`` strings. The table is written to ``station_csv``
    (tab-separated, index label ``station``), the raw inventory is pickled to
    ``station_pkl``, and a map of the stations is shown.

    Args:
        config_json: Path of the JSON config written by ``set_config``.
        station_csv: Output path of the tab-separated station table.
        station_pkl: Output path of the pickled inventory.
    """
    import json
    import pickle

    import matplotlib

    #     matplotlib.use("agg")
    import matplotlib.pyplot as plt
    import pandas as pd
    from obspy.clients.fdsn import Client

    with open(config_json, "r") as fp:
        config = json.load(fp)

    print("Network:", ",".join(config["networks"]))
    ####### Download stations ########
    stations = Client(config["client"]).get_stations(
        network=",".join(config["networks"]),
        station="*",
        starttime=config["starttime"],
        endtime=config["endtime"],
        minlongitude=config["xlim_degree"][0],
        maxlongitude=config["xlim_degree"][1],
        minlatitude=config["ylim_degree"][0],
        maxlatitude=config["ylim_degree"][1],
        channel=config["channels"],
        level="response",
        # filename="stations.xml",
    )

    #     stations = obspy.read_inventory("stations.xml")
    print("Number of stations: {}".format(sum(len(x) for x in stations)))
    # stations.plot('local', outfile="stations.png")
    #     stations.plot('local')

    ####### Save stations ########
    station_locs = {}
    for network in stations:
        for station in network:
            for chn in station:
                sid = f"{network.code}.{station.code}.{chn.location_code}.{chn.code[:-1]}"
                sensitivity = chn.response.instrument_sensitivity
                if sid in station_locs:
                    # Another component of an already-seen instrument: extend the lists.
                    station_locs[sid]["component"] += f",{chn.code[-1]}"
                    station_locs[sid]["response"] += f",{sensitivity.value:.2f}"
                else:
                    station_locs[sid] = {
                        "longitude": chn.longitude,
                        "latitude": chn.latitude,
                        "elevation(m)": chn.elevation,
                        "component": f"{chn.code[-1]}",
                        "response": f"{sensitivity.value:.2f}",
                        "unit": sensitivity.input_units.lower(),
                    }

    columns = ["longitude", "latitude", "elevation(m)", "unit", "component", "response"]
    # reindex guarantees the columns exist even when no station was returned,
    # so an empty inventory yields a header-only CSV instead of a KeyError.
    station_locs = pd.DataFrame.from_dict(station_locs, orient="index").reindex(columns=columns)
    station_locs.to_csv(
        station_csv,
        sep="\t",
        float_format="%.3f",
        index_label="station",
        columns=columns,
    )

    with open(station_pkl, "wb") as fp:
        pickle.dump(stations, fp)

    ####### Plot stations ########
    plt.figure()
    plt.plot(station_locs["longitude"], station_locs["latitude"], "^", label="Stations")
    # The plotted values are longitude/latitude (see xlim/ylim below), not km.
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.axis("scaled")
    plt.xlim(config["xlim_degree"])
    plt.ylim(config["ylim_degree"])
    plt.legend()
    plt.title(f"Number of stations: {len(station_locs)}")
    #     plt.savefig(os.path.join(data_path, "stations_loc.png"))
    plt.show()


In [825]:
# Local run: fetch station metadata into the region directory.
if run_local:
    download_stations(
        config_json=root_dir("config.json"),
        station_csv=root_dir("stations.csv"),
        station_pkl=root_dir("stations.pkl"),
    )


In [826]:
# Pipeline op for download_stations, built on python:3.8 with the pip packages listed below.
download_stations_op = comp.func_to_container_op(
    download_stations,
    # base_image="zhuwq0/quakeflow-env:latest",
    base_image="python:3.8",
    packages_to_install=["obspy", "pandas", "matplotlib"],
)


## 5. Download waveform data

In [827]:
def download_waveform(
    i: int,
    index_json: InputPath("json"),
    config_json: InputPath("json"),
    datetime_json: InputPath("json"),
    station_pkl: InputPath("pickle"),
    fname_csv: OutputPath(str),
    data_path: str,
    bucket_name: str = "waveforms",
    s3_url: str = "minio-service:9000",
    secure: bool = True,
) -> str:
    """Download the waveform windows assigned to worker ``i`` as miniSEED files.

    Worker ``i`` handles the window indices stored at position ``i`` of
    ``index_json``. Each window starts at ``starttimes[k]`` and lasts
    ``interval`` seconds (both read from ``datetime_json``). Its data are
    written to ``<data_path>/<region>/waveforms/<start>.mseed``. A window
    whose file already exists is not downloaded again. Windows are fetched in
    threads, started one second apart and joined in batches of four.

    Args:
        i: Worker index into the list stored in ``index_json``.
        index_json: Path of the worker index written by ``set_config``.
        config_json: Path of the JSON config written by ``set_config``.
        datetime_json: Path of the time-window file written by ``set_config``.
        station_pkl: Path of the pickled inventory written by
            ``download_stations``.
        fname_csv: Output path; receives the header ``fname`` followed by one
            file name per processed window (including windows with no data).
        data_path: Root directory for the downloaded data.
        bucket_name: MinIO bucket name; only used by the disabled MinIO branch.
        s3_url: MinIO endpoint; only used by the disabled MinIO setup.
        secure: MinIO TLS flag; only used by the disabled MinIO setup.

    Returns:
        The directory the miniSEED files are written to.
    """
    import json
    import os
    import pickle
    import threading
    import time

    import obspy
    from obspy.clients.fdsn import Client

    # Guards fname_list, which is appended to from the download threads.
    lock = threading.Lock()

    upload_minio = False
    # try:
    #     from minio import Minio

    #     minioClient = Minio(s3_url, access_key='minio', secret_key='minio123', secure=secure)
    #     if not minioClient.bucket_exists(bucket_name):
    #         minioClient.make_bucket(bucket_name)
    #     upload_minio = True
    # except Exception as err:
    #     # print(f"ERROR: can not access minio service! \n{err}")
    #     pass

    with open(index_json, "r") as fp:
        index = json.load(fp)
    idx = index[i]
    with open(config_json, "r") as fp:
        config = json.load(fp)
    with open(datetime_json, "r") as fp:
        tmp = json.load(fp)
        starttimes = tmp["starttimes"]
        interval = tmp["interval"]
    # NOTE: pickle.load executes arbitrary code from the file; station_pkl must
    # only ever be the artifact produced by the download_stations step.
    with open(station_pkl, "rb") as fp:
        stations = pickle.load(fp)

    waveform_dir = os.path.join(data_path, config["region"], "waveforms")
    os.makedirs(waveform_dir, exist_ok=True)

    ####### Download data ########
    client = Client(config["client"])
    fname_list = ["fname"]  # first entry is the CSV header

    def download(i):
        """Download window ``i`` for every station and record its file name."""
        starttime = obspy.UTCDateTime(starttimes[i])
        endtime = starttime + interval
        fname = "{}.mseed".format(starttime.datetime.strftime("%Y-%m-%dT%H:%M:%S"))
        if not upload_minio:
            if os.path.exists(os.path.join(waveform_dir, fname)):
                print(f"{fname} exists")
                with lock:
                    fname_list.append(fname)
                return
        else:
            # NOTE: minioClient only exists if the MinIO setup above is re-enabled.
            try:
                minioClient.fget_object(
                    bucket_name, os.path.join(config['region'], fname), os.path.join(waveform_dir, fname),
                )
                print(
                    f"{bucket_name}/{os.path.join(config['region'], fname)} download to {os.path.join(waveform_dir, fname)}"
                )
                with lock:
                    fname_list.append(fname)
                return
            except Exception as err:
                print(err)

        max_retry = 10
        stream = obspy.Stream()
        print(f"{fname} download starts")
        num_sta = 0
        for network in stations:
            for station in network:
                print(f"********{network.code}.{station.code}********")
                retry = 0
                while retry < max_retry:
                    try:
                        tmp = client.get_waveforms(
                            network.code, station.code, "*", config["channels"], starttime, endtime,
                        )
                        #  for trace in tmp:
                        #      if trace.stats.sampling_rate != 100:
                        #          print(trace)
                        #          trace = trace.interpolate(100, method="linear")
                        #      trace = trace.detrend("spline", order=2, dspline=5*trace.stats.sampling_rate)
                        #      stream.append(trace)
                        stream += tmp
                        num_sta += len(tmp)
                        break
                    except Exception as err:
                        print("Error {}.{}: {}".format(network.code, station.code, err))
                        # Retrying cannot help when the server reports that no data exist.
                        if str(err).startswith("No data available for request."):
                            break
                        retry += 1
                        time.sleep(5)
                if retry == max_retry:
                    print(f"{fname}: MAX {max_retry} retries reached : {network.code}.{station.code}")

        if len(stream) > 0:
            # stream = stream.merge(fill_value=0)
            # stream = stream.trim(starttime, endtime, pad=True, fill_value=0)
            stream.write(os.path.join(waveform_dir, fname))
            print(f"{fname} download succeeds")
            # if upload_minio:
            #     minioClient.fput_object(bucket_name, os.path.join(config['region'], fname), os.path.join(waveform_dir, fname))
            #     print(f"{fname} upload to minio {os.path.join(config['region'], fname)}")
        else:
            print(f"{fname} empty data")
        with lock:
            fname_list.append(fname)

    MAX_THREADS = 4
    # MAX_THREADS = 1
    threads = []
    for count, window_idx in enumerate(idx):
        worker = threading.Thread(target=download, args=(window_idx,))
        worker.start()
        time.sleep(1)  # stagger the thread starts
        threads.append(worker)
        if count % MAX_THREADS == MAX_THREADS - 1:
            # A full batch is running: wait for it before starting more threads.
            for running in threads:
                running.join()
            threads = []
    for running in threads:
        running.join()

    with open(fname_csv, "w") as fp:
        fp.write("\n".join(fname_list))

    return waveform_dir


In [828]:
# Local run: download the windows of worker 0 below the region directory.
if run_local:
    waveform_path = download_waveform(
        i=0,
        index_json=root_dir("index.json"),
        config_json=root_dir("config.json"),
        datetime_json=root_dir("datetimes.json"),
        station_pkl=root_dir("stations.pkl"),
        fname_csv=root_dir("fname.csv"),
        data_path=root_dir(""),
    )


In [829]:
# Pipeline op for download_waveform, built on python:3.8 with the pip packages listed below.
download_waveform_op = comp.func_to_container_op(
    download_waveform,
    base_image="python:3.8",
    packages_to_install=["obspy", "minio"],
)


In [830]:
def phasenet_op(data_path: str, data_list: str, stations: str):
    """Return the container op that runs ``phasenet/predict.py`` on the downloaded data.

    ``data_list`` and ``stations`` are passed to the container as input
    argument paths; the op exposes ``/opt/results/picks.csv`` as its ``picks``
    output.
    """
    predict_args = [
        "phasenet/predict.py",
        "--model",
        "model/190703-214543",
        "--data_dir",
        data_path,
        "--data_list",
        dsl.InputArgumentPath(data_list),
        "--stations",
        dsl.InputArgumentPath(stations),
        # "--result_dir", "results",
        "--format",
        "mseed_array",
        "--amplitude",
        "--upload_waveform",
    ]
    # picks_output = {"picks": "/opt/results/picks.json"}
    picks_output = {"picks": "/opt/results/picks.csv"}

    return dsl.ContainerOp(
        name="PhaseNet Picking",
        image="zhuwq0/phasenet-api:1.0",
        command=["python"],
        arguments=predict_args,
        file_outputs=picks_output,
    )


## 6. Run PhaseNet to pick P/S picks

In [831]:
# %%capture
# Local run only: call predict.py from a sibling ../PhaseNet checkout through the shell.
# NOTE: --data_dir applies root_dir twice, giving <region>/<region>/waveforms. This matches
# the directory download_waveform creates (data_path/<region>/waveforms) when it is called
# with data_path=root_dir("") as in the local-run cell.
if run_local:
    command = f"python ../PhaseNet/phasenet/predict.py --model=../PhaseNet/model/190703-214543 --data_dir={root_dir(root_dir('waveforms'))} --data_list={root_dir('fname.csv')} --stations={root_dir('stations.csv')} --result_dir={root_dir('phasenet')} --format=mseed_array --amplitude --upload_waveform"
    print(command)
    # IPython shell escape: runs the command string in a subshell (not valid plain Python).
    !{command}

## 7. Run GaMMA to associate P/S picks

In [832]:
def gamma(
    i: int,
    index_json: InputPath("json"),
    config_json: InputPath("json"),
    pick_csv: InputPath("csv"),
    station_csv: InputPath(str),
    catalog_csv: OutputPath(str),
    picks_csv: OutputPath(str),
    bucket_name: str = "catalogs",
    s3_url: str = "localhost:9000",
    secure: bool = True,
) -> str:
    """Associate phase picks into events with GaMMA and save catalog and picks.

    Picks with zero amplitude are dropped (amplitudes are used). If there are
    fewer than 5000 picks they are associated in one call; otherwise they are
    associated hour by hour. The catalog and the picks (annotated with
    ``event_idx`` and ``prob_gamma``; ``-1`` where unassigned) are written as
    tab-separated files. Afterwards the function makes a best-effort attempt
    to update MongoDB and to upload both files to a MinIO bucket; failures of
    either service are printed and otherwise ignored.

    Args:
        i: Worker index into the list stored in ``index_json``.
        index_json: Path of the worker index written by ``set_config``.
        config_json: Path of the JSON config written by ``set_config``.
        pick_csv: Path of the pick table (read with ``pandas.read_csv``
            defaults; needs ``timestamp``, ``station_id``, ``amp``, ``type``
            and ``prob`` columns).
        station_csv: Path of the tab-separated station table.
        catalog_csv: Output path of the associated catalog.
        picks_csv: Output path of the annotated picks.
        bucket_name: MinIO bucket; also names the scratch directory under /tmp.
        s3_url: MinIO endpoint.
        secure: Whether to use TLS for MinIO.

    Returns:
        ``catalog_<first window index of this worker, zero-padded to 4>.csv``.
    """
    import json
    import os

    import numpy as np
    import pandas as pd
    from gamma.utils import association, from_seconds
    from tqdm import tqdm

    catalog_dir = os.path.join("/tmp/", bucket_name)
    os.makedirs(catalog_dir, exist_ok=True)

    ## read config
    with open(index_json, "r") as fp:
        index = json.load(fp)
    idx = index[i]

    with open(config_json, "r") as fp:
        config = json.load(fp)
    config["x(km)"] = (np.array(config["xlim_degree"]) - np.array(config["center"][0])) * config["degree2km"]
    config["y(km)"] = (np.array(config["ylim_degree"]) - np.array(config["center"][1])) * config["degree2km"]
    config["z(km)"] = (0, 60)

    ## read picks
    picks = pd.read_csv(pick_csv, parse_dates=["timestamp"])
    picks["id"] = picks["station_id"]
    picks["time_idx"] = picks["timestamp"].apply(lambda x: x.strftime("%Y-%m-%dT%H"))  ## process by hours

    ## read stations
    stations = pd.read_csv(station_csv, delimiter="\t")
    stations = stations.rename(columns={"station": "id"})
    stations["x(km)"] = stations["longitude"].apply(lambda x: (x - config["center"][0]) * config["degree2km"])
    stations["y(km)"] = stations["latitude"].apply(lambda x: (x - config["center"][1]) * config["degree2km"])
    stations["z(km)"] = stations["elevation(m)"].apply(lambda x: -x / 1e3)

    ### setting GMMA configs
    config["dims"] = ['x(km)', 'y(km)', 'z(km)']
    config["use_amplitude"] = True
    config["vel"] = {"p": 6.0, "s": 6.0 / 1.73}
    config["method"] = "BGMM"
    if config["method"] == "BGMM":
        config["oversample_factor"] = 4
    if config["method"] == "GMM":
        config["oversample_factor"] = 1

    # BFGS bounds
    config["bfgs_bounds"] = (
        (config["x(km)"][0] - 1, config["x(km)"][1] + 1),  # x
        (config["y(km)"][0] - 1, config["y(km)"][1] + 1),  # y
        (0, config["z(km)"][1] + 1),  # z
        (None, None),  # t
    )
    # DBSCAN
    # NOTE(review): the divisor uses 6.0 / 1.75 while config["vel"]["s"] uses
    # 6.0 / 1.73 -- confirm whether the difference is intended.
    config["dbscan_eps"] = min(
        15,
        np.sqrt(
            (stations["x(km)"].max() - stations["x(km)"].min()) ** 2
            + (stations["y(km)"].max() - stations["y(km)"].min()) ** 2
        )
        / (6.0 / 1.75),
    )  # s
    config["dbscan_min_samples"] = min(3, len(stations))

    # Filtering
    config["min_picks_per_eq"] = min(8, len(stations) // 2)
    config["max_sigma11"] = 2.0  # s
    config["max_sigma22"] = 2.0  # m/s
    config["max_sigma12"] = 1.0  # covariance

    # print(config)
    for k, v in config.items():
        print(f"{k}: {v}")

    ## if use amplitude
    if config["use_amplitude"]:
        picks = picks[picks["amp"] != 0]

    ## run GMMA association
    pbar = tqdm(sorted(list(set(picks["time_idx"]))))
    event_idx0 = 1  ## current earthquake index
    assignments = []
    if (len(picks) > 0) and (len(picks) < 5000):
        catalogs, assignments = association(picks, stations, config, event_idx0, method=config["method"], pbar=pbar,)
        event_idx0 += len(catalogs)
    else:
        catalogs = []
        for segment in pbar:
            picks_ = picks[picks["time_idx"] == segment]
            if len(picks_) == 0:
                continue
            catalog, assign = association(picks_, stations, config, event_idx0, method=config["method"], pbar=pbar,)
            event_idx0 += len(catalog)
            catalogs.extend(catalog)
            assignments.extend(assign)

    ## create catalog
    catalogs = pd.DataFrame(
        catalogs,
        columns=["time(s)"]
        + config["dims"]
        + ["magnitude", "sigma_time", "sigma_amp", "cov_time_amp", "event_idx", "prob_gamma",],
    )
    catalogs["time"] = catalogs["time(s)"].apply(lambda x: from_seconds(x))
    catalogs["longitude"] = catalogs["x(km)"].apply(lambda x: x / config["degree2km"] + config["center"][0])
    catalogs["latitude"] = catalogs["y(km)"].apply(lambda x: x / config["degree2km"] + config["center"][1])
    catalogs["depth(m)"] = catalogs["z(km)"].apply(lambda x: x * 1e3)
    catalogs.sort_values(by=["time"], inplace=True)

    catalog_columns = [
        "time",
        "magnitude",
        "longitude",
        "latitude",
        "depth(m)",
        "sigma_time",
        "sigma_amp",
        "cov_time_amp",
        "prob_gamma",
        "event_idx",
    ]
    picks_columns = ["id", "timestamp", "type", "prob", "amp", "prob_gamma", "event_idx"]

    # The writers are nested so they stay inside this self-contained function.
    def write_catalog(path):
        """Write the catalog table to *path* as tab-separated text."""
        with open(path, 'w') as fp:
            catalogs.to_csv(
                fp,
                sep="\t",
                index=False,
                float_format="%.3f",
                date_format='%Y-%m-%dT%H:%M:%S.%f',
                columns=catalog_columns,
            )

    def write_picks(path):
        """Write the annotated pick table to *path* as tab-separated text."""
        with open(path, 'w') as fp:
            picks.to_csv(
                fp,
                sep="\t",
                index=False,
                date_format='%Y-%m-%dT%H:%M:%S.%f',
                columns=picks_columns,
            )

    write_catalog(catalog_csv)

    ## add assignment to picks
    assignments = pd.DataFrame(assignments, columns=["pick_idx", "event_idx", "prob_gamma"])
    picks = picks.join(assignments.set_index("pick_idx")).fillna(-1).astype({'event_idx': int})
    picks.sort_values(by=["timestamp"], inplace=True)
    write_picks(picks_csv)

    ## upload to mongodb (best effort)
    try:
        from pymongo import MongoClient

        # NOTE(review): credentials are hard-coded; consider injecting them
        # through the environment or a secret.
        username = "root"
        password = "quakeflow123"
        mongodb_url = "quakeflow-mongodb.default.svc.cluster.local:27017"
        client = MongoClient(f"mongodb://{username}:{password}@{mongodb_url}")
        db = client["quakeflow"]
        collection = db["waveform"]
        for _, p in tqdm(picks.iterrows(), desc="Uploading to mongodb"):
            # Collection.update was removed in PyMongo 4; update_one performs
            # the same single-document update.
            collection.update_one(
                {"_id": f"{p['station_id']}_{p['timestamp'].isoformat(timespec='milliseconds')}_{p['type']}"},
                {"$set": {"event_index": p["event_idx"]}},
            )
    except Exception as err:
        print(f"ERROR: can not access mongodb service! \n{err}")

    ## upload to s3 bucket (best effort)
    try:
        from minio import Minio

        minioClient = Minio(s3_url, access_key='minio', secret_key='minio123', secure=secure)
        if not minioClient.bucket_exists(bucket_name):
            minioClient.make_bucket(bucket_name)

        catalog_name = f"catalog_{idx[0]:04d}.csv"
        write_catalog(os.path.join(catalog_dir, catalog_name))
        minioClient.fput_object(
            bucket_name,
            f"{config['region']}/{catalog_name}",
            os.path.join(catalog_dir, catalog_name),
        )

        picks_name = f"picks_{idx[0]:04d}.csv"
        write_picks(os.path.join(catalog_dir, picks_name))
        minioClient.fput_object(
            bucket_name,
            f"{config['region']}/{picks_name}",
            os.path.join(catalog_dir, picks_name),
        )

    except Exception as err:
        print(f"ERROR: can not access minio service! \n{err}")

    return f"catalog_{idx[0]:04d}.csv"

In [833]:
# Local run: associate the PhaseNet picks of worker 0; MinIO is addressed without TLS.
if run_local:
    catalog = gamma(
        i=0,
        index_json=root_dir("index.json"),
        config_json=root_dir("config.json"),
        pick_csv=root_dir("phasenet/picks.csv"),
        station_csv=root_dir("stations.csv"),
        catalog_csv=root_dir("catalog.csv"),
        picks_csv=root_dir("picks.csv"),
        bucket_name="catalogs",
        s3_url="localhost:9000",
        secure=False,
    )


In [834]:
# Pipeline op for gamma, built on python:3.8 with the pip packages listed below.
gamma_op = comp.func_to_container_op(
    gamma,
    base_image="python:3.8",
    packages_to_install=[
        "pandas",
        "numpy",
        "scikit-learn",
        "tqdm",
        "minio",
        "gmma",
        "pymongo",
    ],
)


## 8. Plot catalogs

In [835]:
# Local run: quick-look map of the associated catalog written by the gamma step.
if run_local:
    # %run plot_catalog.ipynb
    import pandas
    import matplotlib.pyplot as plt

    gamma_catalog = pandas.read_csv(root_dir("catalog.csv"), sep="\t")

    plt.figure()
    plt.plot(
        gamma_catalog["longitude"],
        gamma_catalog["latitude"],
        '.',
    )
    plt.show()

## 9. Parallel processing on cloud

Only run this section for parallel jobs on the cloud. A configured cloud environment is required.

In [836]:
def merge_catalog(
    config_json: InputPath("json"),
    catalog_csv: OutputPath(str),
    picks_csv: OutputPath(str),
    bucket_name: str = "catalogs",
    s3_url: str = "minio-service:9000",
    secure: bool = True,
):
    """Merge the per-chunk catalog/pick CSVs stored in MinIO into single files.

    Every object under ``<region>/`` in ``bucket_name`` is downloaded to
    ``/tmp``. All ``catalog_*.csv`` files are concatenated and sorted by
    ``time``; all ``picks_*.csv`` files are concatenated and sorted by
    ``timestamp``. Events are renumbered 0..N-1 in time order and every pick's
    ``event_idx`` is rewritten to the new number (``-1`` when the pick's
    original ``(event_idx, file)`` pair matches no event).

    The merged tables are uploaded as ``<region>/gamma_catalog.csv`` and
    ``<region>/gamma_picks.csv`` and copied to the two output paths. When no
    ``catalog_*.csv`` file is found, both outputs are created empty.

    Args:
        config_json: Path to the JSON config; only ``config["region"]`` is read.
        catalog_csv: Output path of the merged (tab-separated) catalog.
        picks_csv: Output path of the merged (tab-separated) picks.
        bucket_name: MinIO bucket that holds the per-chunk files.
        s3_url: ``host:port`` of the MinIO service.
        secure: Whether to connect to MinIO over TLS.
    """
    # Imports stay inside the function: Kubeflow ships only this function's
    # source into the container.
    import json
    import os
    import shutil
    from glob import glob

    import pandas as pd
    from minio import Minio

    # NOTE: the MinIO credentials are hard-coded here.
    minioClient = Minio(s3_url, access_key='minio', secret_key='minio123', secure=secure)

    with open(config_json, "r") as fp:
        config = json.load(fp)

    def tmp_path(name):
        return os.path.join("/tmp/", name)

    def read_with_file_idx(path):
        """Load one chunk file as strings and tag rows with its numeric suffix."""
        frame = pd.read_csv(path, sep="\t", dtype=str)
        # splitext instead of rstrip(".csv"): rstrip removes a *set of
        # characters*, not a suffix.
        frame["file_idx"] = os.path.splitext(path)[0].split("_")[-1]
        return frame

    def match_ids(frame):
        """Key that is unique across chunks: '<event_idx>_<file_idx>'."""
        return frame.apply(lambda row: f'{row["event_idx"]}_{row["file_idx"]}', axis=1)

    for obj in minioClient.list_objects(bucket_name, prefix=config["region"], recursive=True):
        # Use the public attribute rather than the private `_object_name`.
        print(obj.object_name)
        minioClient.fget_object(bucket_name, obj.object_name, tmp_path(obj.object_name.split("/")[-1]))

    files_catalog = sorted(glob(tmp_path("catalog_*.csv")))
    files_picks = sorted(glob(tmp_path("picks_*.csv")))

    if files_catalog:
        # A single stable sort with a fresh 0..N-1 index; that index becomes
        # the new global event number below.
        merged_catalog = pd.concat([read_with_file_idx(f) for f in files_catalog]).sort_values(
            by="time", kind="stable", ignore_index=True
        )
        merged_picks = pd.concat([read_with_file_idx(f) for f in files_picks]).sort_values(by="timestamp")

        merged_catalog["match_id"] = match_ids(merged_catalog)
        merged_picks["match_id"] = match_ids(merged_picks)

        merged_catalog.drop(columns=["event_idx", "file_idx"], inplace=True)
        merged_picks.drop(columns=["event_idx", "file_idx"], inplace=True)

        merged_catalog["event_idx"] = merged_catalog.index.values
        mapping = dict(zip(merged_catalog["match_id"], merged_catalog["event_idx"]))
        merged_picks["event_idx"] = merged_picks["match_id"].apply(lambda key: mapping.get(key, -1))

        merged_catalog.drop(columns=["match_id"], inplace=True)
        merged_picks.drop(columns=["match_id"], inplace=True)

        merged_catalog.to_csv(tmp_path("gamma_catalog.csv"), sep="\t", index=False)
        minioClient.fput_object(
            bucket_name,
            f"{config['region']}/gamma_catalog.csv",
            tmp_path("gamma_catalog.csv"),
        )
        merged_picks.to_csv(tmp_path("gamma_picks.csv"), sep="\t", index=False)
        minioClient.fput_object(
            bucket_name,
            f"{config['region']}/gamma_picks.csv",
            tmp_path("gamma_picks.csv"),
        )

        shutil.copyfile(tmp_path("gamma_catalog.csv"), catalog_csv)
        shutil.copyfile(tmp_path("gamma_picks.csv"), picks_csv)
    else:
        # Still create both outputs (empty) so the declared output files exist.
        with open(catalog_csv, "w"):
            pass
        print("No catalog.csv found!")
        with open(picks_csv, "w"):
            pass
        print("No picks.csv found!")


In [837]:
# Kubeflow component wrapping `merge_catalog`.
merge_op = comp.func_to_container_op(
    merge_catalog,
    base_image="python:3.8",
    packages_to_install=[
        "pandas",
        "minio",
    ],
)


In [838]:
def split_hypodd(
    config_json: InputPath("json"),
    picks_csv: InputPath(str),
    catalog_csv: InputPath(str),
    bucket_name: str = "catalogs",
    s3_url: str = "minio-service:9000",
    secure: bool = True,
) -> list:
    """Convert the merged catalog and picks into hypoDD phase files.

    Events are written in catalog order to ``/tmp/hypoDD_XXX.pha``. The catalog
    is cut into roughly equal segments, enough of them that none reaches about
    10,000 events (the first segment holds one event fewer than the others).
    For every event a header line
    ``# year month day hour minute second lat lon depth_km mag EH EZ RMS id``
    is written (``EH``/``EZ`` are always 0, ``RMS`` is the ``sigma_time``
    column, depth is ``depth(m)`` / 1000), followed by one line per associated
    pick: ``<station><channel>  travel_time_s  prob  PHASE``.

    The files are then uploaded to ``<region>/hypoDD_XXX.pha`` in MinIO; an
    upload failure is only reported, not raised.

    Args:
        config_json: Path to the JSON config; only ``config["region"]`` is read.
        picks_csv: Tab-separated picks with columns ``id``, ``timestamp``,
            ``type``, ``prob`` and ``event_idx``.
        catalog_csv: Tab-separated catalog with columns ``time``, ``latitude``,
            ``longitude``, ``depth(m)``, ``magnitude``, ``sigma_time`` and
            ``event_idx``.
        bucket_name: MinIO bucket that receives the phase files.
        s3_url: ``host:port`` of the MinIO service.
        secure: Whether to connect to MinIO over TLS.

    Returns:
        The list of segment indices ``[0, 1, ..., last]`` that were written.
        A catalog without events yields ``[0]`` and an empty phase file.
    """
    # Imports stay inside the function: Kubeflow ships only this function's
    # source into the container.
    import json
    import os
    from datetime import datetime

    import pandas as pd

    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is cosmetic; fall back to plain iteration.
        def tqdm(iterable, **kwargs):
            return iterable

    with open(config_json, "r") as fp:
        config = json.load(fp)

    picks = pd.read_csv(picks_csv, sep="\t")
    events = pd.read_csv(catalog_csv, sep="\t")

    def tmp_path(name):
        return os.path.join("/tmp/", name)

    time_format = "%Y-%m-%dT%H:%M:%S.%f"

    # Segment size in integer arithmetic. max(..., 0) keeps an empty catalog
    # from producing zero segments (and a division by zero).
    num_events = len(events)
    num_segments = max(num_events - 1, 0) // 10000 + 1
    max_event = num_events // num_segments + 1

    # event_idx -> index labels of the picks that belong to that event
    picks_by_event = picks.groupby("event_idx").groups

    idx = 0
    out_file = open(tmp_path(f"hypoDD_{idx:03d}.pha"), "w")
    try:
        for i, (_, event) in enumerate(tqdm(events.iterrows(), total=num_events)):
            if i % max_event == max_event - 1:
                # Segment boundary: roll over to the next phase file.
                out_file.close()
                idx = (i + 1) // max_event
                out_file = open(tmp_path(f"hypoDD_{idx:03d}.pha"), "w")

            event_time = datetime.strptime(event["time"], time_format)
            lat = event["latitude"]
            lng = event["longitude"]
            dep = event["depth(m)"] / 1e3
            mag = event["magnitude"]
            EH = 0
            EZ = 0
            RMS = event["sigma_time"]

            year, month, day, hour, minute = (
                event_time.year,
                event_time.month,
                event_time.day,
                event_time.hour,
                event_time.minute,
            )
            sec = float(event_time.strftime("%S.%f"))
            out_file.write(
                f"# {year:4d} {month:2d} {day:2d} {hour:2d} {minute:2d} {sec:5.2f}  {lat:7.4f} {lng:9.4f}   {dep:5.2f} {mag:5.2f} {EH:5.2f} {EZ:5.2f} {RMS:5.2f} {event['event_idx']:9d}\n"
            )

            # `groups` stores index *labels*, so look the picks up with .loc.
            for label in picks_by_event[event["event_idx"]]:
                pick = picks.loc[label]
                network_code, station_code, comp_code, channel_code = pick['id'].split('.')
                phase_type = pick['type'].upper()
                phase_weight = pick['prob']
                pick_time = (datetime.strptime(pick["timestamp"], time_format) - event_time).total_seconds()
                tmp_code = f"{station_code}{channel_code}"
                out_file.write(f"{tmp_code:<7s}   {pick_time:6.3f}   {phase_weight:5.4f}   {phase_type}\n")
    finally:
        # Close the current segment even if a row fails to convert.
        out_file.close()

    try:
        from minio import Minio

        # NOTE: the MinIO credentials are hard-coded here.
        minioClient = Minio(s3_url, access_key='minio', secret_key='minio123', secure=secure)
        for i in range(idx + 1):
            minioClient.fput_object(
                bucket_name, f"{config['region']}/hypoDD_{i:03d}.pha", tmp_path(f"hypoDD_{i:03d}.pha")
            )

    except Exception as err:
        # Best effort: the phase files stay available under /tmp.
        print(f"ERROR: can not access minio service! \n{err}")

    return list(range(idx + 1))


In [839]:
if run_local:
    # Build the hypoDD phase files from the local catalog; `num_split` keeps
    # the returned list of segment indices for the ph2dt loop further down.
    num_split = split_hypodd(
        root_dir("config.json"),
        root_dir("picks.csv"),
        root_dir("catalog.csv"),
    )
    # print(split_hypodd(root_dir("config.json"), root_dir("gamma_picks.csv"), root_dir("gamma_catalog.csv")))

In [840]:
# Kubeflow component wrapping `split_hypodd`.
split_hypodd_op = comp.func_to_container_op(
    split_hypodd,
    base_image="python:3.8",
    packages_to_install=[
        "pandas",
        "tqdm",
        "minio",
    ],
)

In [841]:
def convert_station(
    station_csv: InputPath(str),
    hypoinverse_station: OutputPath(str),
    hypodd_station: OutputPath(str),
):
    """Convert the station table into HypoInverse and hypoDD station files.

    Args:
        station_csv: Tab-separated station table with the columns ``station``
            (``network.station.location.channel``), ``latitude``,
            ``longitude`` and ``elevation(m)``.
        hypoinverse_station: Output path; one fixed-width line per input row
            with degrees, decimal minutes and hemisphere letters.
        hypodd_station: Output path; one ``<station><channel> lat lon`` line
            per distinct station+channel code (a later row with the same code
            replaces the earlier one).
    """
    # Imports stay inside the function: Kubeflow ships only this function's
    # source into the container.
    import pandas as pd

    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is cosmetic; fall back to plain iteration.
        def tqdm(iterable, **kwargs):
            return iterable

    stations = pd.read_csv(station_csv, sep="\t")

    converted_hypoinverse = []
    converted_hypoDD = {}

    for _, row in tqdm(stations.iterrows(), total=len(stations)):
        network_code, station_code, comp_code, channel_code = row["station"].split(".")
        latitude = row["latitude"]
        longitude = row["longitude"]
        elevation = row["elevation(m)"]
        station_weight = " "

        # int() truncates toward zero, so the degree part of e.g. -0.5 is 0.
        lat_degree = int(latitude)
        lat_minute = (latitude - lat_degree) * 60
        lng_degree = int(longitude)
        lng_minute = (longitude - lng_degree) * 60
        # Take the hemisphere from the coordinate itself: the truncated degree
        # loses the sign for |coordinate| < 1.
        north = "N" if latitude >= 0 else "S"
        west = "W" if longitude <= 0 else "E"

        converted_hypoinverse.append(
            f"{station_code:<5} {network_code:<2} {comp_code[:-1]:<1}{channel_code:<3} {station_weight}{abs(lat_degree):2.0f} {abs(lat_minute):7.4f}{north}{abs(lng_degree):3.0f} {abs(lng_minute):7.4f}{west}{elevation:4.0f}\n"
        )

        tmp_code = f"{station_code}{channel_code}"
        converted_hypoDD[tmp_code] = f"{tmp_code:<8s} {latitude:.3f} {longitude:.3f}\n"

    with open(hypoinverse_station, 'w') as f:
        f.writelines(converted_hypoinverse)

    with open(hypodd_station, 'w') as f:
        f.writelines(converted_hypoDD.values())

In [842]:
if run_local:
    # Write both station files next to the local stations.csv.
    convert_station(
        root_dir("stations.csv"),
        root_dir("stations_hypoinverse.dat"),
        root_dir("stations_hypoDD.dat"),
    )

In [843]:
# Kubeflow component wrapping `convert_station`.
convert_station_op = comp.func_to_container_op(
    convert_station,
    base_image="python:3.8",
    packages_to_install=[
        "pandas",
        "tqdm",
    ],
)

In [844]:
def ph2dt(
    i: int,
    config_json: InputPath("json"),
    station_dat: InputPath(str),
    ct_file: OutputPath(str),
    hypodd_event: OutputPath(str),
    bucket_name: str = "catalogs",
    s3_url: str = "minio-service:9000",
    secure: bool = True,
) -> str:
    """Run hypoDD's ``ph2dt`` on phase-file segment ``i``.

    The segment ``<region>/hypoDD_{i:03d}.pha`` is fetched from MinIO into
    ``/tmp`` (a fetch failure is only reported, so a file already present
    there is used as is), copied to ``hypoDD.pha`` in the working directory
    together with the station file, and ``HYPODD/src/ph2dt/ph2dt ph2dt.inp``
    is executed. The resulting ``dt.ct`` and ``event.sel`` are copied to the
    output paths, renamed with the segment suffix and uploaded to MinIO
    (again best effort).

    Args:
        i: Segment index.
        config_json: Path to the JSON config; only ``config["region"]`` is read.
        station_dat: Station file, copied to ``stations_hypoDD.dat``.
        ct_file: Output path that receives ``dt.ct``.
        hypodd_event: Output path that receives ``event.sel``.
        bucket_name: MinIO bucket holding the phase files and results.
        s3_url: ``host:port`` of the MinIO service.
        secure: Whether to connect to MinIO over TLS.

    Returns:
        The name of the renamed differential-time file, ``dt_{i:03d}.ct``.

    Raises:
        RuntimeError: If the ``ph2dt`` command exits with a non-zero status.
    """
    # Imports stay inside the function: Kubeflow ships only this function's
    # source into the container.
    import json
    import os

    from minio import Minio

    with open(config_json, "r") as fp:
        config = json.load(fp)

    def tmp_path(name):
        return os.path.join("/tmp/", name)

    try:
        # NOTE: the MinIO credentials are hard-coded here.
        minioClient = Minio(s3_url, access_key='minio', secret_key='minio123', secure=secure)
        minioClient.fget_object(bucket_name, f"{config['region']}/hypoDD_{i:03d}.pha", tmp_path(f"hypoDD_{i:03d}.pha"))
    except Exception as err:
        # Best effort: local runs already have the phase file in /tmp.
        print(f"ERROR: can not access minio service! \n{err}")

    # ph2dt.inp (not visible here) is expected to refer to these fixed names.
    os.system(f"cat {tmp_path(f'hypoDD_{i:03d}.pha')} > hypoDD.pha")
    os.system(f"cat {station_dat} > stations_hypoDD.dat")

    PH2DT_CMD = "HYPODD/src/ph2dt/ph2dt ph2dt.inp"
    if os.system(PH2DT_CMD) != 0:
        # The original raised a plain string, which itself fails with
        # "exceptions must derive from BaseException".
        raise RuntimeError(f"{PH2DT_CMD} failed!")

    os.system(f"cat dt.ct > {ct_file}")
    os.system(f"cat event.sel > {hypodd_event}")
    os.system(f"mv dt.ct dt_{i:03d}.ct")
    os.system(f"mv event.dat event_{i:03d}.dat")
    os.system(f"mv event.sel event_{i:03d}.sel")
    os.system("rm -f hypoDD.reloc.*")

    try:
        for name in (f"dt_{i:03d}.ct", f"event_{i:03d}.dat", f"event_{i:03d}.sel"):
            minioClient.fput_object(bucket_name, f"{config['region']}/{name}", name)
    except Exception as err:
        # Also reached when the client could not be created above.
        print(f"ERROR: can not access minio service! \n{err}")

    return f"dt_{i:03d}.ct"


In [845]:
if run_local:
    # Every segment writes to the same local dt.ct / event.sel, so the files
    # left behind are those of the last segment.
    for i in num_split:
        ph2dt(
            i,
            root_dir("config.json"),
            root_dir("stations_hypoDD.dat"),
            root_dir("dt.ct"),
            root_dir("event.sel"),
        )

In [846]:
# Kubeflow component wrapping `ph2dt`; it runs in the hypoDD image named below.
ph2dt_op = comp.func_to_container_op(ph2dt, base_image="zhuwq0/hypodd-api:1.0")

In [847]:
def hypodd_ct(
    i: int,
    config_json: InputPath("json"),
    ct_file: InputPath(str),
    event: InputPath(str),
    station: InputPath(str),
    inp_file: str = "hypoDD_ct.inp",
    bucket_name: str = "catalogs",
    s3_url: str = "minio-service:9000",
    secure: bool = True,
):
    """Relocate segment ``i`` with hypoDD using the given differential times.

    The inputs are copied to the fixed names ``dt.ct``, ``event.sel`` and
    ``stations_hypoDD.dat`` in the working directory, then
    ``HYPODD/src/hypoDD/hypoDD <inp_file>`` is executed. The resulting
    ``hypoDD.reloc`` is copied to ``/tmp/hypoDD_ct_{i:03d}.reloc`` and uploaded
    to ``<region>/hypoDD_ct_{i:03d}.reloc`` in MinIO; an upload failure is only
    reported, not raised.

    Args:
        i: Segment index, used in the output file name.
        config_json: Path to the JSON config; only ``config["region"]`` is read.
        ct_file: Differential-time file, copied to ``dt.ct``.
        event: Event file, copied to ``event.sel``.
        station: Station file, copied to ``stations_hypoDD.dat``.
        inp_file: hypoDD control file passed on the command line.
        bucket_name: MinIO bucket that receives the relocation file.
        s3_url: ``host:port`` of the MinIO service.
        secure: Whether to connect to MinIO over TLS.

    Raises:
        RuntimeError: If the hypoDD command exits with a non-zero status.
    """
    # Imports stay inside the function: Kubeflow ships only this function's
    # source into the container.
    import json
    import os

    from minio import Minio

    with open(config_json, "r") as fp:
        config = json.load(fp)

    reloc_path = os.path.join("/tmp/", f"hypoDD_ct_{i:03d}.reloc")

    os.system(f"cat {ct_file} > dt.ct")
    os.system(f"cat {event} > event.sel")
    os.system(f"cat {station} > stations_hypoDD.dat")

    HYPODD_CMD = f"HYPODD/src/hypoDD/hypoDD {inp_file}"
    if os.system(HYPODD_CMD) != 0:
        # The original raised a plain string, which itself fails with
        # "exceptions must derive from BaseException".
        raise RuntimeError(f"{HYPODD_CMD} failed!")
    os.system(f"cat hypoDD.reloc > {reloc_path}")

    try:
        # NOTE: the MinIO credentials are hard-coded here.
        minioClient = Minio(s3_url, access_key='minio', secret_key='minio123', secure=secure)
        minioClient.fput_object(
            bucket_name,
            f"{config['region']}/hypoDD_ct_{i:03d}.reloc",
            reloc_path,
        )
    except Exception as err:
        # Best effort: the relocation file stays available under /tmp.
        print(f"ERROR: can not access minio service! \n{err}")


In [848]:
if run_local:
    # Relocate segment 0 from the local ph2dt outputs.
    hypodd_ct(
        0,
        root_dir("config.json"),
        root_dir("dt.ct"),
        root_dir("event.sel"),
        root_dir("stations_hypoDD.dat"),
    )

In [849]:
# Kubeflow component wrapping `hypodd_ct`; it runs in the hypoDD image named below.
hypodd_ct_op = comp.func_to_container_op(hypodd_ct, base_image="zhuwq0/hypodd-api:1.0")

In [850]:
if run_local:
    import pandas as pd
    import matplotlib.pyplot as plt
    from datetime import datetime

    # Read the whitespace-separated relocation file of segment 0. The raw
    # string avoids the invalid "\s" escape of a normal string literal.
    catalog_hypoDD = pd.read_csv(
        f"/tmp/hypoDD_ct_{0:03d}.reloc",
        sep=r"\s+",
        names=[
            "ID", "LAT", "LON", "DEPTH", "X", "Y", "Z", "EX", "EY", "EZ",
            "YR", "MO", "DY", "HR", "MI", "SC", "MAG",
            "NCCP", "NCCS", "NCTP", "NCTS", "RCC", "RCT", "CID",
        ],
    )
    # Build an ISO timestamp per row. Seconds are capped at 59.999 so that a
    # value of 60 still parses, and padded to width 6 ("SS.fff").
    catalog_hypoDD["time"] = catalog_hypoDD.apply(
        lambda x: f'{x["YR"]:04.0f}-{x["MO"]:02.0f}-{x["DY"]:02.0f}T{x["HR"]:02.0f}:{x["MI"]:02.0f}:{min(x["SC"], 59.999):06.3f}',
        axis=1,
    )
    catalog_hypoDD["time"] = catalog_hypoDD["time"].apply(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%f"))
    plt.figure()
    plt.plot(catalog_hypoDD["LON"], catalog_hypoDD["LAT"], '.')


In [851]:
def cross_correlation(
    ct_file: InputPath(str),
    catalog_file: InputPath(str),
    picks_file: InputPath(str),
    cc_file: OutputPath(str),
):
    """Compute waveform cross-correlation delays for the event pairs in ``ct_file``.

    ``ct_file`` is read as blocks: each line starting with ``#`` opens a block
    (its 2nd and 3rd whitespace-separated tokens are used as the two event
    IDs), and each following line is split into station, two travel times, a
    weight and a phase. For every such line, the waveforms of the matching
    picks of both events are fetched from MongoDB, cross-correlated, and a
    line ``station  shift  coefficient  phase`` is produced. The work is
    spread over 16 processes and the result is written to ``cc_file``.

    Args:
        ct_file: Differential-time file (presumably ph2dt's ``dt.ct`` - confirm
            against the pipeline wiring).
        catalog_file: Tab-separated catalog; it is loaded but not used further
            in this function.
        picks_file: Tab-separated picks with at least the columns ``id``,
            ``timestamp``, ``type`` and ``event_idx``.
        cc_file: Output path for the cross-correlation file.
    """

    import pandas as pd
    from multiprocessing import Process, Manager
    from pymongo import MongoClient
    import numpy as np
    from tqdm import tqdm
    import time

    # NOTE(review): `catalog` is never read after this point.
    catalog = pd.read_csv(
        catalog_file,
        sep="\t",
        parse_dates=["time"],
        index_col=["event_idx"],
        dtype={"event_idx": str},
    )
    picks = pd.read_csv(picks_file, sep="\t", parse_dates=["timestamp"], dtype={"event_idx": str})
    # Station key = 2nd + 4th dot-separated fields of the pick id, the same
    # station+channel code the ct file lines are looked up with below.
    picks["station"] = picks["id"].apply(lambda x: x.split(".")[1] + x.split(".")[3])
    picks = picks.set_index(["event_idx", "station", "type"])
    picks = picks.sort_index()

    # Correlation window in samples: [pick_index - 50, pick_index + 100).
    # NOTE(review): presumably the pick sits at sample 100 of the stored
    # waveform and dt is the sample interval in seconds - confirm against the
    # code that writes the "waveform" collection.
    pick_index = 100
    lo = pick_index - 50
    hi = pick_index + 100
    dt = 0.01

    # Manager-backed dicts so the worker processes can read the input blocks
    # and write their results.
    ct_dict = Manager().dict()
    cc_dict = Manager().dict()
    with open(ct_file) as fct:
        meta = fct.readlines()
        for i, line in enumerate(meta):
            if line[0] == "#":
                # A new header closes the previous block (if any).
                if i > 0:
                    ct_dict[key] = value
                key = line
                value = []
                continue
            value.append(line)
        # Store the last block.
        # NOTE(review): with an empty ct file `key` is unbound here.
        ct_dict[key] = value
    keys = sorted(list(ct_dict.keys()))

    def calc_cross_correlation(keys, ct_dict, cc_dict):
        # Worker: processes the given header keys and fills cc_dict.
        # NOTE: the MongoDB credentials and host are hard-coded here.
        username = "root"
        password = "quakeflow123"
        # client = MongoClient(f"mongodb://{username}:{password}@127.0.0.1:27017")
        client = MongoClient(f"mongodb://{username}:{password}@quakeflow-mongodb.default.svc.cluster.local:27017")
        db = client["quakeflow"]
        collection = db["waveform"]
        # normalize = lambda x: (x - np.mean(x, axis=0, keepdims=True)) / np.std(x, axis=0, keepdims=True)

        for key in keys:
            tmp = key.split()
            ID1, ID2 = tmp[1], tmp[2]
            key_cc = f"#    {ID1}    {ID2}    0.0\n"
            lines_cc = []
            for line in ct_dict[key]:
                tmp = line.split()
                STA, TT1, TT2, WGT, PHA = (
                    tmp[0],
                    tmp[1],
                    tmp[2],
                    tmp[3],
                    tmp[4],
                )  ##HypoDD format

                # All picks of event 1 for this station/phase ...
                # NOTE(review): `.loc` raises KeyError when the
                # (event, station, phase) combination has no pick.
                for i, row1 in picks.loc[(ID1, STA, PHA)].iterrows():

                    # Waveform documents are keyed "<pick id>_<ISO time, ms>_<phase>".
                    data1 = collection.find_one(
                        {"_id": f"{row1['id']}_{row1['timestamp'].isoformat(timespec='milliseconds')}_{PHA}"}
                    )

                    # ... against all picks of event 2 for the same station/phase.
                    for j, row2 in picks.loc[(ID2, STA, PHA)].iterrows():

                        data2 = collection.find_one(
                            {"_id": f"{row2['id']}_{row2['timestamp'].isoformat(timespec='milliseconds')}_{PHA}"}
                        )

                        # if PHA == "P":  # Z
                        #     waveform1 = np.array(data1["waveform"])[lo:hi, -1:]
                        #     waveform2 = np.array(data2["waveform"])[lo:hi, -1:]
                        # elif PHA == "S":  # E, N
                        #     waveform1 = np.array(data1["waveform"])[lo:hi, :-1]
                        #     waveform2 = np.array(data2["waveform"])[lo:hi, :-1]
                        # else:
                        #     raise (Exception("PHA must be P or S"))
                        # NOTE(review): find_one returns None for a missing
                        # document, in which case the subscript below fails.
                        waveform1 = np.array(data1["waveform"])[lo:hi, :]
                        waveform2 = np.array(data2["waveform"])[lo:hi, :]

                        # Sum the per-column cross-correlations, then
                        # normalize by the product of the total energies.
                        cc = np.zeros(waveform1.shape[0])
                        for k in range(waveform1.shape[1]):
                            cc += np.correlate(waveform1[:, k], waveform2[:, k], mode="same")
                        norm = np.sqrt(np.sum(waveform1**2) * np.sum(waveform2**2))
                        if norm == 0:
                            continue
                        else:
                            cc /= norm
                        # Lag of the absolute peak relative to the window
                        # centre, scaled by dt, plus the travel-time
                        # difference from the ct line.
                        shift = (np.argmax(np.abs(cc)) - waveform1.shape[0] // 2) * dt + float(TT1) - float(TT2)
                        coeff = np.max(np.abs(cc))

                        if not np.isnan(coeff):
                            lines_cc.append(f"{STA:<7s}    {shift:.5f}    {coeff:.3f}    {PHA}\n")

                # Written after every ct line, so the entry always holds the
                # lines collected so far; a block without lines is never stored.
                cc_dict[key_cc] = lines_cc

        return 0

    t0 = time.time()
    processes = []
    num_process = 16
    # Worker i handles every num_process-th key, starting at offset i.
    for i in range(num_process):
        p = Process(target=calc_cross_correlation, args=(keys[i::num_process], ct_dict, cc_dict))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    print(f"{num_process} process: time = {time.time()-t0:.1f}")

    # Each header is followed by its correlation lines.
    with open(cc_file, "w") as fcc:
        for key in cc_dict:
            fcc.write(key)
            for line in cc_dict[key]:
                fcc.write(line)


In [852]:
# Kubeflow component wrapping `cross_correlation`.
cc_op = comp.func_to_container_op(
    cross_correlation,
    base_image="python:3.8",
    packages_to_install=[
        "pandas",
        "tqdm",
        "minio",
        "pymongo",
    ],
)

In [853]:
def hypodd_cc(
    i: int,
    config_json: InputPath("json"),
    ct_file: InputPath(str),
    cc_file: InputPath(str),
    event: InputPath(str),
    station: InputPath(str),
    inp_file: str = "hypoDD_cc.inp",
    bucket_name: str = "catalogs",
    s3_url: str = "minio-service:9000",
    secure: bool = True,
):
    """Relocate segment ``i`` with hypoDD using catalog and cross-correlation times.

    The inputs are copied to the fixed names ``dt.ct``, ``dt.cc``,
    ``event.sel`` and ``stations_hypoDD.dat`` in the working directory, then
    ``HYPODD/src/hypoDD/hypoDD <inp_file>`` is executed. The resulting
    ``hypoDD.reloc`` is renamed to ``hypoDD_cc_{i:03d}.reloc`` and uploaded to
    ``<region>/hypoDD_cc_{i:03d}.reloc`` in MinIO. Unlike the other hypoDD
    steps, an upload failure is not caught here.

    Args:
        i: Segment index, used in the output file name.
        config_json: Path to the JSON config; only ``config["region"]`` is read.
        ct_file: Differential-time file, copied to ``dt.ct``.
        cc_file: Cross-correlation file, copied to ``dt.cc``.
        event: Event file, copied to ``event.sel``.
        station: Station file, copied to ``stations_hypoDD.dat``.
        inp_file: hypoDD control file passed on the command line.
        bucket_name: MinIO bucket that receives the relocation file.
        s3_url: ``host:port`` of the MinIO service.
        secure: Whether to connect to MinIO over TLS.

    Raises:
        RuntimeError: If the hypoDD command exits with a non-zero status.
    """
    # Imports stay inside the function: Kubeflow ships only this function's
    # source into the container.
    import json
    import os

    from minio import Minio

    with open(config_json, "r") as fp:
        config = json.load(fp)

    # NOTE: the MinIO credentials are hard-coded here.
    minioClient = Minio(s3_url, access_key='minio', secret_key='minio123', secure=secure)

    os.system(f"cat {ct_file} > dt.ct")
    os.system(f"cat {cc_file} > dt.cc")
    os.system(f"cat {event} > event.sel")
    os.system(f"cat {station} > stations_hypoDD.dat ")

    HYPODD_CMD = f"HYPODD/src/hypoDD/hypoDD {inp_file}"
    if os.system(HYPODD_CMD) != 0:
        # The original raised a plain string, which itself fails with
        # "exceptions must derive from BaseException".
        raise RuntimeError(f"{HYPODD_CMD} failed!")

    reloc_name = f"hypoDD_cc_{i:03d}.reloc"
    os.system(f"mv hypoDD.reloc {reloc_name}")

    minioClient.fput_object(
        bucket_name,
        f"{config['region']}/{reloc_name}",
        reloc_name,
    )


In [854]:
# Kubeflow component wrapping `hypodd_cc`; it runs in the hypoDD image named below.
hypodd_cc_op = comp.func_to_container_op(hypodd_cc, base_image="zhuwq0/hypodd-api:1.0")

In [855]:
def merge_hypodd(
    config_json: InputPath("json"),
    catalog_txt: OutputPath(str),
    bucket_name: str = "catalogs",
    s3_url: str = "minio-service:9000",
    secure: bool = True,
):
    """Concatenate the per-segment hypoDD relocation files into catalogs.

    Every object under ``<region>/hypoDD_`` in ``bucket_name`` is downloaded
    to ``/tmp``. The ``hypoDD_ct_*.reloc`` files are concatenated (in sorted
    name order) into ``hypoDD_ct_catalog.txt`` and the ``hypoDD_cc_*.reloc``
    files into ``hypoDD_cc_catalog.txt``; both are uploaded back to MinIO.

    ``catalog_txt`` first receives the ct catalog and is then replaced by the
    cc catalog, so the cc catalog is the component's final output (it is empty
    when no ``hypoDD_cc_*.reloc`` file exists).

    Args:
        config_json: Path to the JSON config; only ``config["region"]`` is read.
        catalog_txt: Output path of the merged catalog.
        bucket_name: MinIO bucket holding the relocation files.
        s3_url: ``host:port`` of the MinIO service.
        secure: Whether to connect to MinIO over TLS.
    """
    # Imports stay inside the function: Kubeflow ships only this function's
    # source into the container.
    import json
    import os
    import shutil
    from glob import glob

    from minio import Minio

    # NOTE: the MinIO credentials are hard-coded here.
    minioClient = Minio(s3_url, access_key='minio', secret_key='minio123', secure=secure)

    with open(config_json, "r") as fp:
        config = json.load(fp)

    def tmp_path(name):
        return os.path.join("/tmp/", name)

    def concatenate(sources, destination):
        """Write the bytes of all `sources`, in order, to `destination`.

        Unlike a shell ``cat <files> > out``, an empty list simply yields an
        empty file instead of making ``cat`` read from stdin.
        """
        with open(destination, "wb") as fout:
            for source in sources:
                with open(source, "rb") as fin:
                    shutil.copyfileobj(fin, fout)

    for obj in minioClient.list_objects(bucket_name, prefix=f"{config['region']}/hypoDD_", recursive=True):
        # Use the public attribute rather than the private `_object_name`.
        print(obj.object_name)
        minioClient.fget_object(bucket_name, obj.object_name, tmp_path(obj.object_name.split("/")[-1]))

    # "ct" first, "cc" second: the second move overwrites catalog_txt.
    for kind in ("ct", "cc"):
        reloc_files = sorted(glob(tmp_path(f"hypoDD_{kind}_*.reloc")))
        merged = tmp_path(f"hypoDD_{kind}_catalog.txt")
        print(f"cat {' '.join(reloc_files)} > {merged}")
        concatenate(reloc_files, merged)
        minioClient.fput_object(bucket_name, f"{config['region']}/hypoDD_{kind}_catalog.txt", merged)
        shutil.move(merged, catalog_txt)

In [856]:
# Kubeflow component wrapping `merge_hypodd`.
merge_hypodd_op = comp.func_to_container_op(
    merge_hypodd,
    base_image="python:3.8",
    packages_to_install=[
        "pandas",
        "tqdm",
        "minio",
    ],
)

In [857]:
# QuakeFlow pipeline definition.
#
# Stages, in order:
#   1. config, event download and station download;
#   2. per data chunk (ParallelFor over the config output): create a volume,
#      download waveforms, pick phases with PhaseNet, associate with GaMMA;
#   3. merge the per-chunk catalogs, convert the station table and split the
#      merged catalog into hypoDD segments;
#   4. per segment (ParallelFor over the split output): ph2dt, waveform
#      cross-correlation, hypoDD on catalog times, hypoDD with CC times;
#   5. merge the per-segment relocation files.
#
# Parameters:
#   data_path      mount point of the per-chunk data volume
#   num_parallel   forwarded to the config step
#   bucket_catalog MinIO bucket for catalogs, picks and hypoDD files
#   s3_url         host:port of the MinIO service
#   secure         whether MinIO is reached over TLS
#
# (Kept as comments rather than a docstring so the compiled pipeline's
# description metadata is not affected.)
@dsl.pipeline(name='QuakeFlow', description='')
def quakeflow_pipeline(
    data_path: str = "/tmp/",
    num_parallel=0,
    bucket_catalog: str = "catalogs",
    s3_url: str = "minio-service:9000",
    secure: bool = False,
):

    config = config_op(num_parallel)

    events = download_events_op(config.outputs["config_json"]).set_display_name('Download Events')

    stations = download_stations_op(config.outputs["config_json"]).set_display_name('Download Stations')

    with kfp.dsl.ParallelFor(config.outputs["output"]) as i:

        # One read-write-once volume per chunk, shared by download and picking.
        vop_ = dsl.VolumeOp(
            name="Create volume",
            resource_name=f"data-volume-{str(i)}",
            size="50Gi",
            modes=dsl.VOLUME_MODE_RWO,
        ).set_retry(3)

        download_op_ = (
            download_waveform_op(
                i,
                config.outputs["index_json"],
                config.outputs["config_json"],
                config.outputs["datetime_json"],
                stations.outputs["station_pkl"],
                data_path=data_path,
                bucket_name="waveforms",
                s3_url=s3_url,
                secure=secure,
            )
            .add_pvolumes({data_path: vop_.volume})
            .set_cpu_request("800m")
            .set_retry(3)
            .set_display_name('Download Waveforms')
        )
        download_op_.execution_options.caching_strategy.max_cache_staleness = "P30D"

        phasenet_op_ = (
            phasenet_op(
                download_op_.outputs["Output"],
                download_op_.outputs["fname_csv"],
                stations.outputs["station_csv"],
            )
            .add_pvolumes({data_path: download_op_.pvolume})
            .set_memory_request("9G")
            .set_retry(3)
            .set_display_name('PhaseNet Picking')
        )
        phasenet_op_.execution_options.caching_strategy.max_cache_staleness = "P30D"
        phasenet_op_.set_image_pull_policy("Always")

        gamma_op_ = (
            gamma_op(
                i,
                config.outputs["index_json"],
                config.outputs["config_json"],
                phasenet_op_.outputs["picks"],
                stations.outputs["station_csv"],
                # Previously hard-coded to "catalogs", which silently ignored
                # the `bucket_catalog` pipeline parameter.
                bucket_name=bucket_catalog,
                s3_url=s3_url,
                secure=secure,
            )
            .set_cpu_request("800m")
            .set_retry(3)
            .set_display_name('GaMMA Association')
        )
        gamma_op_.execution_options.caching_strategy.max_cache_staleness = "P30D"

    # Merging reads the per-chunk files from the bucket, so it only has an
    # explicit ordering dependency on the association step.
    merge_op_ = (
        merge_op(
            config.outputs["config_json"],
            bucket_name=bucket_catalog,
            s3_url=s3_url,
            secure=secure,
        )
        .after(gamma_op_)
        .set_display_name('Merge Catalog')
    )
    merge_op_.execution_options.caching_strategy.max_cache_staleness = "P30D"

    convert_station_op_ = convert_station_op(station_csv=stations.outputs["station_csv"])
    split_hypodd_op_ = (
        split_hypodd_op(
            config.outputs["config_json"],
            picks_csv=merge_op_.outputs["picks_csv"],
            catalog_csv=merge_op_.outputs["catalog_csv"],
            bucket_name=bucket_catalog,
            s3_url=s3_url,
            secure=secure,
        )
        .after(merge_op_)
        .set_display_name('Split Catalog')
    )
    split_hypodd_op_.execution_options.caching_strategy.max_cache_staleness = "P30D"
    split_hypodd_op_.set_image_pull_policy("Always")

    with kfp.dsl.ParallelFor(split_hypodd_op_.outputs["output"]) as i:

        ph2dt_op_ = ph2dt_op(
            i,
            config_json=config.outputs["config_json"],
            station_dat=convert_station_op_.outputs["hypodd_station"],
            bucket_name=bucket_catalog,
            s3_url=s3_url,
            secure=secure,
        ).set_display_name('HypoDD PH2DT')
        ph2dt_op_.execution_options.caching_strategy.max_cache_staleness = "P30D"
        ph2dt_op_.set_image_pull_policy("Always")

        cc_op_ = cc_op(
            ct=ph2dt_op_.outputs["ct"],
            picks=merge_op_.outputs["picks_csv"],
            catalog=merge_op_.outputs["catalog_csv"],
        ).set_display_name('Cross Correlation')
        cc_op_.execution_options.caching_strategy.max_cache_staleness = "P30D"
        cc_op_.set_image_pull_policy("Always")

        hypodd_ct_op_ = hypodd_ct_op(
            i,
            config_json=config.outputs["config_json"],
            ct=ph2dt_op_.outputs["ct"],
            event=ph2dt_op_.outputs["hypodd_event"],
            station=convert_station_op_.outputs["hypodd_station"],
            bucket_name=bucket_catalog,
            s3_url=s3_url,
            secure=secure,
        ).set_display_name('HypoDD')
        hypodd_ct_op_.execution_options.caching_strategy.max_cache_staleness = "P30D"
        hypodd_ct_op_.set_image_pull_policy("Always")

        hypodd_cc_op_ = hypodd_cc_op(
            i,
            config_json=config.outputs["config_json"],
            ct=ph2dt_op_.outputs["ct"],
            cc=cc_op_.outputs["cc"],
            event=ph2dt_op_.outputs["hypodd_event"],
            station=convert_station_op_.outputs["hypodd_station"],
            bucket_name=bucket_catalog,
            s3_url=s3_url,
            secure=secure,
        ).set_display_name('HypoDD + CC')
        hypodd_cc_op_.execution_options.caching_strategy.max_cache_staleness = "P30D"
        hypodd_cc_op_.set_image_pull_policy("Always")

    merge_hypodd_op_ = (
        merge_hypodd_op(
            config_json=config.outputs["config_json"], bucket_name=bucket_catalog, s3_url=s3_url, secure=secure
        )
        .after(hypodd_ct_op_)
        .after(hypodd_cc_op_)
        .set_display_name('Merge Catalog')
    )
    # "P0D": never reuse a cached result for the final merge.
    merge_hypodd_op_.execution_options.caching_strategy.max_cache_staleness = "P0D"
    merge_hypodd_op_.set_image_pull_policy("Always")

    # vop_.delete().after(merge_hypodd_op_)


In [858]:
import os

# Credentials file for Google Cloud; the path is specific to the author's machine.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/weiqiang/.dotbot/cloud/quakeflow_zhuwq.json"
experiment_name = "QuakeFlow"
pipeline_func = quakeflow_pipeline
run_name = f"{pipeline_func.__name__}_run"

# Values for the pipeline parameters of this run.
arguments = dict(
    data_path="/tmp",
    num_parallel=0,
    bucket_catalog="catalogs",
    s3_url="minio-service:9000",
    secure=False,
)

if not run_local:
    # Compile the pipeline to "<experiment_name>.zip" and submit a run to the
    # Kubeflow Pipelines endpoint below.
    pipeline_conf = kfp.dsl.PipelineConf()
    pipeline_conf.set_image_pull_policy("Always")
    pipeline_conf.ttl_seconds_after_finished = 60 * 10
    # client = kfp.Client(host="2dbc4e1ef495773d-dot-us-west1.pipelines.googleusercontent.com")
    client = kfp.Client(host="http://localhost:8080")
    kfp.compiler.Compiler().compile(pipeline_func, f"{experiment_name}.zip", pipeline_conf=pipeline_conf)
    results = client.create_run_from_pipeline_func(
        pipeline_func,
        experiment_name=experiment_name,
        run_name=run_name,
        arguments=arguments,
    )
