In [3]:
import numpy as np
import pandas as pd
from obspy import read
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import os
import dask.dataframe as dd

## Reading and combining all data in a single dataset 

In [3]:
# Load the Apollo 12 Grade-A event catalog; each row names one recording
# (the 'filename' column) and gives its event time and type.
cat_directory = '../data/lunar/training/catalogs/'
cat_file = f'{cat_directory}apollo12_catalog_GradeA_final.csv'
cat = pd.read_csv(cat_file)

# Bare last expression so the notebook renders the catalog as a table.
cat

Unnamed: 0,filename,time_abs(%Y-%m-%dT%H:%M:%S.%f),time_rel(sec),evid,mq_type
0,xa.s12.00.mhz.1970-01-19HR00_evid00002,1970-01-19T20:25:00.000000,73500.0,evid00002,impact_mq
1,xa.s12.00.mhz.1970-03-25HR00_evid00003,1970-03-25T03:32:00.000000,12720.0,evid00003,impact_mq
2,xa.s12.00.mhz.1970-03-26HR00_evid00004,1970-03-26T20:17:00.000000,73020.0,evid00004,impact_mq
3,xa.s12.00.mhz.1970-04-25HR00_evid00006,1970-04-25T01:14:00.000000,4440.0,evid00006,impact_mq
4,xa.s12.00.mhz.1970-04-26HR00_evid00007,1970-04-26T14:29:00.000000,52140.0,evid00007,deep_mq
...,...,...,...,...,...
71,xa.s12.00.mhz.1974-10-14HR00_evid00156,1974-10-14T17:43:00.000000,63780.0,evid00156,impact_mq
72,xa.s12.00.mhz.1975-04-12HR00_evid00191,1975-04-12T18:15:00.000000,65700.0,evid00191,impact_mq
73,xa.s12.00.mhz.1975-05-04HR00_evid00192,1975-05-04T10:05:00.000000,36300.0,evid00192,impact_mq
74,xa.s12.00.mhz.1975-06-24HR00_evid00196,1975-06-24T16:03:00.000000,57780.0,evid00196,impact_mq


Build the file paths of all catalogued `.mseed` files that actually exist on disk.

In [6]:
# Forward slashes are accepted on Windows, Linux and macOS. The previous
# backslash-only literal was treated as a single file name on POSIX systems,
# so os.path.isfile never matched and the list below came out empty.
mseed_folder_path = '../data/lunar/training/data/S12_GradeA'

# One catalog entry per recording; the catalog stores names without extension.
lunar_data_names = cat['filename'].values

# Candidate path for every catalogued recording.
lunar_data_file_path = [os.path.join(mseed_folder_path, f'{file_name}.mseed') for file_name in lunar_data_names]

# Keep only the recordings whose .mseed file is present on disk.
lunar_data_file_path = [file_path for file_path in lunar_data_file_path if os.path.isfile(file_path)]

# Define reading and preprocessing functions

In [18]:
def read_mseed(file_path):
  """
  Reads an mseed file and returns a Dask DataFrame with data, actual datetime values 
  for time, and sampling rate.

  Only the first trace of the file is used. The file is read eagerly: all
  samples are loaded into memory before being wrapped in a Dask DataFrame.

  Sample times are computed in one vectorized step as
  ``starttime + i / sampling_rate`` and rounded to the nearest microsecond
  (the resolution of Python ``datetime`` objects), instead of creating one
  UTCDateTime object per sample in a Python loop.

  Args:
      file_path (str): Path to the mseed file.

  Returns:
      dask.dataframe.DataFrame: Single-partition Dask DataFrame with columns
      'time', 'data', and 'sampling_rate'.

  Raises:
      ValueError: If the trace reports a non-positive sampling rate, which
          would make the sample spacing undefined.
  """

  # Read the mseed file
  st = read(file_path)
  tr = st[0]  # Only the first trace is used; any further traces are ignored

  # Extract data and sampling rate
  data = tr.data
  sampling_rate = tr.stats.sampling_rate

  if sampling_rate <= 0:
    raise ValueError(
      f"Trace in {file_path!r} has non-positive sampling rate {sampling_rate!r}"
    )

  # Start of the trace as a NumPy datetime (UTCDateTime -> naive datetime).
  start = np.datetime64(tr.stats.starttime.datetime, 'us')

  # Offset of every sample from the start, in whole microseconds.
  offsets_us = np.rint(np.arange(len(data)) / sampling_rate * 1e6).astype('int64')

  # Cast to nanosecond unit so the column dtype does not depend on how the
  # installed pandas version treats microsecond-unit datetime arrays.
  times = (start + offsets_us.astype('timedelta64[us]')).astype('datetime64[ns]')

  # sampling_rate is a scalar and is broadcast to every row.
  frame = pd.DataFrame({'time': times, 'data': data, 'sampling_rate': sampling_rate})

  # Create a Dask DataFrame
  df = dd.from_pandas(frame, npartitions=1)

  return df


def process_lunar_data_lazy(lunar_data_file_path, output_file):
  """
  Combines several mseed files into one de-duplicated, time-ordered Parquet dataset.

  Every path is loaded through read_mseed, the resulting Dask DataFrames are
  concatenated, rows that repeat an already seen 'time' value are removed,
  the remaining rows are sorted by 'time', and the result is written to
  Parquet with the pyarrow engine, without the index.

  Args:
      lunar_data_file_path (list): List of paths to the mseed files.
      output_file (str): Path to the output Parquet file.
  """

  # One Dask DataFrame per input file, in the order the paths were given.
  per_file_frames = [read_mseed(mseed_path) for mseed_path in lunar_data_file_path]

  # Concatenate, drop repeated timestamps, then order chronologically.
  unique_sorted_ddf = (
    dd.concat(per_file_frames, interleave_partitions=True)
    .drop_duplicates(subset=['time'])
    .sort_values('time')
  )

  # Writing the Parquet output is what triggers the Dask computation.
  unique_sorted_ddf.to_parquet(output_file, engine='pyarrow', write_index=False)

In [4]:
# Forward slashes are accepted on Windows, Linux and macOS; a backslash-only
# literal is not split into directories on POSIX systems.
path_to_data_folder = '../data/lunar/training/data'

# NOTE: despite the "_csv" suffix this is the path of the combined Parquet
# dataset; the name is kept because later cells refer to it.
path_to_save_combined_csv = os.path.join(path_to_data_folder, "combined_data.parquet")

In [20]:
# Build the combined Parquet dataset from every mseed file found on disk.
process_lunar_data_lazy(
  lunar_data_file_path=lunar_data_file_path,
  output_file=path_to_save_combined_csv,
)

In [5]:
# Re-open the combined dataset from disk as a Dask DataFrame.
ddf = dd.read_parquet(
  path_to_save_combined_csv,
  engine='pyarrow',
)

In [9]:
# Preview the first ten rows of the combined dataset.
ddf.head(n=10)

Unnamed: 0,time,data,sampling_rate
0,1970-01-19 00:00:00.665000,-6.153279e-14,6.625
1,1970-01-19 00:00:00.815943,-7.701288e-14,6.625
2,1970-01-19 00:00:00.966887,-8.396187e-14,6.625
3,1970-01-19 00:00:01.117830,-8.096155e-14,6.625
4,1970-01-19 00:00:01.268774,-7.097599e-14,6.625
5,1970-01-19 00:00:01.419717,-5.882623e-14,6.625
6,1970-01-19 00:00:01.570660,-4.801144e-14,6.625
7,1970-01-19 00:00:01.721604,-3.909115e-14,6.625
8,1970-01-19 00:00:01.872547,-3.042487e-14,6.625
9,1970-01-19 00:00:02.023491,-2.029018e-14,6.625


In [10]:
# Show the last 10 rows
ddf.tail(10)

Unnamed: 0,time,data,sampling_rate
8381201,1975-06-27 00:00:00.089170,7.402473e-16,6.625
8381202,1975-06-27 00:00:00.240113,7.598239e-16,6.625
8381203,1975-06-27 00:00:00.391057,7.794005e-16,6.625
8381204,1975-06-27 00:00:00.542000,7.98977e-16,6.625
8381205,1975-06-27 00:00:00.692943,3.466912e-17,6.625
8381206,1975-06-27 00:00:00.843887,-3.982647e-16,6.625
8381207,1975-06-27 00:00:00.994830,-5.580877e-16,6.625
8381208,1975-06-27 00:00:01.145774,-6.563002e-16,6.625
8381209,1975-06-27 00:00:01.296717,-3.281501e-16,6.625
8381210,1975-06-27 00:00:01.447660,0.0,6.625
