# Farm DAS correlation

In [None]:
import sys
sys.path.append(".")
sys.path.append("noisepy4das_repo/NoisePy4DAS-SeaDAS/src")
sys.path.append("noisepy4das_repo/NoisePy4DAS-SeaDAS/DASstore")

import os
import h5py
import math
import time
import DAS_module
import numpy as np
import pandas as pd
import matplotlib

from tqdm import tqdm
from obspy import UTCDateTime
from datetime import datetime
from datetime import timedelta
from functools import partial
from scipy.signal import butter
from scipy.signal import detrend
from scipy.signal import decimate
from scipy.signal import filtfilt
from scipy.signal import spectrogram
from dasstore.zarr import Client
from multiprocessing import Pool
from matplotlib import pyplot as plt
from das_util import read_decimate, get_tstamp, calc_NFFT

# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Use a larger default font size for every figure created below.
matplotlib.rcParams.update({'font.size': 16})
# Set the HDF5_USE_FILE_LOCKING environment variable to "FALSE".
# NOTE(review): presumably needed so the HDF5 files on the shared data volume
# can be opened without file locking -- confirm.
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"

# QC on metadata

### Sort files by time

In [None]:
# Directory holding the raw 1-min DAS files.
data_dir = '/1-fnp/petasaur/p-wd05/harper_plots'

# File names and the time stamp that get_tstamp() derives from each name.
file_list = np.array(os.listdir(data_dir))
acqu_time = np.array(list(map(get_tstamp, file_list)))

# Order everything by the time offset relative to the first listed file.
offset_from_first = acqu_time - acqu_time[0]
new_index = np.argsort(offset_from_first)
file_list, acqu_time = file_list[new_index], acqu_time[new_index]

# Full paths, in the same (time-sorted) order as file_list.
file_path = [os.path.join(data_dir, fname) for fname in file_list]

print(file_list[:5])
print('Total number of files:', len(file_path))

### time and space sampling

In [None]:
# %% two quiet time periods that were found manually
list1 = np.arange(500,600)
list2 = np.arange(1900,2000)
list_all = np.concatenate((list1, list2))

# %% reasonable acquisition time period
# NOTE: this assignment replaces the quiet-period selection built just above;
# comment it out to restrict the QC to list1 + list2.
list_all = np.arange(28,2468)

# One slot per selected file for each metadata field that is checked.
num_file = len(list_all)
gauge_length_all = np.zeros(num_file, dtype=np.float64)
delta_space_all = np.zeros(num_file, dtype=np.float64)
num_channel_all = np.zeros(num_file, dtype=np.float64)
sample_rate_all = np.zeros(num_file, dtype=np.float64)
num_sample_all = np.zeros(num_file, dtype=np.float64)

for i, j in enumerate(list_all):
    with h5py.File(file_path[j], 'r') as f:
        acquisition = f['Acquisition']
        raw = acquisition['Raw[0]']
        gauge_length_all[i] = acquisition.attrs['GaugeLength']
        delta_space_all[i] = acquisition.attrs['SpatialSamplingInterval']
        sample_rate_all[i] = raw.attrs['OutputDataRate']
        num_channel_all[i] = raw.attrs['NumberOfLoci']
        # Take the sample count from the dataset shape instead of reading the
        # whole time vector into memory just to measure its length.
        num_sample_all[i] = raw['RawDataTime'].shape[0]

# %% exclude files of which length is not 120,000
ind_good = np.where(num_sample_all == 120000)[0]
gauge_length_all = gauge_length_all[ind_good]
delta_space_all = delta_space_all[ind_good]
sample_rate_all = sample_rate_all[ind_good]
num_channel_all = num_channel_all[ind_good]
num_sample_all = num_sample_all[ind_good]
list_all = list_all[ind_good]

print(f'good acqusition for {len(ind_good)} minutes')

In [None]:
# %% Check whether the acquisition times are continuous
# Restrict the file names / time stamps to the indices kept in list_all and
# rebuild the matching full paths.
# NOTE: this re-indexes file_list and acqu_time in place, so running the cell
# a second time applies list_all to the already reduced arrays.
file_list, acqu_time = file_list[list_all], acqu_time[list_all]
file_path = [os.path.join(data_dir, fname) for fname in file_list]

# Scatter of acquisition time (minute resolution) against original file index.
plt.close('all')
fig, ax = plt.subplots(1, 1, figsize=(13, 2.5), constrained_layout=True)
ax.scatter(
    list_all,
    acqu_time.astype('datetime64[m]'),
    marker='o',
    s=0.1,
    edgecolors='k',
)

## 0-36 hours

In [None]:
# --- channel selection ---
start_ch = 44                        # first channel of the range
end_ch = 94                          # end of the channel range
ch_id = 33                           # channel picked for visualization

# --- which 1-min files to merge ---
start_minutes = 0                    # index of the first file to read
num_minutes = 2160                   # number of semi-continuous 1-min files to merge
num_seconds = int(60 * num_minutes)  # merged duration in seconds

# --- decimation applied while reading ---
dsamp_factor = 20                    # downsampling factor for the raw time series
sample_rate = 2000 // dsamp_factor   # sample rate after downsampling (raw rate taken as 2000)

# --- tick layout used when plotting ---
shift_min = 7                        # offset of the first tick, in minutes
shift_sec = int(60 * shift_min)      # the same offset in seconds
file_inc = 360                       # one tick every {file_inc} 1-min files
time_inc = int(60 * file_inc)        # tick spacing in seconds

### The next 3 cells read the data, slowly

In [None]:
# %% read the raw data files
# %% could be the most time-consuming step
# %% Do Not run this cell after the FIRST time
# %% instead, run the next cell
since = time.time()

# %% read and decimate many files in parallel
num_proc = 20   # number of worker processes (too many exhausts the memory)
partial_func = partial(read_decimate, dsamp_factor=dsamp_factor, start_ch=start_ch, end_ch=end_ch)
with Pool(processes=num_proc) as pool:   # the pool is shut down when the block exits
    print("# threads: ", num_proc)
    full_time = pool.map(partial_func, file_path[start_minutes:start_minutes+num_minutes])

# %% concatenate the per-file arrays along axis 1
full_time_data = np.concatenate(full_time, axis=1)

# %% start time of every merged file, as float epoch seconds
# `time_stamp` was written to the HDF5 file below without ever being defined,
# so a fresh run stopped with a NameError. It is rebuilt here from acqu_time.
# One extra entry (the file after the last merged one, when it exists) is kept
# so the plotting cell can label the right-hand edge of the record.
# NOTE(review): confirm this is the content originally intended for "timestamp".
time_stamp = np.array(
    [UTCDateTime(t).timestamp
     for t in acqu_time[start_minutes:start_minutes + num_minutes + 1]],
    dtype=np.float64,
)

print(f'time used: {time.time()- since:.1f}')
print(f'final shape: {full_time_data.shape}')
print(f'sample rate: {sample_rate:.0f}')

# %% save it to HDF5 for future use
# Note that the dataset named "dt" stores the sample rate.
datah5 = '/fd1/QibinShi_data/England_farm/farmDAS_harper_0_24hr.hdf5'
with h5py.File(datah5, 'w') as f:
    f.create_dataset("data", data=full_time_data)
    f.create_dataset("timestamp", data=time_stamp)
    f.create_dataset("dt", data=sample_rate)

In [None]:
# %% Use this cell on every run after the first one:
# %% it reloads the merged record from the saved HDF5 file.
datah5 = '/fd1/QibinShi_data/England_farm/farmDAS_harper_0_24hr.hdf5'
with h5py.File(datah5, 'r') as h5:
    time_stamp = h5["timestamp"][:]
    sample_rate = h5["dt"][()]        # the "dt" dataset holds the sample rate
    full_time_data = h5["data"][:]

# Convert the stored time stamps back to UTCDateTime objects.
acqu_time = np.array([UTCDateTime(stamp) for stamp in time_stamp])

# Total duration of the merged time series: samples along axis 1 / sample rate.
nsec = int(full_time_data.shape[1] / sample_rate)

In [None]:
# %% plot data in time-space
# NOTE(review): two axes are created but only ax[0] is drawn on in this cell.
fig, ax = plt.subplots(2, 1, figsize=(16, 4), constrained_layout=True)
# Symmetric colour limits from the 80th percentile of the absolute amplitude.
max_amp = np.percentile(np.fabs(full_time_data), q=80)
# plt.cm.get_cmap is deprecated and removed in recent Matplotlib releases;
# pyplot.get_cmap returns the same colormap.
cmap = plt.get_cmap('RdBu')

x=np.arange(full_time_data.shape[1])
y=np.arange(full_time_data.shape[0])
ax[0].pcolormesh(x, y, full_time_data, shading='auto', vmin=-max_amp, vmax=max_amp, cmap=cmap)

# Seven evenly spaced time ticks, labelled with the start time of the 1-min
# file at each position. The last position (nsec/60) can be one past the final
# entry of acqu_time, so the index is clipped to stay in range.
tick_file_idx = np.linspace(0, nsec/60, 7).astype(int)
tick_file_idx = np.minimum(tick_file_idx, len(acqu_time) - 1)
ax[0].set_xticks(np.linspace(0,nsec*sample_rate,7))
ax[0].set_xticklabels(acqu_time[tick_file_idx].astype('datetime64[m]'))

# NOTE(review): 3.1904762684013206 is presumably the channel spacing in metres
# -- confirm against the SpatialSamplingInterval attribute.
ax[0].set_yticks(np.arange(0, 50, 10))
ax[0].set_yticklabels(np.round(np.arange(0, 50, 10)*3.1904762684013206, decimals=1))
ax[0].set_ylabel('Distance (m)'); ax[0].set_title('Full continuous data')

# Cross correlation

## Visualize example data

In [None]:
# Read one 1-min file (index 100) together with its acquisition metadata.
start_ch, end_ch = 44, 94
with h5py.File(file_path[100],'r') as f:
    acquisition = f['Acquisition']
    raw = acquisition['Raw[0]']
    gauge_len = acquisition.attrs['GaugeLength']
    delta_space = acquisition.attrs['SpatialSamplingInterval']
    sample_rate  = raw.attrs['OutputDataRate']
    num_channel = raw.attrs['NumberOfLoci']
    # Sample count from the dataset shape; no need to load the time vector.
    num_sample  = raw['RawDataTime'].shape[0]
    # Transposed so that channels run along axis 0 and samples along axis 1.
    minute_data = raw['RawData'][:num_sample, start_ch:end_ch].T

    delta_time = 1.0 / sample_rate   # sampling interval in seconds


print(f'1-min data with shape: {minute_data.shape}')
print('-'*12)
print(f'Gauge length (m): {gauge_len}')
print(f'Channel spacing (m): {delta_space}')
print(f'Num channels: {num_channel}')
print(f'Total cable length (m): {delta_space * num_channel}')
print('-'*12)
# This line used to print delta_time (the sampling interval) under the
# "Sampling rate (Hz)" label; it now prints the rate itself.
print(f'Sampling rate (Hz): {sample_rate}')
print(f'Num samples: {num_sample}')
print(f'Total duration {delta_time * num_sample:.2f}')

# Fixed colour limit (a median-based value used to be computed here and was
# immediately overwritten by this constant).
max_amp = 30
plt.figure(figsize = (10, 5), dpi = 100)
plt.imshow(minute_data, aspect = 'auto', cmap = 'RdBu', vmax = max_amp, vmin = -max_amp, origin='lower')
plt.ylabel("Distance (m)", fontsize = 20)
plt.xlabel("Time (s)", fontsize = 20)
# Tick positions are sample / channel indices; labels convert them to seconds
# and to distance along the selected channel range.
plt.xticks(np.linspace(0, minute_data.shape[1], 7), 
           [i/sample_rate for i in np.linspace(0, minute_data.shape[1], 7)])
plt.yticks(np.linspace(0, minute_data.shape[0], 6), 
           [int(i*delta_space) for i in np.linspace(0, minute_data.shape[0], 6)])
plt.colorbar()

## Configure parameters for NoisePy processing 

In [None]:
# --- frequency content ---
samp_freq = 200                # targeted sampling rate
freqmin = 5                    # lower corner of the pre-filter band
freqmax = 90                   # upper corner; must not exceed the Nyquist frequency

# --- normalization and correlation method ---
# freq_norm: 'no' (no whitening), 'rma' (running-mean average) or
#            'phase_only' (sign-bit normalization in the frequency domain)
# time_norm: 'no', 'rma' or 'one_bit' (normalization in the time domain)
# cc_method: 'xcorr' (pure cross-correlation) or 'deconv' (deconvolution)
# For "COHERENCY" set freq_norm='rma', time_norm='no' and cc_method='xcorr'.
freq_norm = 'phase_only'
time_norm = 'one_bit'
cc_method = 'xcorr'
smooth_N = 50                  # moving-window length (points) for time-domain normalization
smoothspect_N = 50             # moving-window length (points) for smoothing the spectrum amplitude
maxlag = 8                     # cross-correlation lags to keep (sec)

# --- data selection ---
# Threshold for discarding windows of bad signal.
# NOTE(review): this evaluates to 90; if the intent is "never discard", the
# value was possibly meant to be 10**9 -- confirm before changing it.
max_over_std = 10 * 9

# --- window length and step (sec): one whole file per window ---
cc_len = delta_time * num_sample
step = cc_len

# --- sub-array geometry ---
cha1, cha2 = start_ch, end_ch                # first / end channel index of the sub-array
cha_list = np.arange(cha1, cha2)
nsta = len(cha_list)
n_pair = nsta * (nsta + 1) // 2              # channel pairs, auto-correlations included
n_lag = 2 * maxlag * samp_freq + 1           # lag samples from -maxlag to +maxlag

# Parameter set handed to the DAS_module routines.
prepro_para = dict(
    freqmin=freqmin,
    freqmax=freqmax,
    sps=sample_rate,
    npts_chunk=cc_len*sample_rate,
    nsta=nsta,
    cha_list=cha_list,
    samp_freq=samp_freq,
    freq_norm=freq_norm,
    time_norm=time_norm,
    cc_method=cc_method,
    smooth_N=smooth_N,
    smoothspect_N=smoothspect_N,
    maxlag=maxlag,
    max_over_std=max_over_std,
)

## NoisePy pre-processing 

In [None]:
# Pre-process the example minute. minute_data was stored transposed in the
# example cell, so it is transposed back before being handed to NoisePy.
trace_stdS, dataS = DAS_module.preprocess_raw_make_stat(minute_data.T, prepro_para)

# Symmetric colour limits at one fifth of the median absolute amplitude.
max_amp = np.median(np.fabs(dataS)) / 5
colour_limits = {'vmin': -max_amp, 'vmax': max_amp}

plt.figure(figsize=(10, 5), dpi=100)
plt.imshow(dataS, aspect='auto', cmap='RdBu', origin='lower', **colour_limits)
plt.xlabel("Sample", fontsize=20)
plt.ylabel("Channel number", fontsize=20)
plt.colorbar()

In [None]:
# Normalize the example minute and keep the first half of the second axis.
white_spect = DAS_module.noise_processing(dataS, prepro_para)
Nfft = white_spect.shape[1]; Nfft2 = Nfft // 2
data = white_spect[:, :Nfft2]
del dataS, white_spect   # release the large intermediates

print(data.shape, data.dtype)

# Keep channels whose trace statistic is finite, positive and below the
# max_over_std threshold.
ind = np.where((trace_stdS < prepro_para['max_over_std']) &
               (trace_stdS > 0) &
               ~np.isnan(trace_stdS))[0]
if not len(ind):
    raise ValueError('the max_over_std criteria is too high which results in no data')
sta = cha_list[ind]
white_spect = data[ind]

# Mirror the shape/dtype print above for the selected channels (this line used
# to print the shape twice).
print(white_spect.shape, white_spect.dtype)

# Channel-wise correlation and stacking over time

In [None]:
# Number of files currently listed in file_path.
print(len(file_path))

# Index of the first file of every stack: one stack every 5 files, stopping
# 60 entries short of the end of list_all.
start_file = np.arange(start=0, stop=len(list_all) - 60, step=5)
print(start_file)

In [None]:
# For every stack start index: cross-correlate 10 consecutive 1-min files
# channel by channel, average the correlations and plot the result.
for istack in start_file:
    list_hour = np.arange(istack, istack + 10)   # the 10 file indices of this stack
    
    pbar = tqdm(list_hour)
    # Accumulated correlations (lag x pair) and number of files stacked per pair.
    corr_full = np.zeros([n_lag, n_pair], dtype = np.float32)
    stack_full = np.zeros([1, n_pair], dtype = np.int32)

    for imin in pbar:
        pbar.set_description(f"Processing {imin} th file")
        with h5py.File(file_path[imin],'r') as f:
            acquisition = f['Acquisition']
            raw = acquisition['Raw[0]']
            gauge_len = acquisition.attrs['GaugeLength']
            delta_space = acquisition.attrs['SpatialSamplingInterval']
            sample_rate  = raw.attrs['OutputDataRate']
            # Sample count from the dataset shape; avoids loading the whole
            # time vector of every file just to measure its length.
            num_sample  = raw['RawDataTime'].shape[0]
            minute_data = raw['RawData'][:num_sample, start_ch:end_ch]


        # perform pre-processing
        trace_stdS, dataS = DAS_module.preprocess_raw_make_stat(minute_data, prepro_para)

        # do normalization if needed
        white_spect = DAS_module.noise_processing(dataS, prepro_para)
        Nfft = white_spect.shape[1]; Nfft2 = Nfft // 2
        data = white_spect[:, :Nfft2]
        del dataS, white_spect

        # keep channels whose trace statistic is finite, positive and below
        # the max_over_std threshold
        ind = np.where((trace_stdS < prepro_para['max_over_std']) &
                       (trace_stdS > 0) &
                       ~np.isnan(trace_stdS))[0]
        if not len(ind):
            raise ValueError('the max_over_std criteria is too high which results in no data')
        sta = cha_list[ind]
        white_spect = data[ind]

        # loop over all stations
        for iiS in range(len(sta)):
            # smooth the source spectrum
            sfft1 = DAS_module.smooth_source_spect(white_spect[iiS], prepro_para)

            # correlate one source with all receivers
            corr, tindx = DAS_module.correlate(sfft1, white_spect[iiS:], prepro_para, Nfft)
            print(f"source channel index: {sta[iiS]}, correlation shape: {corr.shape}")

            # update the receiver list
            tsta = sta[iiS:]
            receiver_lst = tsta[tindx]

            # column of this source's first pair on the flattened pair axis:
            # the sum of (cha2 - cha1 - k) over every earlier source offset k
            iS = int((cha2*2 - cha1 - sta[iiS] + 1) * (sta[iiS] - cha1) / 2)

            # stacking one minute
            corr_full[:, iS + receiver_lst - sta[iiS]] += corr.T
            stack_full[:, iS + receiver_lst - sta[iiS]] += 1

    # Average over the stacked files; a pair that was never stacked is 0/0 = NaN.
    corr_full /= stack_full
    
    plt.figure(figsize = (12, 5), dpi = 200)
#     max_amp = np.median(np.fabs(corr_full[:, :(cha2 - cha1)])) * 10
    max_amp = 0.004
    # Lag samples 1200-2000 for the first (cha2 - cha1) pair columns.
    plt.imshow(corr_full[1200:2000, :(cha2 - cha1)].T, aspect = 'auto', cmap = 'RdBu', 
               vmax = max_amp, vmin = -max_amp, origin = 'lower')


    plt.yticks(np.linspace(cha1, cha2, 4) - cha1, 
                  [int(i*delta_space) for i in np.linspace(cha1, cha2, 4)], fontsize = 12)
    plt.xticks(np.arange(0, 800, 200), (np.arange(-400, 400, 200))/samp_freq, fontsize = 12)

    # linex=np.arange(400, 480, 10)
    # liney=((linex-400)/samp_freq*400) / delta_space
    # plt.plot(linex,liney,color='white', linestyle='dashed',linewidth=2)

    # Vertical line at column 400 of the displayed window, labelled as zero lag.
    plt.axvline(x = 400, color = 'k')

    plt.ylabel("Distance (m)", fontsize = 16)
    plt.xlabel("Time lag (sec)", fontsize = 16)
    plt.title(str(acqu_time[istack]), fontsize = 20)
    bar = plt.colorbar(format = lambda x, pos: '{:}'.format(x*100))
    bar.set_label('Cross-correlation Coefficient ($\\times10^{-2}$)', fontsize = 15)

#     plt.savefig('/data/whd01/qibin_data/farm_hour_stack_png/'+str(istack).zfill(4)+ '.pdf', dpi=150, format='pdf', bbox_inches='tight')

    # Render this stack's figure now and release it. Otherwise one 200-dpi
    # figure per stack stays open until the whole loop has finished.
    plt.show()
    plt.close()