In [2]:
from pathlib import Path
import re
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import trange, tqdm
from scipy import stats

In [41]:
class fast_processing_engine():
    """Class to take raw fast data from a given site and turn it into a standardized product.

    Intended call order::

        engine.find_fast_files()        # locate and sort the raw TOA5 files
        engine.metadata_template()      # allocate the per-file metadata table
        engine.get_fast_header_info()   # load the header changelog sheets
        engine.standardize_fast_files() # write gap-filled files + summary statistics

    All per-site state lives in ``self.site_info[site]``.
    """

    # Suffixes of the statistics stored for every data column of the summary table,
    # in the order in which they are written.
    _SUMMARY_STATS = ('Max', 'Min', 'Std', 'Mean', 'Skew', 'Kurt', 'NANPct')

    # Index-like columns that never get summary statistics.
    _NON_DATA_COLUMNS = ('TIMESTAMP', 'RECORD')

    def __init__(self, converted_dirs, metadata_fn, file_length, acq_freq, start_time, end_time, site_names, out_dir):
        """converted_dirs: list of directories containing raw converted TOA5 files, as strings. If multiple directories are provided, they will be combined into one output directory
        metadata_fn: string giving the filename of the metadata .xlsx file
        file_length: int, file length in minutes
        acq_freq: int, acquisition speed in Hz
        start_time, end_time: the start and end dates for the files to process in yyyy-mm-dd hh:mm format. Round DOWN to the nearest half-hour.
        site_names: list of site names to combine. Must be in the same order as converted_dirs
        out_dir: str, output/processing directory. Created (including parents) if it does not exist.

        Raises ValueError if site_names and converted_dirs differ in length and
        KeyError if a site name has no known header definition."""

        header_sheets = {
                "BB-NF17": "NF 17m 10Hz Code",
                "BB-NF3": "NF 3m 10Hz Code"
        }

        # The second sonic diagnostic column was originally a duplicate of "DIAG_CSAT3_NF17";
        # it is taken to belong to the NF7 sonic, matching the Ux/Uy/Uz/Ts column pairs.
        final_headers = {
            "BB-NF17": ["TIMESTAMP", "Ux_CSAT3_NF17", "Uy_CSAT3_NF17", "Uz_CSAT3_NF17", "Ts_CSAT3_NF17", "Ux_CSAT3_NF7", "Uy_CSAT3_NF7", "Uz_CSAT3_NF7", "Ts_CSAT3_NF7", "rho_c_LI7500_NF17", "rho_v_LI7500_NF17", "DIAG_CSAT3_NF17", "DIAG_CSAT3_NF7", "DIAG_LI7500_NF17"],
            "BB-NF3": ["TIMESTAMP", "Ux_CSAT3B_NF3", "Uy_CSAT3B_NF3", "Uz_CSAT3B_NF3", "Ts_CSAT3B_NF3", "rho_c_LI7500_NF3", "rho_v_LI7500_NF3", "DIAG_CSAT3_NF3", "DIAG_LI7500_NF3"]
        }

        # zip() would silently drop the unmatched tail, so check the pairing up front
        if len(site_names) != len(converted_dirs):
            raise ValueError("site_names and converted_dirs must have the same length "
                             f"(got {len(site_names)} and {len(converted_dirs)})")
        unknown_sites = [site for site in site_names if site not in header_sheets]
        if unknown_sites:
            raise KeyError(f"no header definition for site(s) {unknown_sites}; known sites: {sorted(header_sheets)}")

        self.site_info = {site:{"fns":None,
                                "file_tss":None,
                                "desired_file_tss":None,
                                "converted_path":Path(converted_dir),
                                "n_files_converted":None,
                                "rawfile_metadata":None,
                                "header_sheet":header_sheets[site],
                                "header_metadat":None,
                                "final_header":final_headers[site],
                                "summary":None
                               }
                         for site, converted_dir in zip(site_names, converted_dirs)}

        self.start_time = pd.to_datetime(start_time)
        self.end_time = pd.to_datetime(end_time)

        # convert acq freq to interval
        self.acq_freq = acq_freq
        self.acq_period = pd.Timedelta(f'{1000//self.acq_freq} ms')

        self.file_length = file_length
        self.n_records = self.file_length*self.acq_freq*60

        # start timestamps of the output files; identical for every site
        self.desired_file_tss = pd.date_range(self.start_time, self.end_time,
                                              freq=pd.Timedelta(minutes=self.file_length))

        # the summary table of the most recently standardized site (see standardize_fast_files)
        self.summary = None

        # convert files and directories to path objects and create an output directory
        self.out_path = Path(out_dir)
        self.metadata_path = Path(metadata_fn)
        self.out_path.mkdir(parents=True, exist_ok=True)

    def get_timestamp_from_fn(self, fn):
        """given a fast file, this will extract the timestamp in its name

        The name is expected to look like ``...Hz<id>_YYYY_MM_DD_HHMM.<ext>``: everything
        after 'Hz' is split on '_' and '.', the first piece (id) and the last piece
        (extension) are dropped, and the rest is parsed as a timestamp."""
        file_id = Path(fn).name.split('Hz')[1]
        file_start_str = "".join(re.split(r"_|\.", file_id)[1:-1])
        return pd.to_datetime(file_start_str, format="%Y%m%d%H%M")

    def get_fn_from_timestamp(self, file_start_ts, site):
        """given a timestamp, this will find the exact file name it's associated with

        Returns the first entry of the site's file list whose path contains
        ``YYYY_MM_DD_HHMM``, or None if there is no such file (or no file list yet)."""
        file_start_str = file_start_ts.strftime('%Y_%m_%d_%H%M')

        for fn in self.site_info[site]['fns'] or []:
            if file_start_str in str(fn):
                return fn
        return None

    def find_fast_files(self):
        """find all the raw data files that the user wants to process and place them in the site info dict

        For every site this fills 'desired_file_tss', 'fns', 'file_tss' and
        'n_files_converted'. 'fns' and 'file_tss' are parallel lists, sorted by timestamp
        and restricted to start_time <= timestamp <= end_time."""

        for info in self.site_info.values():
            # get the timestamps we want to see
            info['desired_file_tss'] = self.desired_file_tss

            # pair every raw file with the timestamp in its name and keep those in range
            in_range = []
            for fn in info['converted_path'].glob("TOA5*.dat"):
                fts = self.get_timestamp_from_fn(fn)
                if self.start_time <= fts <= self.end_time:
                    in_range.append((fts, fn))

            # sort by timestamp (file name breaks ties so the order is deterministic)
            in_range.sort(key=lambda pair: (pair[0], str(pair[1])))

            info['fns'] = [fn for _, fn in in_range]
            info['file_tss'] = [fts for fts, _ in in_range]
            info['n_files_converted'] = len(in_range)

    def metadata_template(self):
        """create a blank template to store raw file metadata in

        One row per raw file found by find_fast_files(), indexed by the file timestamp.
        Columns: the eight fields of the first line of a raw file, then 'File_name' and
        'out_fn' (filled in by standardize_fast_files)."""

        header_fields = ['Encoding', 'Station_name', 'Datalogger_model', 'Datalogger_serial_number', 'Datalogger_OS_version', 'Datalogger_program_name', 'Datalogger_program_signature', 'Table_name']

        for site, info in self.site_info.items():
            if info['n_files_converted'] is None:
                raise RuntimeError(f"no file list for site {site!r}: call find_fast_files() first")

            rawfile_metadata = pd.DataFrame(
                data=np.zeros((info['n_files_converted'], len(header_fields)), dtype=str),
                columns=header_fields
            )
            rawfile_metadata['File_name'] = info['fns']
            rawfile_metadata['TIMESTAMP'] = info['file_tss']
            # object dtype: the column starts out missing but will hold output paths
            rawfile_metadata['out_fn'] = pd.Series(np.nan, index=rawfile_metadata.index, dtype=object)

            info['rawfile_metadata'] = rawfile_metadata.set_index('TIMESTAMP')

    def summary_template(self, header):
        """create an empty summary statistics table for the given file header

        header: list of column names; 'TIMESTAMP' and 'RECORD' entries are ignored.

        Returns (summary_data, summary_header): summary_header holds, for every data
        column, one name per statistic (<col>_Max, _Min, _Std, _Mean, _Skew, _Kurt,
        _NANPct); summary_data is a NaN-filled array with one row per desired output
        file and one column per summary_header entry."""
        summary_header = [f'{colname}_{stat}'
                          for colname in header if colname not in self._NON_DATA_COLUMNS
                          for stat in self._SUMMARY_STATS]
        # NaN rather than np.empty so cells that are never written do not hold garbage
        summary_data = np.full((len(self.desired_file_tss), len(summary_header)), np.nan)
        return summary_data, summary_header

    def get_fast_header_info(self):
        """each site has a specially formatted header associated with it

        Reads the site's sheet from the metadata workbook, parses its 'Date_yyyymmdd'
        column into datetimes and stores the table under 'header_metadat'."""

        for site, info in self.site_info.items():
            print(site)
            metadat = pd.read_excel(self.metadata_path, sheet_name=info['header_sheet'])
            metadat['Date_yyyymmdd'] = pd.to_datetime(metadat['Date_yyyymmdd'], format='%Y%m%d')
            info['header_metadat'] = metadat.copy()

    def reorder_headers(self, site, date):
        """given a site name and a date, this will return the indices to rearrange the headers at.

        For example, if the raw file has the header
        a, b, c, e, g, d, f

        and the final file should have the header

        a, b, c, d, e, f, g

        then this will return

        [0, 1, 2, 4, 6, 3, 5]

        Not implemented yet: the original body was an empty `if`, and the layout of the
        header changelog sheet needed to write it is not known here."""
        raise NotImplementedError("reorder_headers has not been implemented yet")

    def _out_fn(self, dfts):
        """path of the standardized output file that starts at timestamp dfts"""
        return self.out_path / f"{pd.Timestamp(dfts).strftime('%Y_%m_%d_%H%M')}.csv"

    def write_out(self, dfts, dat):
        """write one standardized file to <out_dir>/YYYY_MM_DD_HHMM.csv and return its path

        NOTE(review): minimal implementation -- standardize_fast_files() called this method
        but it was never defined. The file name carries no site name, so several sites
        processed into the same out_dir overwrite each other's files; confirm the intended
        naming."""
        out_fn = self._out_fn(dfts)
        dat.to_csv(out_fn)
        return out_fn

    @staticmethod
    def _read_toa5(fn):
        """read one raw TOA5 file into a dataframe indexed by TIMESTAMP"""
        # line 0 is the file description, line 1 the column names, lines 2-3 are skipped
        rawdat = pd.read_csv(fn, sep=',', header=[0], skiprows=[0, 2, 3], na_values=['NAN', '-4400906'])
        # NOTE(review): this format requires fractional seconds on every row -- confirm
        # that the loggers never write whole-second timestamps without them
        rawdat['TIMESTAMP'] = pd.to_datetime(rawdat['TIMESTAMP'], format='%Y-%m-%d %H:%M:%S.%f')
        return rawdat.set_index('TIMESTAMP')

    def _column_stats(self, column):
        """max, min, std, mean, skewness, kurtosis and NaN percentage of one data column

        NaNs are ignored by the first six statistics; the percentage is relative to the
        nominal number of records per file. A column without valid values gives NaN for
        every statistic except the percentage."""
        values = np.asarray(column, dtype=float)
        missing = np.isnan(values)
        nan_pct = 100*missing.sum()/self.n_records

        valid = values[~missing]
        if valid.size == 0:
            return np.array([np.nan]*(len(self._SUMMARY_STATS) - 1) + [nan_pct])
        return np.array([
            valid.max(),
            valid.min(),
            valid.std(),
            valid.mean(),
            stats.skew(valid),
            stats.kurtosis(valid),
            nan_pct
        ])

    def standardize_fast_files(self):
        """reads in raw fast files, and combines/standardizes them to be continuous. Re-writes the files. Also computes diagnostic/summary statistics on the data

        Requires find_fast_files() and metadata_template() to have been run. For every
        site the summary table is stored under site_info[site]['summary']; self.summary
        holds the table of the last site processed.

        Raises RuntimeError if the prerequisites are missing and ValueError if a file
        does not have as many data columns as the site's final header."""

        file_period = pd.Timedelta(minutes=self.file_length)
        n_stats = len(self._SUMMARY_STATS)

        for site, info in self.site_info.items():
            if info['fns'] is None or info['rawfile_metadata'] is None:
                raise RuntimeError(f"site {site!r} is not prepared: call find_fast_files() and metadata_template() first")

            final_header = info['final_header']

            # initialize summary data
            summary_data, summary_header = self.summary_template(final_header)
            n_summary_cols = len(summary_header)//n_stats

            # raw files are consumed in timestamp order; `cursor` points at the next unused one
            pending = list(zip(info['file_tss'], info['fns']))
            cursor = 0

            # start processing files by desired start timestamp:
            # for each file time window (usually every half-hour), find all the converted raw files that will fit into that time window.
            # then, if there are any holes in the timeseries defined by those raw files, fill the missing timestamps and then fill the missing data
            # with NANs. Write the final file to a csv and report file summary statistics.
            for idfts, dfts in enumerate(tqdm(info['desired_file_tss'])):

                # generate the desired time index of the output file (starts one acquisition period after the file timestamp)
                desired_time_index = pd.date_range(dfts + self.acq_period, periods=self.n_records, freq=self.acq_period)
                dat = pd.DataFrame(desired_time_index, columns=['TIMESTAMP']).set_index('TIMESTAMP')

                # find qualified files for this time interval: every unused raw file that starts before the next window
                next_file_ts = dfts + file_period
                window = []
                while cursor < len(pending) and pending[cursor][0] < next_file_ts:
                    window.append(pending[cursor])
                    cursor += 1

                # combine the found files
                # if no valid files are found, just make a null dataframe
                if not window:
                    rawdat = pd.DataFrame(np.full((1, len(final_header)), np.nan), columns=final_header)
                    rawdat['TIMESTAMP'] = desired_time_index[0]
                    rawdat = rawdat.set_index('TIMESTAMP')

                # otherwise proceed as normal
                else:
                    desired_fn = self._out_fn(dfts)
                    frames = []
                    for ts, fn in window:
                        frames.append(self._read_toa5(fn))

                        # get the metadata: the first line of the file is a list of quoted, comma separated fields
                        with open(fn) as f:
                            file_info = f.readline().rstrip('\r\n')[1:-1].split('","')
                        info['rawfile_metadata'].loc[ts] = file_info + [fn, desired_fn]
                    rawdat = pd.concat(frames)

                # merge raw files into complete array
                # NOTE(review): the outer join keeps raw records that are not on the desired time grid -- confirm this is intended
                dat = dat.merge(rawdat, how='outer', left_index=True, right_index=True, sort=True)
                self.write_out(dfts, dat)

                # write summary stats
                # NOTE(review): columns are matched to the summary header by position, i.e. the raw
                # column order is assumed to equal final_header until reorder_headers is implemented
                data_cols = [colname for colname in dat.columns if colname not in self._NON_DATA_COLUMNS]
                if len(data_cols) != n_summary_cols:
                    raise ValueError(f"{site}, {dfts}: found {len(data_cols)} data columns but the final header has {n_summary_cols}")
                for icolname, colname in enumerate(data_cols):
                    summary_data[idfts, icolname*n_stats:(icolname + 1)*n_stats] = self._column_stats(dat[colname])

            summary = pd.DataFrame(summary_data, columns=summary_header)
            summary['TIMESTAMP'] = info['desired_file_tss']
            summary = summary.set_index('TIMESTAMP')

            info['summary'] = summary
            self.summary = summary

In [42]:
# --- run configuration -------------------------------------------------------
# sites to process and, in the same order, the directories holding their converted TOA5 files
site_names = ["BB-NF17"]
converted_dirs = ["/Volumes/TempData/Bretfeld Mario/Chimney-Park-Reprocessing-Sandbox/Alex Work/Bad/Chimney/EC Processing/BB-NF/Fast/17m/Converted"]

# metadata workbook and output/processing directory
metadata_fn = "/Volumes/TempData/Bretfeld Mario/Chimney/Site Information/Changelog_Alex_fieldnotes.xlsx"
out_dir = "/Volumes/TempData/Bretfeld Mario/Chimney-Park-Reprocessing-Sandbox/Alex Work/Bad/Chimney/EC Processing/BB-NF/Fast/17m/Standardized"

# file layout: length of one file in minutes and acquisition frequency in Hz
file_length = 30
acq_freq = 10

# processing window (yyyy-mm-dd hh:mm)
start_time = "2021-03-11 00:00"
end_time = "2021-03-11 11:00"

processor = fast_processing_engine(
    converted_dirs=converted_dirs,
    metadata_fn=metadata_fn,
    file_length=file_length,
    acq_freq=acq_freq,
    start_time=start_time,
    end_time=end_time,
    site_names=site_names,
    out_dir=out_dir,
)

In [44]:
# 1. locate the raw files inside the requested time window
processor.find_fast_files()

# 2. allocate the per-file metadata table
processor.metadata_template()

# 3. load the header sheet of every site from the metadata workbook
processor.get_fast_header_info()

# keep the BB-NF17 header table at hand for inspection
a = processor.site_info["BB-NF17"]["header_metadat"]

BB-NF17
