### User input

In [None]:
# --- Input / output locations ---
# NOTE: despite its name, input_dir is the path of the input CSV *file* (it is passed to pd.read_csv).
input_dir = 'C:/Users/cace0002/AbspectroscoPY/results/df_nodup_sw.csv'
output = 'C:/Users/cace0002/AbspectroscoPY/results/'   # folder where the results are written

# --- Layout of the input file ---
sep = ';'                      # column separator
decimal = '.'                  # decimal mark
header = 0                     # row number of the header
dateheadername = 'Timestamp'   # header of the date column

# --- Sample description ---
sample_name = 'sw'             # name of the sample (used in the exported file name)
nsamples_per_hour = 30         # number of samples per hour

# --- Time shifts ---
# Offset between the sensor and the clock that is NOT caused by the end of Daylight Saving Time
tshift = '0 hours 28 min'
# Offset needed to compare the data from different sensors
tshift2 = '12 hours 00 min'

### Start environment and import data

In [None]:
import abspectroscopy_functions as abspy # Functions from the AbspectroscoPY toolbox
import tkinter as tk
from tkinter import messagebox
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, AutoMinorLocator)

# Read the input file with the separator AND the decimal mark declared in the user input,
# so that files using e.g. ',' as decimal mark are parsed as numbers instead of strings.
df = pd.read_csv(input_dir, sep = sep, decimal = decimal, header = header, index_col = 0)
df.index = pd.to_datetime(df.index)      # make sure time column (here index) is using time format
df_dst = df.copy()                       # working copy; stays unshifted unless tshift_dst is applied
df_dst

### Shift the dataset one hour forward from the moment Daylight Saving Time ends (starting at the second occurrence of the duplicated timestamps) until Daylight Saving Time starts (where the sensor time delta is 63 minutes)

In [None]:
def tshift_dst(df_in,
               dateheadername,
               nsamples_per_hour,
               dst_gap_minutes=63):
    '''
    function to shift the dataset in time one hour forward when the Daylight Saving Time ends
    The rows between the second occurrence of a block of duplicated timestamps and the row
    preceding a time jump of dst_gap_minutes are moved one hour forward.
    If there are neither duplicated timestamps nor such jumps, nothing is shifted.
    :argument df_in: dataframe in input, indexed by the date (the index must be named dateheadername)
    :argument dateheadername: name of the date column
    :argument nsamples_per_hour: number of samples per hour
    :argument dst_gap_minutes: time between two consecutive samples (in minutes) that marks the start
                               of the Daylight Saving Time (default 63)
    :return: the dataframe shifted according to Daylight saving time, indexed and sorted by date;
             the helper column 'Time between samples' (minutes, computed before shifting) is retained
    :raises IndexError: if a block of duplicated timestamps holds nsamples_per_hour rows or fewer
    '''
    df_for_shift = df_in.reset_index()

    # Time between consecutive samples, in minutes. total_seconds() is used instead of
    # astype('timedelta64[m]'), which pandas >= 2.0 no longer accepts.
    gaps = df_for_shift[dateheadername] - df_for_shift[dateheadername].shift(1)
    df_for_shift['Time between samples'] = gaps.dt.total_seconds() / 60

    # Row positions of every timestamp that occurs more than once (all occurrences are kept)
    dst = df_for_shift[df_for_shift.duplicated(subset=[dateheadername], keep=False)].index

    # Identify where to start shifting: each chunk of 2*nsamples_per_hour duplicates holds the first
    # and the second pass through the repeated hour; the shift starts at the first row of the second pass.
    n = 2 * nsamples_per_hour
    dst_chunks = [dst[i * n:(i + 1) * n] for i in range((len(dst) + n - 1) // n)]
    shift_start = [sorted(chunk)[nsamples_per_hour] for chunk in dst_chunks]

    # Identify where to end shifting: the row just before each jump of dst_gap_minutes.
    # Other gaps might be due to stopping the data acquisition for sensor cleaning.
    jump_rows = df_for_shift.index[df_for_shift['Time between samples'] == dst_gap_minutes]
    shift_end = [row - 1 for row in jump_rows]

    # Pair up starts and ends:
    if shift_end and (not shift_start or shift_end[0] < shift_start[0]):
        # there is a jump first -> shift everything before it; the start has to go to the FRONT
        # of the list so that zip() below pairs it with that first jump
        shift_start.insert(0, 0)
    if shift_start and (not shift_end or shift_end[-1] < shift_start[-1]):
        # the last anomaly is a duplicate -> shift all the way to the end
        shift_end.append(df_for_shift.index[-1])

    timeshift = pd.Timedelta('1 hours')
    df_out = df_for_shift.copy()
    positions = df_out.index
    for i, (start, end) in enumerate(zip(shift_start, shift_end)):
        print('Step',i+1,'\n\tstart:',start,'\n\tend:  ',end)
        print('Shift interval:  ',df_out.loc[start,dateheadername],'to',df_out.loc[end,dateheadername])
        shiftslice = (positions >= start) & (positions <= end)       # rows whose position is in the range
        df_out.loc[shiftslice, dateheadername] = df_out.loc[shiftslice, dateheadername] + timeshift
        print('Shifting',sum(shiftslice),'rows')

    df_out.reset_index(inplace=True, drop=True)                      # reset the index to get a continuous index
    df_out.set_index([dateheadername], inplace=True, drop=False)     # set the date as index
    df_out.sort_index(axis=0, inplace=True)                          # sort by increasing index
    df_out = df_out.drop([dateheadername], axis=1)                   # drop the column that now is also in index
    return df_out

In [None]:
# Create a hidden, always-on-top Tk root so that only the message box is displayed
root = tk.Tk()
root.withdraw()
root.attributes("-topmost", True)

# Ask the user to decide whether the Daylight Saving Time shift is needed
MsgBox = messagebox.showwarning(
    'Warning',
    'Do the data need to be time-shifted by Daylight Saving Time? Input yes or no in the following cell.',
    icon='warning',
)
root.destroy()

In [None]:
# In this case the data were already continuous, so the Daylight Saving Time shift cannot be applied and is not needed.

In [None]:
# Keep asking until a valid answer is given: otherwise df_shifted would stay undefined
# and the following cells would fail with a NameError.
while True:
    answer = input().strip().lower()     # tolerate surrounding blanks and capital letters
    if answer == 'yes':
        df_dst = tshift_dst(df, dateheadername, nsamples_per_hour)
        df_dst.to_csv(output + 'df_dst.csv', sep = sep, decimal = decimal, index = True) # export the dst time shifted dataset
        df_shifted = df_dst.copy()
        break
    if answer == 'no':
        df_shifted = df_dst.copy()       # continue with the unshifted copy of the input data
        break

    # Invalid answer: show an error box on top of the other windows, then prompt again
    root = tk.Tk()
    root.withdraw()
    root.attributes("-topmost", True)

    MsgBox = tk.messagebox.showerror ('Error','You are not providing one of the two possible answers. Please input "yes" or "no".',icon = 'error')
    root.destroy()

In [None]:
# Put the rows in chronological order (ascending index)
df_shifted.sort_index(inplace=True)

### Shift the dataset in time if the sensor clock differs from the actual time for reasons other than the end of Daylight Saving Time

In [None]:
# Move every timestamp by the sensor/clock offset given in tshift
timeshift = pd.Timedelta(tshift)
df_shifted.index += timeshift
df_shifted

### To compare the surface-water sensor data with the data from the sensors inside the plant, shift the time one hour forward and account for the time the surface water needs to reach the treatment step (e.g. 11 hours)

In [None]:
# Move every timestamp by the offset given in tshift2
timeshift2 = pd.Timedelta(tshift2)
df_shifted.index += timeshift2

# Export the time shifted dataset
df_shifted.to_csv(
    f'{output}df_shifted_{sample_name}.csv',
    sep=sep,
    decimal=decimal,
    index=True,
)
df_shifted