### User input

In [None]:
decimal = '.'                                                            # decimal separator of the csv files
sep = ';'                                                                # column separator of the csv files

from config import *                                                     # personal settings of the local user; must provide input_directory and output_directory. NOTE: a star import can overwrite any name assigned above it
input_dir = input_directory + 'results/df_nonan_sw.csv'                  # full path of the input csv file (input_directory is concatenated as is)
output = output_directory + 'results/'                                   # directory where the results are exported (output_directory is concatenated as is)

dateheadername = 'Timestamp'                                             # header of the date column
header = 0                                                               # row number of the header in the input file
sample_name = 'sw'                                                       # name of the sample, used in the names of the exported files

# To plot the duplicates, the user can modify:
col_sel = '220 nm'                                                       # column to plot
timestart = '2018-11-12 16:04:00'                                        # starting date of the plot
timeend = '2018-12-03 20:44:00'                                          # ending date of the plot
title =  'duplicates_'                                                   # title of the exported figure
fig_format = '.tiff'                                                     # format of the exported figure (not referenced in the cells below; presumably read by the plotting helper - TODO confirm)
dpi = 300                                                                # resolution of the exported figure (not referenced in the cells below; presumably read by the plotting helper - TODO confirm)

### Start environment and import data

In [None]:
import abspectroscopy_functions as abspy # Functions from the AbspectroscoPY toolbox
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, AutoMinorLocator)

# Read the input file with the separators chosen in the user input. The decimal
# separator is passed explicitly so that files written with e.g. ',' as decimal
# are parsed as numbers instead of text.
df = pd.read_csv(input_dir, sep = sep, decimal = decimal, header = header, index_col = 0)
df.index = pd.to_datetime(df.index)      # make sure time column (here index) is using time format
df

### dup_check

In [None]:
def dup_check(df_in,
             dateheadername):
    '''
    function to report the duplicated rows, first by all columns and then by the date column only
    :argument df_in: dataframe in input, with the date stored as a regular column
    :argument dateheadername: name of the date column
    :return: three dataframes: the rows duplicated by all columns, the input without
             those rows, and the remaining rows that are duplicated by dateheadername
    '''
    # rows repeating an earlier row in every column (this may also include data recorded
    # when DST reverts to STD, in case the measured values are identical)
    full_dups = df_in[df_in.duplicated()]
    print('The sensor has the following duplicates by all the columns:', full_dups)

    # keep only the first occurrence of each fully identical row
    deduplicated = df_in.copy().drop_duplicates(keep = "first")

    # among the remaining rows, flag those whose date repeats an earlier one (e.g. DST)
    candidates = deduplicated.copy()
    date_dups = candidates[candidates.duplicated(subset = [dateheadername], keep = 'first')]

    if not date_dups.empty:
        print('Check if the sensor does not take into account the DST (i.e., it follows the clock, and therefore duplicates by datetime appear when DST reverts to STD): consider if to drop real measurements', date_dups)
    else:
        print('The sensor takes into account the DST (the data are continuous): there is no risk to drop real measurements when dropping duplicates only by dateheadername')
    return (full_dups, deduplicated, date_dups)

In [None]:
df_dup = df.reset_index()                                                # dup_check expects the date as a regular column, not as index
df_dup_all, df_nodup_all, df_dupdst = dup_check(df_dup, dateheadername)
df_dup_all.to_csv(output + 'duplicates_allcol_' + str(sample_name) + '.csv', sep = sep, decimal = decimal, index=False) # export duplicates by all columns
df_dupdst.to_csv(output + 'duplicates_dst_' + str(sample_name) + '.csv', sep = sep, decimal = decimal, index=False)    # export duplicates probably linked to DST reverting to STD
df_dup_all = df_dup_all.set_index(dateheadername)                        # date back as index for plotting; use the user setting instead of a hard-coded column name

### Plot duplicates

In [None]:
#%matplotlib inline # necessary if the notebook is not configured to use the inline backend by default
%matplotlib notebook
plt.ion()
abspy.makeaplot(df_dup_all, output, col_sel, timestart, timeend, sample_name, title) # Run twice if the plot looks too small

### Drop duplicates

In [None]:
def _notify_invalid_answer(message):
    '''
    function to show an error pop-up, skipped silently when no graphical display is available
    :argument message: text shown in the pop-up
    '''
    try:
        import tkinter as tk
        from tkinter import messagebox   # the submodule must be imported explicitly
    except ImportError:
        return
    try:
        root = tk.Tk()
    except tk.TclError:                  # e.g. notebook running on a machine without display
        return
    try:
        root.withdraw()
        root.attributes("-topmost", True)
        messagebox.showerror('Error', message)
    finally:
        root.destroy()


def dup_drop(df_in1,
             df_in2,
             output_dir,
             dateheadername,
             samplename,
             answer = None,
             csv_sep = None,
             csv_decimal = None
             ):
    '''
    function to drop duplicates
    :argument df_in1: dataframe in input with no duplicates by all columns
    :argument df_in2: dataframe in input with duplicates probably linked to DST reverting to STD
    :argument output_dir: directory where storing the results
    :argument dateheadername: name of the date column
    :argument samplename: name of the file
    :argument answer: 'yes' to drop also the rows of df_in2, 'no' to keep them; if None the user is asked interactively
    :argument csv_sep: separator of the exported file; if None the notebook-level variable sep is used
    :argument csv_decimal: decimal of the exported file; if None the notebook-level variable decimal is used
    :return: one dataframe without duplicates by all columns and with or without duplicates probably linked to DST reverting to STD
    :raises ValueError: if the answer is neither 'yes' nor 'no'
    '''
    if answer is None:
        answer = input('Drop also the duplicates by ' + str(dateheadername) + ' (probably linked to DST reverting to STD)? Please input "yes" or "no": ')
    answer = answer.strip().lower()                      # tolerate 'Yes', ' no ', ...

    if answer not in ('yes', 'no'):
        message = 'You are not providing one of the two possible answers. Please input "yes" or "no".'
        _notify_invalid_answer(message)
        # stop here: there is no dataframe to return, and continuing would only
        # fail later with a less helpful error
        raise ValueError(message)

    # fall back to the settings defined in the user-input cell of the notebook
    if csv_sep is None:
        csv_sep = sep
    if csv_decimal is None:
        csv_decimal = decimal

    if answer == 'yes':
        df_out = df_in1[~df_in1.index.isin(df_in2.index)]   # remove the rows whose index label is in df_in2 (duplicates by dateheadername)
        prefix = 'df_nodup_'                                 # dataframe with no duplicates
    else:
        df_out = df_in1.copy()
        prefix = 'df_nodupall_'                              # dataframe with no complete duplicates

    df_out = df_out.set_index(dateheadername)
    df_out.to_csv(output_dir + prefix + str(samplename) + '.csv', sep = csv_sep, decimal = csv_decimal, index = True)
    return df_out

In [None]:
# Start from the dataframe already cleaned of duplicates by all columns; the user then
# decides whether real measurements that share a date (DST reverting to STD) are dropped too.
df_nodup = df_nodup_all.copy()
df_nodup = dup_drop(
    df_nodup,
    df_dupdst,
    output,
    dateheadername,
    sample_name,
)
df_nodup