### User input

In [None]:
# --- Input file -------------------------------------------------------------
# path of the input CSV file (note: this is a file, not a folder)
input_dir = 'C:/Users/cace0002/AbspectroscoPY/results/df_sr_sw_1.csv'
# column separator of the input file
sep = ';'
# decimal mark of the input file
decimal = '.'
# header row number
header = 0
# header of the date column
dateheadername = 'Timestamp'
# format of the dates
dateparsingformat = '%Y-%m-%d %H:%M:%S'

# --- Output -----------------------------------------------------------------
# output directory
output = 'C:/Users/cace0002/AbspectroscoPY/results/'
# name of the sample (used in the exported file names)
sample_name = 'sw'

# --- Periods ----------------------------------------------------------------
# starting date
timestart = '2018-11-13 05:02:00'
# dates used to split the dataset in periods
splitstrs = ['2018-11-19 00:00:00', '2018-11-24 12:00:00']
# ending date
timeend = '2018-12-04 08:44:00'

# --- Plot of the slope ratio time series (can be modified by the user) ------
# column to plot
sr_col = 'SR'
# format of the exported figure
fig_format = '.tiff'
# resolution of the exported figure
dpi = 300

### Start environment and import data

In [None]:
import os

# `output` may or may not end with a path separator; strip it first so the
# resulting folder paths never contain a doubled '/'.
_output_root = output.rstrip('/\\')
output_outliers = _output_root + '/sr_periods_outliers/'          # folder that will include the outlier files
output_no_outliers = _output_root + '/sr_periods_no_outliers/'    # folder that will include the files without outliers

# makedirs(exist_ok=True) creates missing parent folders as well and does not
# fail if the folder already exists (no separate exists() check needed).
for _folder in (output_outliers, output_no_outliers):
    os.makedirs(_folder, exist_ok=True)

In [None]:
import abspectroscopy_functions as abspy # Functions from the AbspectroscoPY toolbox
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, AutoMinorLocator)
from datetime import datetime as dt
import glob

# Read the input table with the first column as index. The configured decimal
# mark is passed too, so the file is parsed with the same convention that is
# used when the results are written back to CSV.
df = pd.read_csv(input_dir, sep = sep, decimal = decimal, header = header, index_col = 0)
df.index = pd.to_datetime(df.index)       # make sure time column (here index) is using time format
df

### outlier_id_drop_iqr

In [None]:
def outlier_id_drop_iqr(df_in, 
                        output_dir1,
                        output_dir2,
                        splitstrings,
                        timestart, 
                        timeend,
                        dateparsingformat,
                        label_name,
                        sr_col='SR',
                        csv_sep=None,
                        csv_decimal=None):
    '''
    function to split the slope ratio dataframe in different periods and identify the outliers on the basis of the
    interquartile range (IQR) in these periods; for every period one CSV file with the outliers and one without
    them are written
    :argument df_in: dataframe in input, indexed by datetime and containing the slope ratio column
    :argument output_dir1: directory where storing the dataframe without outliers
    :argument output_dir2: directory where storing the outliers
    :argument splitstrings: dates to use to split the dataset in periods
    :argument timestart, timeend: starting and ending date
    :argument dateparsingformat: format of the dates 
    :argument label_name: sample name (used in the names of the exported files)
    :argument sr_col: name of the slope ratio column (default 'SR')
    :argument csv_sep, csv_decimal: separator and decimal mark of the exported files; when None the
    module-level settings `sep` and `decimal` are used
    :return: the slope ratio dataframe in different periods, outliers included (dflist), lower and upper limits
    of the interquartile range used to detect the outliers (out1) and the number and percentage of outliers (out2)

    Each period is the half-open interval [start, end): a row stamped exactly at `timeend` is not part of the
    last period. A value is an outlier when it is strictly below Q1 - 1.5*IQR or strictly above Q3 + 1.5*IQR;
    values lying exactly on a limit are kept. Missing values are ignored when computing the quartiles and are
    written to neither file. The percentage of an empty period is NaN.
    '''
    import os                                                         # local import keeps the function self-contained

    # Fall back to the notebook-level export settings when none are given
    if csv_sep is None:
        csv_sep = sep
    if csv_decimal is None:
        csv_decimal = decimal

    ### SPLIT THE DATAFRAME INTO DIFFERENT PERIODS
    boundary_strs = [timestart] + list(splitstrings) + [timeend]      # add start and end dates to the split dates
    split_date = [dt.strptime(b, dateparsingformat) for b in boundary_strs] # convert to datetime format
    dflist = [df_in.loc[(df_in.index >= start) & (df_in.index < end)]
              for start, end in zip(split_date[:-1], split_date[1:])]

    ### CALCULATE THE LIMITS, COUNT AND EXPORT THE OUTLIERS PERIOD BY PERIOD
    limits = []                                                       # (low_lim, up_lim) per period
    counts = []                                                       # (number_outliers, outliers (%)) per period
    for i, df in enumerate(dflist):
        sr = df[sr_col]
        # Series.quantile replaces np.percentile(..., interpolation=...), whose
        # `interpolation` keyword is deprecated since NumPy 1.22; it also returns
        # NaN instead of raising when the period is empty.
        q1 = sr.quantile(0.25, interpolation='midpoint')
        q3 = sr.quantile(0.75, interpolation='midpoint')
        iqr = q3 - q1                                                 # interquartile range IQR
        low_lim = q1 - 1.5 * iqr                                      # lower and upper limits
        up_lim = q3 + 1.5 * iqr
        limits.append((low_lim, up_lim))

        # Both masks are built from the period's own column so they share the
        # index of `df`; the non-outlier mask is inclusive so that values equal
        # to a limit are not silently dropped from both outputs.
        df_out = df[(sr < low_lim) | (sr > up_lim)]
        df_noout = df[(sr >= low_lim) & (sr <= up_lim)]

        ntot = len(df)                                                # number of measurements
        nout = len(df_out)                                            # number of outliers
        counts.append((nout, nout / ntot * 100 if ntot else float('nan')))

        # save the not outliers and the outliers for the different periods
        df_noout.to_csv(os.path.join(output_dir1, 'df_sr_' + str(label_name) + '_' + '1_no_outliers' + str(i) + '.csv'),
                        sep=csv_sep, decimal=csv_decimal, index=True)
        df_out.to_csv(os.path.join(output_dir2, 'df_sr_' + str(label_name) + '_' + '1_outliers' + str(i) + '.csv'),
                      sep=csv_sep, decimal=csv_decimal, index=True)

    # Build the summary tables in one go instead of filling placeholder rows
    # through chained assignment, which pandas does not guarantee to write through.
    out1 = pd.DataFrame(limits, columns=['low_lim', 'up_lim'])
    out2 = pd.DataFrame(counts, columns=['number_outliers', 'outliers (%)'])

    return(dflist, out1, out2)

In [None]:
# Split the dataset into periods, detect the outliers of each period with the
# interquartile range and export the per-period CSV files.
dflist, out1, out2 = outlier_id_drop_iqr(
    df_in=df,
    output_dir1=output_no_outliers,
    output_dir2=output_outliers,
    splitstrings=splitstrs,
    timestart=timestart,
    timeend=timeend,
    dateparsingformat=dateparsingformat,
    label_name=sample_name,
)
# Show the limits and the outlier statistics of every period
print(
    'Lower and upper limits of the interquartile range:',
    '\n',
    out1,
    '\n',
    'outlier percentage:',
    out2,
)

In [None]:
# Hand the per-period dataframes and the export settings to the toolbox
# plotting helper (presumably plots the slope ratio time series with the
# outliers highlighted -- confirm against abspectroscopy_functions).
abspy.makeaoutplot(
    dflist,
    output_no_outliers,
    output,
    sr_col,
    timestart,
    timeend,
    dateparsingformat,
    splitstrs,
    sample_name,
)