### User input

In [None]:
# --- Format of the input files ---
# decimal mark used in the input files
decimal = '.'
# field separator used in the input files
sep = ';'
# glob pattern of the files to import
filepattern = '*.csv'
# header row number
header_rownr = 1
# number of columns expected per file
ncol_expected = 223

# --- Locations ---
# input directory
input_dir = 'C:/Users/cace0002/AbspectroscoPY/data_scan_fp/'
# output directory
output = 'C:/Users/cace0002/AbspectroscoPY/results/'

# --- Date column ---
# possible headers of the date column
possibledateheadernames = [
    'Date',
    'Time',
    'Date/Time',
    'Timestamp',
    'Measurement interval=120[sec] (Export-Aggregation disabled)',
]
# strptime format of the dates
dateparsingformat = '%Y-%m-%d %H:%M:%S'

# --- Clean-up and labelling ---
# name of the column to drop; use a list of labels if more than one column
# needs to be dropped (['variable_name1','variable_name2'])
drop_c1 = 'Status (Source:0)'
# name of the sample
sample_name = 'sw'

### Start environment 

In [None]:
import abspectroscopy_functions as abspy # Functions from the AbspectroscoPY toolbox
import pandas as pd

### Define and run functions prior to abs_read

In [None]:
# Functions besides the ones reported in "abspectroscopy_functions.py":
def remove_parentheses(input_dir, rownr, removeparentheses):
    '''
    function to read one row of a text file, replace its ";" separators with tabs
    and optionally remove parentheses (together with their content)

    :argument input_dir: path of the file to read (it is opened as a file, not as a directory)
    :argument rownr: number of the row to return, counting from 1
    :argument removeparentheses: if True, every "(...)" group is removed from the row
    :return: the processed row, including its line ending, or None if the file has fewer than rownr rows
    '''
    import re
    # The context manager closes the file even if reading raises.
    with open(input_dir, 'r') as indata:
        for i, line in enumerate(indata, start=1):
            if i == rownr:
                newline = line.replace(';', '\t')
                if removeparentheses:
                    newline = re.sub(r'\([^)]*\)', '', newline)
                return newline
    # The requested row does not exist in the file.
    return None

def check_headers_name_and_order(listoffiles, rownr=2):
    '''
        function to compare the header row of several files and print the outcome of two checks:
        first, whether the raw header rows are identical; second, whether the header rows are
        identical once the text within parentheses has been removed (same position of the headers)

        :argument listoffiles: A list of files whose header rows are compared against each other
        :argument rownr: The row number containing the headers
        :return: void
    '''
    headers_stripped = set()
    headers_raw = set()

    # Collect the distinct header rows, without and with the text in parentheses.
    for path in listoffiles:
        stripped = remove_parentheses(path, rownr, removeparentheses=True)
        raw = remove_parentheses(path, rownr, removeparentheses=False)
        headers_stripped.add(stripped)
        headers_raw.add(raw)

    # First check: the raw header rows must all be the same string.
    for header in headers_raw:
        print("****** FIRST TYPE OF CHECK *******")
        print(header)
    if len(headers_raw) <= 1:
        print("-- First check passed, i.e all headers are identical --")
    else:
        print(" !!! Warning: one, but probably, many missmatching strings in headers, !!!")
        print(" !!! Warning: this may result in loss of data !!!")
        print(" -- If second type of check pass, then it is ok to read files by order (not by header) --")

    # Second check: the header rows must match after dropping the parentheses.
    for header in headers_stripped:
        print("\n\n****** SECOND TYPE OF CHECK *******")
        print(header)
    if len(headers_stripped) <= 1:
        print("-- Second check passed i.e. all headers have the same position --")
    else:
        print(" !!! Warning: one, but probably, many missmatching order of headers !!!")

    print("\nFinished check")
    
def dateparse(x, fmt=None):
    '''
       function to convert a string to datetime using the strptime() function

       :argument x: date string to convert
       :argument fmt: strptime format of the date; if None, the module-level "dateparsingformat" is used
       :return: the corresponding datetime.datetime
       :raises ValueError: if x does not match the format
    '''
    # pd.datetime was deprecated in pandas 1.0 and removed in pandas 2.0,
    # so the standard-library class is used directly.
    from datetime import datetime
    if fmt is None:
        fmt = dateparsingformat
    return datetime.strptime(x, fmt)

def check_number_columns(listoffileswithpath, listoffilesnopath, sep, decimal, dateheadername, dateparse, ncol_expected):
    '''
       function to check if all the files have a specific number of columns; the files whose
       number of columns differs from the expected one are printed

        :argument listoffileswithpath: A list of files (including path) that will be read.
        :argument listoffilesnopath: The same list of files without path, used to label the printed result.
        :argument sep: Field separator.
        :argument decimal: Notation for decimal.
        :argument dateheadername: Header name for field containing dates.
        :argument dateparse: Function used to parse the dates in the expected format (e.g. "dateparse").
        :argument ncol_expected: The expected number of columns for indata files.
        :return: void.

        The header row number is read from the module-level variable "header_rownr".
    '''
    totfiles = len(listoffileswithpath)
    # One [number of columns, file name] row per file; the dataframe is built once at the end
    # because DataFrame.append was removed in pandas 2.0.
    rows = []
    for i, (file, fileshort) in enumerate(zip(listoffileswithpath, listoffilesnopath), start=1):
        print("Processing : "+str(i)+"/"+str(totfiles)+" "+ file)
        # NOTE(review): "date_parser" is deprecated in recent pandas releases in favour of
        # "date_format" - confirm against the pandas version in use.
        endf = pd.read_csv(filepath_or_buffer=file, sep=sep, header=header_rownr, index_col=1,
                           decimal=decimal, low_memory=False, parse_dates=[dateheadername],
                           date_parser=dateparse)
        # Move the index column back among the columns so that it is counted too.
        endf.reset_index(level=0, inplace=True)
        rows.append([len(endf.columns), fileshort])
    dfdiagnostic = pd.DataFrame(rows, columns=['nr_col', 'filename'])
    print('Different number of columns from the expected one:', dfdiagnostic.loc[dfdiagnostic["nr_col"] != ncol_expected])

In [None]:
# Get the list of files in input_dir with the pattern given by filepattern;
# the helper returns two lists, unpacked here as the files with and without path.
listoffileswithpath,listoffilesnopath = abspy.get_files_list(input_dir, filepattern)
# Display the list of files with path as the cell output.
listoffileswithpath

In [None]:
# Determine the name of the date column using a list of possible date column names.
# NOTE(review): header_rownr+1 presumably converts the 0-based pandas header index into a
# 1-based row number - confirm against abspy.guess_date_column.
dateheadername = abspy.guess_date_column(listoffileswithpath, possibledateheadernames, header_rownr+1)
# Display the detected column name as the cell output.
dateheadername

In [None]:
# Check if the headers of the files are identical (first check) and if they have the same
# position (second check). header_rownr is used as the pandas "header" argument elsewhere,
# while remove_parentheses counts rows from 1, hence the +1.
check_headers_name_and_order(listoffileswithpath, header_rownr+1)

In [None]:
# Check if all the files have the expected number of columns (ncol_expected);
# the files that do not are printed.
check_number_columns(listoffileswithpath, listoffilesnopath, sep, decimal, dateheadername, dateparse, ncol_expected)

### abs_read

In [None]:
def abs_read(listoffileswithpath,
             listoffilesnopath,
             header_rownr,
             dateheadername,
             drop_col):
    '''
    function to import a list of attenuation data files as function of time
    :argument listoffileswithpath: list of files including path (output of the function "get_files_list")
    :argument listoffilesnopath: list of files without path (output of the function "get_files_list")
    :argument header_rownr: header row number
    :argument dateheadername: name of the date column (output of the function "guess_date_column")
    :argument drop_col: label (or list of labels) of the useless columns to drop
    :return: dataframe with the attenuation data as function of time, indexed by "Timestamp"
    :raises KeyError: if there is no file to read, or if dateheadername or drop_col is not a column

    The separator, the decimal notation and the date parser are read from the module-level
    names "sep", "decimal" and "dateparse".
    '''
    totfiles = len(listoffileswithpath)
    # The files are collected in a list and concatenated once: DataFrame.append was removed
    # in pandas 2.0 and copied the growing dataframe at every call.
    frames = []
    for i, (file, _fileshort) in enumerate(zip(listoffileswithpath, listoffilesnopath), start=1):
        print("Processing : "+str(i)+"/"+str(totfiles)+" "+ file)
        # NOTE(review): "date_parser" is deprecated in recent pandas releases in favour of
        # "date_format" - confirm against the pandas version in use.
        endf = pd.read_csv(filepath_or_buffer=file, sep=sep, header=header_rownr, index_col=1,
                           decimal=decimal, low_memory=False, parse_dates=[dateheadername],
                           date_parser=dateparse)
        endf.reset_index(level=0, inplace=True)
        frames.append(endf)
    if not frames:
        # Same exception type as setting the index on an empty dataframe, with a clearer message.
        raise KeyError("no input file to read: cannot set '" + str(dateheadername) + "' as index")
    df = pd.concat(frames, ignore_index=True, sort=False)
    df = df.set_index(dateheadername)                # set the date as index
    df = df.drop(drop_col, axis=1)                   # drop useless columns
    df.index = df.index.rename('Timestamp')          # rename the index column as "Timestamp"
    return df

In [None]:
# Import the list of files into a single dataframe.
df = abs_read(listoffileswithpath, listoffilesnopath, header_rownr, dateheadername, drop_c1)
# Export the dataset to the output directory, using the same separator and decimal notation as the input.
df.to_csv(f"{output}df_{sample_name}.csv", sep=sep, decimal=decimal, index=True)
# Display the dataframe as the cell output.
df