# Create a dataframe of NTRL Lab Analysis Results month by month

In [None]:
# This program helps you generate the dataframe for each month per area of sampling

In [14]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from dateutil.parser import parse
from itertools import compress
from collections import OrderedDict
from datetime import date
import datetime
import re
from datetime import timedelta
import openpyxl
from sklearn.metrics import mean_squared_error
import plotly.express as px

#used in the analysis results data. cleans up the sampling time entry
#---------------------
#
def time_cleaner(x):
    """Reduce a sampling-time entry to an unpadded ``H:M:S`` string.

    The text is handed to ``dateutil.parser.parse`` and only the clock part of
    the parsed value is kept, so ``'07:00:00'`` and ``'2020-07-01 07:00:00'``
    both come back as ``'7:0:0'``.

    Parameters
    ----------
    x : str
        Text form of the sampling-time cell.

    Returns
    -------
    str
        Hour, minute and second joined with ``':'``, without zero padding.

    Notes
    -----
    Whatever ``parse`` raises for text it cannot interpret is propagated to
    the caller.
    """
    # An earlier revision also built a regex-filtered copy of ``x`` here, but
    # never used it; the result has always come from parsing ``x`` itself.
    t = parse(x)
    return '{}:{}:{}'.format(t.hour, t.minute, t.second)
#
#-------------------
#
def ave_filler(dfname, colname):
    """Fill gaps in one column with the mean of the neighbouring valid values.

    The column is first converted with ``pd.to_numeric(errors='coerce')``, so
    entries that are not numbers become NaN.  Every NaN is then replaced by the
    average of the previous valid value (forward fill) and the next valid value
    (backward fill).  A NaN with no valid value on one side has no average and
    stays NaN.

    Parameters
    ----------
    dfname : pandas.DataFrame
        Dataframe to repair.  It is modified in place.
    colname : hashable
        Label of the column to repair.

    Returns
    -------
    pandas.DataFrame
        The same ``dfname`` object that was passed in.

    Raises
    ------
    KeyError
        If ``colname`` is not a column of ``dfname``.
    """
    dfcn = dfname[colname]
    try:
        dfcn = pd.to_numeric(dfcn, errors='coerce')
    except (TypeError, ValueError):
        # Best effort: report and carry on with the unconverted column.
        print('error found in' + str(colname))
    try:
        # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
        neighbour_mean = (dfcn.ffill() + dfcn.bfill()) / 2
        dfname[colname] = dfcn.where(dfcn.notnull(), other=neighbour_mean)
    except (TypeError, ValueError):
        # Best effort: leave the column as it was and report it.
        print('nan not fixed' + str(colname))
    return dfname

#----------------------
#
#generate_df_from_analysis: this function creates dataframe by getting laboratory results by taking the excel values from
#    every row in a specified analysis column. An example of analysis column is the column for Zn AAS analysis for NTRL.
#    The critical part of this code is in making sure that the dates and time from the laboratory analysis excel file are
#    parsed and converted into the correct datetime format.
def generate_df_from_analysis(workbookTab, col_sampleDate, col_sampleTime, col_analysis, col_title,
                              currentmonthyr=None):
    """Build a DATETIME-indexed dataframe from one analysis column of a sheet.

    A row whose ``col_sampleDate`` cell holds a number starts a new day block;
    the rows up to (not including) the next such row belong to that day.  The
    last block ends at the last row whose ``col_sampleTime`` cell is a
    ``datetime.datetime``.  Rows with an empty analysis cell are skipped.

    Each kept row is dated ``currentmonthyr + (day number - 1) days`` and given
    its sampling time.  Rows sampled at exactly 0:00, 1:00, ... 5:00 are dated
    one calendar day later.

    Parameters
    ----------
    workbookTab : worksheet
        Sheet object exposing ``max_row`` and ``sheet['B12'].value`` style
        cell access (as openpyxl worksheets do).
    col_sampleDate : str
        Column letter holding the day number.
    col_sampleTime : str
        Column letter holding the sampling time.
    col_analysis : str
        Column letter holding the analysis results to collect.
    col_title : str
        Name given to the single data column of the result.
    currentmonthyr : datetime.date, optional
        First day of the month the sheet covers.  When omitted, a global named
        ``currentmonthyr`` is used if one exists (the name this function used
        to read implicitly).

    Returns
    -------
    pandas.DataFrame
        One column named ``col_title`` indexed by ``DATETIME``; empty when the
        sheet yields no rows.

    Raises
    ------
    NameError
        If ``currentmonthyr`` is neither passed nor defined as a global.

    Notes
    -----
    Errors raised while parsing a row's sampling time are not caught here.
    """
    if currentmonthyr is None:
        # Backward compatibility with callers that defined the month globally.
        try:
            currentmonthyr = globals()['currentmonthyr']
        except KeyError:
            raise NameError(
                "currentmonthyr was not passed and no global 'currentmonthyr' is defined"
            ) from None

    # Sampling times (in time_cleaner's output format) that are dated one
    # calendar day after the day number written in the sheet.
    plus_oneday_list = ('0:0:0', '1:0:0', '2:0:0', '3:0:0', '4:0:0', '5:0:0')

    def cell_value(column, row):
        return workbookTab[str(column) + str(row)].value

    def is_day_number(value):
        # bool subclasses int, so rule it out explicitly.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    # max_row is the 1-based index of the last row, so it must be included.
    all_rows = range(1, workbookTab.max_row + 1)

    # Rows that open a day block.
    Smpl_day_index = [row for row in all_rows if is_day_number(cell_value(col_sampleDate, row))]

    # Last row that carries a sampling time; it closes the final day block.
    smpl_end_index = None
    for row in all_rows:
        if isinstance(cell_value(col_sampleTime, row), datetime.datetime):
            smpl_end_index = row

    # Exclusive end row of every block: the next block's first row, or one past
    # the last timed row for the final block (an empty span if none exists).
    block_stops = Smpl_day_index[1:]
    if Smpl_day_index:
        if smpl_end_index is None:
            block_stops.append(Smpl_day_index[-1])
        else:
            block_stops.append(smpl_end_index + 1)

    Chem_Element_datelist = []
    Chem_Element_list = []
    for block_start, block_stop in zip(Smpl_day_index, block_stops):
        day_number = cell_value(col_sampleDate, block_start)
        for row in range(block_start, block_stop):
            chem_element = cell_value(col_analysis, row)
            if chem_element is None:
                continue
            sample_time = time_cleaner(str(cell_value(col_sampleTime, row)))
            if sample_time in plus_oneday_list:
                first_day = currentmonthyr
            else:
                first_day = currentmonthyr - datetime.timedelta(1)
            sample_day = first_day + datetime.timedelta(day_number)
            Chem_Element_datelist.append(parse(str(sample_day) + ' ' + sample_time))
            Chem_Element_list.append(chem_element)

    # Building the frame with an explicit index keeps an empty result valid.
    Chem_Element_df = pd.DataFrame(
        {col_title: Chem_Element_list},
        index=pd.DatetimeIndex(Chem_Element_datelist, name='DATETIME'),
    )
    return Chem_Element_df
#-----------------------------------------------------End----------------------------------------------------------------
#
#
#------------------------------------------------------------------------------------------------------------------------
#
def generate_add_trace(figname, df_elemental, axis_bool):
    """Add the first column of ``df_elemental`` to ``figname`` as a scatter trace.

    Parameters
    ----------
    figname : figure
        Object whose ``add_trace`` accepts a ``secondary_y`` keyword
        (presumably a plotly figure created with secondary-axis subplots;
        verify against the caller).
    df_elemental : pandas.DataFrame
        Its index supplies the x values and its first column the y values; the
        column label is used as the trace name.
    axis_bool : bool
        Passed through as ``secondary_y``.

    Returns
    -------
    None
        The figure is modified in place.
    """
    # Only plotly.express is imported at file level, so ``go`` is brought into
    # scope here to keep this function usable on its own.
    import plotly.graph_objects as go

    colname = df_elemental.columns[0]
    trace = go.Scatter(x=df_elemental.index, y=df_elemental[colname], name=colname)
    figname.add_trace(trace, secondary_y=axis_bool)
#
#----------------------------------------------------End-----------------------------------------------------------------
#
#
class LabResults():
    """One month's laboratory-analysis workbook together with its month.

    Parameters
    ----------
    monthyear : datetime.date
        First day of the month the workbook covers, for example
        ``datetime.date(2020, 7, 1)``.  Day numbers read from a sheet are added
        to it as day offsets.
    filePath : str
        Location of the ``.xlsx`` file.  Write Windows/UNC paths as raw strings
        (``r'...'`` prefix) so their backslashes are taken literally.

    Attributes
    ----------
    workbook
        The openpyxl workbook, loaded with ``data_only=True``.  The whole file
        is read into memory before it is parsed.
    monthyear
        The ``monthyear`` argument, stored unchanged.
    list_lab_index
        The column specification most recently passed to
        ``generate_df_from_analysis`` (set by that method).

    Notes
    -----
    A convenient naming pattern for instances is ``Jan_2020``.
    """

    # Sampling times (in time_cleaner's output format) that are dated one
    # calendar day after the day number written in the sheet.
    _PLUS_ONEDAY_TIMES = ('0:0:0', '1:0:0', '2:0:0', '3:0:0', '4:0:0', '5:0:0')

    def __init__(self, monthyear, filePath):
        import io
        import openpyxl

        # Read the file fully first so the handle is closed before parsing.
        with open(filePath, "rb") as f:
            in_mem_file = io.BytesIO(f.read())

        self.workbook = openpyxl.load_workbook(in_mem_file, data_only=True)
        self.monthyear = monthyear

    @staticmethod
    def _is_day_number(value):
        """Return True for int/float cell values (bool excluded: it subclasses int)."""
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def generate_df_from_analysis(self, tabname, list_lab_index):
        """Build a DATETIME-indexed dataframe from one analysis column of a tab.

        A row whose day-number cell holds a number starts a new day block; the
        rows up to (not including) the next such row belong to that day.  The
        last block ends at the last row whose sampling-time cell is a
        ``datetime.datetime``.  Rows with an empty analysis cell are skipped,
        and so are rows whose sampling time cannot be parsed.

        Each kept row is dated ``monthyear + (day number - 1) days`` and given
        its sampling time.  Rows sampled at exactly 0:00, 1:00, ... 5:00 are
        dated one calendar day later.

        Parameters
        ----------
        tabname : str
            Name of the worksheet tab, e.g. ``'104PU01'``, ``'105TK03'`` or
            ``'106TK01'``.
        list_lab_index : list
            Four items, e.g. ``['B', 'C', 'E', '104_Pb']``:
            column letter of the day number (1, 2, 3 ...), column letter of
            the sampling time, column letter of the analysis results, and the
            title to give the resulting data column.

        Returns
        -------
        pandas.DataFrame
            One column named after the title, indexed by ``DATETIME``; empty
            when the tab yields no rows.
        """
        workbookTab = self.workbook[tabname]
        self.list_lab_index = list_lab_index
        col_sampleDate = list_lab_index[0]
        col_sampleTime = list_lab_index[1]
        col_analysis = list_lab_index[2]
        col_title = list_lab_index[3]
        currentmonthyr = self.monthyear

        def cell_value(column, row):
            return workbookTab[str(column) + str(row)].value

        # max_row is the 1-based index of the last row, so it must be included.
        all_rows = range(1, workbookTab.max_row + 1)

        # Rows that open a day block.
        Smpl_day_index = [
            row for row in all_rows
            if self._is_day_number(cell_value(col_sampleDate, row))
        ]

        # Last row that carries a sampling time; it closes the final day block.
        smpl_end_index = None
        for row in all_rows:
            if isinstance(cell_value(col_sampleTime, row), datetime.datetime):
                smpl_end_index = row

        # Exclusive end row of every block: the next block's first row, or one
        # past the last timed row for the final block (empty span if none).
        block_stops = Smpl_day_index[1:]
        if Smpl_day_index:
            if smpl_end_index is None:
                block_stops.append(Smpl_day_index[-1])
            else:
                block_stops.append(smpl_end_index + 1)

        Chem_Element_datelist = []
        Chem_Element_list = []
        for block_start, block_stop in zip(Smpl_day_index, block_stops):
            day_number = cell_value(col_sampleDate, block_start)
            for row in range(block_start, block_stop):
                chem_element = cell_value(col_analysis, row)
                if chem_element is None:
                    continue
                try:
                    sample_time = time_cleaner(str(cell_value(col_sampleTime, row)))
                    if sample_time in self._PLUS_ONEDAY_TIMES:
                        first_day = currentmonthyr
                    else:
                        first_day = currentmonthyr - datetime.timedelta(1)
                    sample_day = first_day + datetime.timedelta(day_number)
                    stamp = parse(str(sample_day) + ' ' + sample_time)
                except (ValueError, TypeError, OverflowError):
                    # Best effort: a row whose time or date cannot be worked
                    # out is left out rather than aborting the whole tab.
                    continue
                Chem_Element_datelist.append(stamp)
                Chem_Element_list.append(chem_element)

        # Building the frame with an explicit index keeps an empty result valid.
        Chem_Element_df = pd.DataFrame(
            {col_title: Chem_Element_list},
            index=pd.DatetimeIndex(Chem_Element_datelist, name='DATETIME'),
        )
        return Chem_Element_df
#----------------------------------------------------End---------------------------------------------------------------
#
#
def join_DF(dfsList1):
    """Outer-join the dataframes in ``dfsList1`` on their index, left to right.

    The first dataframe is the starting point and each later one is joined
    onto the running result with ``how='outer'``.
    """
    merged = dfsList1[0]
    for frame in dfsList1[1:]:
        merged = merged.join(frame, how='outer')
    return merged
#
#----------------------------------------------------End-----------------------------------------------------------------
#
#
def append_monthsDF(dfsListmonthly): #takes the list of dataframes from monthly lab results of one sampling area
    """Stack monthly dataframes on top of each other, in list order.

    Parameters
    ----------
    dfsListmonthly : list of pandas.DataFrame
        Monthly lab-result dataframes of one sampling area.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the rows of every input.  If the result has a
        ``DATETIME`` column it becomes the index; otherwise the existing index
        is kept.  The inputs themselves are not modified.

    Raises
    ------
    IndexError
        If ``dfsListmonthly`` is empty.
    """
    if len(dfsListmonthly) == 0:
        raise IndexError('append_monthsDF needs at least one dataframe')
    # DataFrame.append no longer exists in pandas 2.x; one concat call stacks
    # all frames at once instead of re-copying the running result each time.
    df = pd.concat(list(dfsListmonthly))
    if 'DATETIME' in df.columns:
        df = df.set_index('DATETIME')
    return df
#
#
#
#------------------------------------------------------End-------------------------------------------------------------
#
#
def generate_combined_df_by_month(month_WB, sample_area, analysis_type):
    """Combine several analyses of one sampling area into one dataframe.

    Parameters
    ----------
    month_WB : LabResults
        Loaded workbook of the month; its ``generate_df_from_analysis`` is
        called once per entry of ``analysis_type``.
    sample_area : str
        Worksheet tab name, e.g. ``'106TK01'``.
    analysis_type : list
        Column specifications, each passed as the second argument of
        ``month_WB.generate_df_from_analysis``.

    Returns
    -------
    pandas.DataFrame
        The per-analysis dataframes outer-joined by ``join_DF``.

    Raises
    ------
    ValueError
        If ``analysis_type`` is empty.
    """
    df_List = [month_WB.generate_df_from_analysis(sample_area, spec) for spec in analysis_type]
    if not df_List:
        raise ValueError('analysis_type is empty; there is nothing to combine')
    # One join over the whole list is enough; repeating it per frame only
    # recomputed the same result.
    return join_DF(df_List)

#
#
#----------------------------------------------------End-----------------------------------------------------------------
#
#  sample area: e.g. '106TK01'
def generate_combined_df_by_month_modified(monthNumber, analysis_type_list, sample_area ):
    """Load one month's workbook and combine the requested analyses.

    Parameters
    ----------
    monthNumber : int
        1-based position of the month in the module-level
        ``currentmonthyr_list`` and ``filepath_list``.
    analysis_type_list : list
        Column specifications, one per analysis to extract.
    sample_area : str
        Worksheet tab name, e.g. ``'106TK01'``.

    Returns
    -------
    pandas.DataFrame
        Result of ``generate_combined_df_by_month`` for that month.

    Raises
    ------
    IndexError
        If ``monthNumber`` is below 1 or beyond the configured months.
    """
    if monthNumber < 1:
        # Without this check 0 or a negative number would silently index the
        # lists from the end and load the wrong month.
        raise IndexError('monthNumber must be 1 or greater, got ' + str(monthNumber))
    month_ = LabResults(currentmonthyr_list[monthNumber - 1], filepath_list[monthNumber - 1])
    return generate_combined_df_by_month(month_, sample_area, analysis_type_list)
#
#
#--------------------------------------------------End-------------------------------------------------------------------
#
#  This uses the generate_combined_df_by_month_modified function
def generate_df_of_combined_months(lastMonthNumber, sampleSource, sampleSourceAnalysisList):
    """Build one dataframe covering months 1 through ``lastMonthNumber``.

    Parameters
    ----------
    lastMonthNumber : int
        Last month to include (1-based, inclusive).
    sampleSource : str
        Worksheet tab name of the sampling area, e.g. ``'106TK01'``.
    sampleSourceAnalysisList : list
        Column specifications, one per analysis to extract.

    Returns
    -------
    pandas.DataFrame
        The monthly dataframes stacked by ``append_monthsDF``.

    Raises
    ------
    ValueError
        If ``lastMonthNumber`` is below 1.
    """
    if lastMonthNumber < 1:
        raise ValueError('lastMonthNumber must be 1 or greater, got ' + str(lastMonthNumber))
    monthly_frames = []
    for month_number in range(1, lastMonthNumber + 1):
        print(month_number)  # progress marker: each workbook takes a while to load
        monthly_frames.append(
            generate_combined_df_by_month_modified(month_number, sampleSourceAnalysisList, sampleSource)
        )
    # Stack once after the loop rather than re-stacking the growing list on
    # every iteration.
    return append_monthsDF(monthly_frames)
#
#--------------------------------------------------End-------------------------------------------------------------------

In [2]:
#monthyear: first day of each month that has a workbook
currentmonthyr_01 = datetime.date(2020,1,1)
currentmonthyr_02 = datetime.date(2020,2,1)
currentmonthyr_03 = datetime.date(2020,3,1)
currentmonthyr_04 = datetime.date(2020,4,1)
currentmonthyr_05 = datetime.date(2020,5,1)
currentmonthyr_06 = datetime.date(2020,6,1)
currentmonthyr_07 = datetime.date(2020,7,1)

# Position n-1 holds month n; must stay aligned with filepath_list below.
currentmonthyr_list = [currentmonthyr_01,
                       currentmonthyr_02,
                       currentmonthyr_03,
                       currentmonthyr_04,
                       currentmonthyr_05,
                       currentmonthyr_06,
                       currentmonthyr_07]

#filepath: workbook of each month (raw strings keep the backslashes literal)
filepath_01 = r'\\thgobna001\userdata\THPAL\Department\TAGANITO\SHARED\AnalysisResult\ANALYSIS RESULTS 2020\JANUARY 2020\1) JANUARY_2020.xlsx'
filepath_02 = r'\\thgobna001\userdata\THPAL\Department\TAGANITO\SHARED\AnalysisResult\ANALYSIS RESULTS 2020\FEBRUARY 2020\2) FEBRUARY _2020.xlsx'
filepath_03 = r'\\thgobna001\userdata\THPAL\Department\TAGANITO\SHARED\AnalysisResult\ANALYSIS RESULTS 2020\MARCH 2020\3) March _2020.xlsx'
filepath_04 = r'\\thgobna001\userdata\THPAL\Department\TAGANITO\SHARED\AnalysisResult\ANALYSIS RESULTS 2020\APRIL 2020\4) April _2020.xlsx'
filepath_05 = r'\\thgobna001\userdata\THPAL\Department\TAGANITO\SHARED\AnalysisResult\ANALYSIS RESULTS 2020\MAY 2020\5) MAY_2020.xlsx'
filepath_06 = r'\\thgobna001\userdata\THPAL\Department\TAGANITO\SHARED\AnalysisResult\ANALYSIS RESULTS 2020\JUNE 2020\6) June _2020.xlsx'
filepath_07 = r'\\thgobna001\userdata\THPAL\Department\TAGANITO\SHARED\AnalysisResult\ANALYSIS RESULTS 2020\7) July _2020.xlsx'

# Position n-1 holds month n; must stay aligned with currentmonthyr_list above.
filepath_list = [filepath_01,
                 filepath_02,
                 filepath_03,
                 filepath_04,
                 filepath_05,
                 filepath_06,
                 filepath_07]

#list_lab_index for 106TK01
# Each entry: [day-number column, sampling-time column, analysis column, output column title]

MS_TK01_Pb = ['B', 'C', 'E', '106TK01_Pb']
MS_TK01_Zn = ['B', 'C', 'F', '106TK01_Zn']
MS_TK01_Cu = ['B', 'C', 'G', '106TK01_Cu']
MS_TK01_Ni = ['B', 'C', 'H', '106TK01_Ni']
MS_TK01_Co = ['B', 'C', 'I', '106TK01_Co']
MS_TK01_Fe = ['B', 'C', 'J', '106TK01_Fe']
MS_TK01_Mn = ['B', 'C', 'K', '106TK01_Mn']
MS_TK01_Cr = ['B', 'C', 'L', '106TK01_Cr']
MS_TK01_Ca = ['B', 'C', 'M', '106TK01_Ca']
MS_TK01_Si = ['B', 'C', 'N', '106TK01_Si']
MS_TK01_Al = ['B', 'C', 'O', '106TK01_Al']
MS_TK01_Mg = ['B', 'C', 'P', '106TK01_Mg']
MS_TK01_pH = ['B', 'C', 'AH', '106TK01_pH']
MS_TK01_ORP = ['B', 'C', 'AI', '106_ORP']
MS_TK01_ZnNi = ['B', 'C', 'BC', '106_Zn/Ni']

MS_TK01_analysis_list = [MS_TK01_Pb,
                         MS_TK01_Zn,
                         MS_TK01_Cu,
                         MS_TK01_Ni,
                         MS_TK01_Co,
                         MS_TK01_Fe,
                         MS_TK01_Mn,
                         MS_TK01_Cr,
                         MS_TK01_Ca,
                         MS_TK01_Si,
                         MS_TK01_Al,
                         MS_TK01_Mg,
                         MS_TK01_pH,
                         MS_TK01_ORP,
                         MS_TK01_ZnNi]

In [5]:
#
# Build one dataframe holding the 106TK01 laboratory results of months 1
# through 7 (January to July 2020) and keep it in a variable.
# Every monthly workbook is opened in turn, so this cell takes a while to run.
#
MS_TK01_2020_jan_jul = generate_df_of_combined_months(7,'106TK01', MS_TK01_analysis_list)
#
# Save MS_TK01_2020_jan_jul to an Excel file so it can be reloaded later for plotting.
MS_TK01_2020_jan_jul.to_excel(r'C:\Users\v.t.flores\Documents\MS_TK01_2020_01_07_DF.xlsx')



In [10]:
# 
#
# list_lab_index entries for the 106VE04 tab.
# Each entry: [day-number column, sampling-time column, analysis column, output column title]
MS_VE04_Ni_soln = ['B', 'C', 'D', '106VE04_Ni_soln']
MS_VE04_Pb = ['B', 'C', 'S', '106VE04_Pb']
MS_VE04_Zn = ['B', 'C', 'T', '106VE04_Zn']
MS_VE04_Cu = ['B', 'C', 'U', '106VE04_Cu']
MS_VE04_Ni = ['B', 'C', 'V', '106VE04_Ni']
MS_VE04_Co = ['B', 'C', 'W', '106VE04_Co']
MS_VE04_Fe = ['B', 'C', 'X', '106VE04_Fe']
MS_VE04_Mn = ['B', 'C', 'Y', '106VE04_Mn']
MS_VE04_Cr = ['B', 'C', 'Z', '106VE04_Cr']
MS_VE04_Ca = ['B', 'C', 'AA', '106VE04_Ca']
MS_VE04_Si = ['B', 'C', 'AB', '106VE04_Si']
MS_VE04_Al = ['B', 'C', 'AC', '106VE04_Al']
MS_VE04_Mg = ['B', 'C', 'AD', '106VE04_Mg']
MS_VE04_S = ['B', 'C', 'AE', '106VE04_S']
MS_VE04_pH = ['B', 'C', 'AG', '106VE04_pH']
MS_VE04_ORP = ['B', 'C', 'AH', '106VE04_ORP']

# Every analysis to extract for 106VE04; the list order sets the column order
# of the combined dataframe.
MS_VE04_analysistype = [MS_VE04_Ni_soln,
                        MS_VE04_Pb,
                        MS_VE04_Zn,
                        MS_VE04_Cu,
                        MS_VE04_Ni,
                        MS_VE04_Co,
                        MS_VE04_Fe,
                        MS_VE04_Mn,
                        MS_VE04_Cr,
                        MS_VE04_Ca,
                        MS_VE04_Si,
                        MS_VE04_Al,
                        MS_VE04_Mg,
                        MS_VE04_S,
                        MS_VE04_pH,
                        MS_VE04_ORP]
#
#

In [15]:
#
# Build one dataframe holding the 106VE04 laboratory results of months 1
# through 7 (January to July 2020) and keep it in a variable.
# The month numbers printed below the cell show the loading progress.
#
MS_VE04_2020_jan_jul = generate_df_of_combined_months(7,'106VE04', MS_VE04_analysistype)
#
# Save MS_VE04_2020_jan_jul to an Excel file so it can be reloaded later for plotting.
MS_VE04_2020_jan_jul.to_excel(r'C:\Users\v.t.flores\Documents\MS_VE04_2020_01_07_DF.xlsx')



1
2
3
4
5
6
7
