### Notes about this project:
#For further filtering: find the dates when redissolution occurred or when the flow transmitter isn't reliable
#Consider 2018 data for 105PU01 flow rate?

In [49]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from dateutil.parser import parse
from itertools import compress
from collections import OrderedDict
from datetime import date
import datetime
from datetime import timedelta
import re

## LOAD THE PI DATA and LABORATORY ANALYSIS RESULTS

In [33]:
#Sheet From Pi
#Step 1: Load the PI export for the polishing filters and the 105PU01 pumps.
# NOTE: the original call carried a stray positional `data` argument after the
#       keyword arguments, which is a SyntaxError; it has been removed.
df_Pi_FT01_PU01 = pd.read_excel(r'C:\Users\v.t.flores\Documents\PolishingPumpFlow_PDI_Pi.xlsx', sheet_name='Sheet1', index_col=False)
# Drop the first row of the sheet (presumably a units/description row - TODO confirm)
df_Pi_FT01_PU01 = df_Pi_FT01_PU01.drop(df_Pi_FT01_PU01.index[0])
# Use the DATETIME column as the index (reassigned rather than mutated in place)
df_Pi_FT01_PU01 = df_Pi_FT01_PU01.set_index('DATETIME')


## DECLARE THE FUNCTIONS

In [34]:
#fcn dfn
#used in the analysis results data. cleans up the sampling time entry
def time_cleaner(x):
    """Normalise a free-text sampling-time entry to an 'H:M:S' string.

    Keeps only digits, letters and ':' from ``x``, takes the first five kept
    characters, passes them through ``check_ending_char`` and
    ``check_starting_char`` (defined elsewhere, not visible here) and parses
    the result with ``dateutil``'s ``parse``.

    Parameters
    ----------
    x : str
        Raw time entry.

    Returns
    -------
    str
        ``hour:minute:second`` built from the parsed time, without zero padding.
    """
    # Fix: the character class used to be [0-8...], which silently dropped every
    # '9' from the entry (e.g. '09:30' became '0:30').
    time_regex = re.compile(r'[0-9a-zA-Z:]', re.IGNORECASE | re.VERBOSE | re.DOTALL)
    kept_chars = time_regex.findall(x)
    newtime = ''.join(kept_chars[0:5])
    f_time = check_ending_char(newtime)
    f_time1 = check_starting_char(f_time)
    t = parse(f_time1)
    new_str_time = str(t.hour) + ':' + str(t.minute) + ':' + str(t.second)
    return new_str_time


#fcn for formatting the dates from range str
def _split_date_range(x):
    """Split a '<start> to <end>' range string into its pieces.

    The whitespace-delimited word 'to' is preferred as the separator so that
    dates containing the letters 'to' (e.g. 'October') are not cut in half.
    When no such separator exists (e.g. '2020-07-01to2020-07-05') the
    original plain ``split('to')`` behaviour is used.
    """
    pieces = re.split(r'\s+to\s+', x)
    if len(pieces) < 2:
        pieces = x.split('to')
    return pieces

def datesplitter_start(x):
    """Return the parsed start date of a '<start> to <end>' range string."""
    return parse(_split_date_range(x)[0])
def datesplitter_end(x):
    """Return the parsed end date of a '<start> to <end>' range string.

    Raises IndexError when the string contains no 'to' separator.
    """
    return parse(_split_date_range(x)[1])
#------------------

def remove_no_good_data_N2Dry_Feed(df): #removes the string value and fills it with the last good value
    """Coerce a Series to numeric and forward-fill the values that could not be parsed.

    Parameters
    ----------
    df : pandas.Series
        Column that may contain non-numeric entries.

    Returns
    -------
    pandas.Series
        New numeric Series; unparsable/missing entries take the previous valid
        value. Leading entries with no previous valid value stay NaN.
    """
    # After coercion no element can be a str, so the former per-element
    # `type(...) == str` loop never fired; it also indexed the Series with
    # integers, which is label-based on non-integer indexes.
    numeric = pd.to_numeric(df, errors='coerce')
    # .ffill() replaces the deprecated fillna(method='ffill', inplace=True).
    return numeric.ffill()

#---------------------------
def replace_no_good_data_with_LastGoodValues(df_col):
    """Coerce a column to numeric and replace bad entries with the last good value.

    Parameters
    ----------
    df_col : pandas.Series
        Column that may contain text entries instead of numbers.

    Returns
    -------
    pandas.Series
        New numeric Series, forward-filled. Leading entries with no previous
        valid value stay NaN.
    """
    # Fix: the old loop tested `type(df_col[x] == str)` (misplaced parenthesis),
    # which is always truthy, so det_no_data() was called on every coerced
    # number. Coercion already turns every text entry into NaN, which is what
    # that loop was trying to achieve, so the loop is dropped.
    numeric = pd.to_numeric(df_col, errors='coerce')
    # .ffill() replaces the deprecated fillna(method='ffill', inplace=True).
    return numeric.ffill()
#--------------------------------------------
def replace_no_good_data_with_Zero(df_col):
    """Coerce a column to numeric and replace every unparsable/missing entry with 0.

    Parameters
    ----------
    df_col : pandas.Series
        Column that may contain text entries instead of numbers.

    Returns
    -------
    pandas.Series
        New numeric Series with no NaN values.
    """
    # The former per-element loop could never fire (no str survives the
    # coercion) and indexed the Series with integers, which is label-based on a
    # datetime index. A single vectorised fill gives the same result.
    numeric = pd.to_numeric(df_col, errors='coerce')
    return numeric.fillna(0)
#--------------------------------------------
#--------------------------------------------
def replace_neg_data_with_Zero(df_col):
    """Coerce a column to numeric and replace negative values with 0.

    NaN values (including entries that could not be parsed) are left as NaN.

    Parameters
    ----------
    df_col : pandas.Series or array-like
        Values to clean.

    Returns
    -------
    pandas.Series or numpy.ndarray
        Same container kind that ``pd.to_numeric`` returns for the input.
    """
    numeric = pd.to_numeric(df_col, errors='coerce')
    if isinstance(numeric, pd.Series):
        # Vectorised replacement; avoids integer get/set on a labelled index.
        return numeric.mask(numeric < 0, 0)
    # Lists/arrays come back from to_numeric as ndarrays.
    return np.where(numeric < 0, 0, numeric)
#--------------------------------------------

# Clean the Data Using the functions

In [75]:
# Columns that must hold numeric readings (tank level, pump current, pump flow, filter PDI).
df_Pi_FT01_PU01_toNumeric = ['105TK05_LVL', '105PU01A_CUR', '105PU01A_FLOW', '105FT01A_PDI',
       '105PU01B_CUR', '105PU01B_FLOW', '105FT01B_PDI', '105PU01C_CUR',
       '105PU01C_FLOW', '105FT01C_PDI', '105PU01D_CUR', '105PU01D_FLOW',
       '105FT01D_PDI', '105PU01E_CUR', '105PU01E_FLOW', '105FT01E_PDI',
       '105PU01F_CUR', '105PU01F_FLOW', '105FT01F_PDI', '105PU01G_CUR',
       '105PU01G_FLOW']
#
# Convert the cell values to numeric.
# No helper function is used here: coercing to numeric turns text entries such as
# 'No good values' into NaN, which makes them easy to catch later.
# Only columns that actually exist in the dataframe are converted.
for col in df_Pi_FT01_PU01.columns:
    if col in df_Pi_FT01_PU01_toNumeric:
        df_Pi_FT01_PU01[col] = pd.to_numeric(df_Pi_FT01_PU01[col], errors='coerce')
#
# Drop the July 30 to 31 dates because MS shutdown on this day?
df_Pi_FT01_filtered1 = df_Pi_FT01_PU01.drop(df_Pi_FT01_PU01.loc['2020-07-30':'2020-07-31'].index)
#
# Select the relevant columns: every numeric column except 105PU01G_FLOW (which the
# original selection also left out), followed by the selector indicators.
df_Pi_FT01_relevant_cols = (
    [col for col in df_Pi_FT01_PU01_toNumeric if col != '105PU01G_FLOW']
    + ['SEL_A', 'SEL_B', 'SEL_C', 'SEL_D', 'SEL_E', 'SEL_F']
)
df_Pi_FT01_filtered2 = df_Pi_FT01_filtered1[df_Pi_FT01_relevant_cols]
#
# Drop the last row of the df_Pi_FT01_filtered2 dataframe
df_Pi_FT01_filtered3 = df_Pi_FT01_filtered2.drop(df_Pi_FT01_filtered2.index[-1])
#
# DF_105PU01 is the cleaned frame that the per-pump dataframes are sliced from.
# Each pump gets:
#            105TK05 level,
#            pump current,
#            filter pdi,
#            pump flow,
#            selector indicator
# (The earlier intermediate alias `DF_105PU01 = df_Pi_FT01_PU01` was overwritten
#  here before ever being read, so it has been removed.)
DF_105PU01 = df_Pi_FT01_filtered3


In [None]:
# One dataframe per pump: tank level, pump current, filter PDI, pump flow, selector.
A_lst_DF = DF_105PU01[['105TK05_LVL','105PU01A_CUR','105FT01A_PDI', '105PU01A_FLOW', 'SEL_A']]
B_lst_DF = DF_105PU01[['105TK05_LVL','105PU01B_CUR','105FT01B_PDI', '105PU01B_FLOW', 'SEL_B']]
C_lst_DF = DF_105PU01[['105TK05_LVL','105PU01C_CUR','105FT01C_PDI', '105PU01C_FLOW', 'SEL_C']]
D_lst_DF = DF_105PU01[['105TK05_LVL','105PU01D_CUR','105FT01D_PDI', '105PU01D_FLOW', 'SEL_D']]
E_lst_DF = DF_105PU01[['105TK05_LVL','105PU01E_CUR','105FT01E_PDI', '105PU01E_FLOW', 'SEL_E']]
F_lst_DF = DF_105PU01[['105TK05_LVL','105PU01F_CUR','105FT01F_PDI', '105PU01F_FLOW', 'SEL_F']]
# NOTE(review): pump G is paired with filter F's PDI/flow/selector columns - confirm this is intended.
G_lst_DF = DF_105PU01[['105TK05_LVL','105PU01G_CUR','105FT01F_PDI', '105PU01F_FLOW', 'SEL_F']]


def _rows_with_selector_off(pump_df, selector_col):
    """Keep the rows whose selector indicator is 'OFF', then drop the last remaining row."""
    return pump_df[pump_df[selector_col] == 'OFF'][0:-1]


def _drop_unreliable_rows(pump_df, pump_letter):
    """Drop rows with non-positive current but non-zero flow, then rows with negative flow.

    Returns a tuple ``(cleaned_df, bad_current_rows, negative_flow_rows)``.
    Nothing is mutated in place, which avoids SettingWithCopy warnings on slices.
    """
    cur_col = '105PU01' + pump_letter + '_CUR'
    flow_col = '105PU01' + pump_letter + '_FLOW'
    bad_current_rows = pump_df[(pump_df[cur_col] <= 0) & (pump_df[flow_col] != 0)]
    pump_df = pump_df.drop(bad_current_rows.index)
    negative_flow_rows = pump_df[pump_df[flow_col] < 0]
    pump_df = pump_df.drop(negative_flow_rows.index)
    return pump_df, bad_current_rows, negative_flow_rows


#
# Select the rows where the selector indicator column is 'OFF', which means that the
#        pump is lined up to the polishing filter and not to pump G.
# Then drop the last row because it has error values in it (cause unknown).
A_lst_DF = _rows_with_selector_off(A_lst_DF, 'SEL_A')
B_lst_DF = _rows_with_selector_off(B_lst_DF, 'SEL_B')
C_lst_DF = _rows_with_selector_off(C_lst_DF, 'SEL_C')
D_lst_DF = _rows_with_selector_off(D_lst_DF, 'SEL_D')
E_lst_DF = _rows_with_selector_off(E_lst_DF, 'SEL_E')
# NOTE(review): the selector filter for pump F was already disabled in the original
# notebook; only the last row is dropped. Confirm whether that is intentional.
#F_lst_DF = F_lst_DF[F_lst_DF['SEL_F'] == 'OFF']
F_lst_DF = F_lst_DF[0:-1]

# Drop the rows where the current is zero or negative while a flow rate is indicated,
#   because this implies something is wrong with the pump reading. Such rows would
#   distort the linear regression model (it would fit theta to negative currents).
# Then drop the rows where the flow rate itself is negative.
A_lst_DF, A_ft_not_zero, A_ft_not_negFlow = _drop_unreliable_rows(A_lst_DF, 'A')
B_lst_DF, B_ft_not_zero, B_ft_not_negFlow = _drop_unreliable_rows(B_lst_DF, 'B')
C_lst_DF, C_ft_not_zero, C_ft_not_negFlow = _drop_unreliable_rows(C_lst_DF, 'C')
D_lst_DF, D_ft_not_zero, D_ft_not_negFlow = _drop_unreliable_rows(D_lst_DF, 'D')
E_lst_DF, E_ft_not_zero, E_ft_not_negFlow = _drop_unreliable_rows(E_lst_DF, 'E')
F_lst_DF, F_ft_not_zero, F_ft_not_negFlow = _drop_unreliable_rows(F_lst_DF, 'F')

In [106]:
# Keep the rows whose pump current lies between 30 and 100 amperes (both inclusive),
# then drop any row with NaN values.
PUMP_CURRENT_MIN_A = 30
PUMP_CURRENT_MAX_A = 100


def _keep_current_in_range(pump_df, pump_letter):
    """Return the rows of ``pump_df`` whose pump current is within the configured range."""
    current = pump_df['105PU01' + pump_letter + '_CUR']
    return pump_df[(current >= PUMP_CURRENT_MIN_A) & (current <= PUMP_CURRENT_MAX_A)]


# NOTE: each *_without_SEL frame is taken BEFORE the NaN rows are dropped (as in the
# original notebook), so it may still contain NaN values.
A_lst_DF = _keep_current_in_range(A_lst_DF, 'A')
A_lst_DF_without_SEL = A_lst_DF.drop('SEL_A', axis=1)
A_lst_DF = A_lst_DF.dropna(how='any')

B_lst_DF = _keep_current_in_range(B_lst_DF, 'B')
B_lst_DF_without_SEL = B_lst_DF.drop('SEL_B', axis=1)
B_lst_DF = B_lst_DF.dropna(how='any')

C_lst_DF = _keep_current_in_range(C_lst_DF, 'C')
C_lst_DF_without_SEL = C_lst_DF.drop('SEL_C', axis=1)
C_lst_DF = C_lst_DF.dropna(how='any')

D_lst_DF = _keep_current_in_range(D_lst_DF, 'D')
D_lst_DF_without_SEL = D_lst_DF.drop('SEL_D', axis=1)
D_lst_DF = D_lst_DF.dropna(how='any')

E_lst_DF = _keep_current_in_range(E_lst_DF, 'E')
E_lst_DF_without_SEL = E_lst_DF.drop('SEL_E', axis=1)
E_lst_DF = E_lst_DF.dropna(how='any')

F_lst_DF = _keep_current_in_range(F_lst_DF, 'F')
F_lst_DF_without_SEL = F_lst_DF.drop('SEL_F', axis=1)
F_lst_DF = F_lst_DF.dropna(how='any')

In [92]:
# Once the data has been filtered down to the dates where the flowmeter is trusted,
# write one Excel workbook per pump (105PU01 FT/PDI/level data).
pump_exports = (
    (A_lst_DF, r'C:\Users\v.t.flores\Documents\105PU01A_FT_PDI_lvl_data.xlsx'),
    (B_lst_DF, r'C:\Users\v.t.flores\Documents\105PU01B_FT_PDI_lvl_data.xlsx'),
    (C_lst_DF, r'C:\Users\v.t.flores\Documents\105PU01C_FT_PDI_lvl_data.xlsx'),
    (D_lst_DF, r'C:\Users\v.t.flores\Documents\105PU01D_FT_PDI_lvl_data.xlsx'),
    (E_lst_DF, r'C:\Users\v.t.flores\Documents\105PU01E_FT_PDI_lvl_data.xlsx'),
    (F_lst_DF, r'C:\Users\v.t.flores\Documents\105PU01F_FT_PDI_lvl_data.xlsx'),
)
for pump_frame, workbook_path in pump_exports:
    pump_frame.to_excel(workbook_path)

# Load these excel files as a starting point for the next data cleaning

#### PARAMETERS: CUR, PDI, TK05LVL

In [115]:
# Split each pump dataframe into its feature frame (x-set) and target series (y-set).
# These are the arguments passed to the linear-model helper functions later on.
def _split_features_and_target(pump_df, pump_letter):
    """Return (current/PDI/tank-level columns, flow column) for one pump."""
    feature_cols = [
        '105PU01{}_CUR'.format(pump_letter),
        '105FT01{}_PDI'.format(pump_letter),
        '105TK05_LVL',
    ]
    return pump_df[feature_cols], pump_df['105PU01{}_FLOW'.format(pump_letter)]


A_lst_xset, A_lst_Yset = _split_features_and_target(A_lst_DF, 'A')
B_lst_xset, B_lst_Yset = _split_features_and_target(B_lst_DF, 'B')
C_lst_xset, C_lst_Yset = _split_features_and_target(C_lst_DF, 'C')
D_lst_xset, D_lst_Yset = _split_features_and_target(D_lst_DF, 'D')
E_lst_xset, E_lst_Yset = _split_features_and_target(E_lst_DF, 'E')
F_lst_xset, F_lst_Yset = _split_features_and_target(F_lst_DF, 'F')

In [109]:
# IMPORT THE MACHINE LEARNING LIBRARIES
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

#### FUNCTIONS TO GENERATE THE COEFFICIENTS AND INTERCEPTS

In [110]:
def _fit_pump_flow_model(df_column_toPredict, colname_lst):
    """Fit a LinearRegression on an 80/20 train split and return (model, feature_names).

    Rows are kept only when BOTH the features and the target are present.
    Previously X and y were passed through dropna() independently, so a NaN on
    one side only left them with different lengths or silently misaligned rows.
    The mask is applied positionally, matching how scikit-learn pairs X and y.
    """
    complete_rows = colname_lst.notna().all(axis=1).to_numpy() & df_column_toPredict.notna().to_numpy()
    X1 = colname_lst[complete_rows]
    y1 = df_column_toPredict[complete_rows]
    # Fixed random_state keeps the split (and therefore the fit) reproducible.
    X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.20, random_state=0)

    lm = LinearRegression()
    lm.fit(X1_train, y1_train)
    return lm, X1.columns


def generate_lm_coefficients(df_column_toPredict, colname_lst, pumpLetter):
    """Return the fitted regression coefficients for one pump.

    Parameters
    ----------
    df_column_toPredict : pandas.Series
        Target values (the y-set).
    colname_lst : pandas.DataFrame
        Feature columns (the x-set); despite the name this is a dataframe.
    pumpLetter : str
        Pump letter used to build the output column names.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns '105PU01<letter>_FLOW' (coefficient)
        and 'PU01<letter>_Param' (feature column name).
    """
    lm, feature_names = _fit_pump_flow_model(df_column_toPredict, colname_lst)
    coeff_df_FLOW_PU01 = pd.DataFrame(
        list(zip(lm.coef_, feature_names)),
        columns=['105PU01'+str(pumpLetter)+'_FLOW','PU01'+str(pumpLetter)+'_Param'],
    )

    return coeff_df_FLOW_PU01

#--------------------------------------
def generate_lm_intercept(df_column_toPredict, colname_lst, pumpLetter):
    """Return the intercept of the same regression fitted by generate_lm_coefficients.

    ``pumpLetter`` is accepted for signature parity with
    generate_lm_coefficients; it does not influence the result.
    """
    lm, _ = _fit_pump_flow_model(df_column_toPredict, colname_lst)
    intercept_pumpflow = lm.intercept_

    return intercept_pumpflow

In [112]:
#Use the functions to generate the linear regression coefficients and the intercept per pump
A_coefficients_df = generate_lm_coefficients(A_lst_Yset, A_lst_xset, 'A')
A_intercept = generate_lm_intercept(A_lst_Yset, A_lst_xset, 'A')
#
B_coefficients_df = generate_lm_coefficients(B_lst_Yset, B_lst_xset, 'B')
B_intercept = generate_lm_intercept(B_lst_Yset, B_lst_xset, 'B')
#
C_coefficients_df = generate_lm_coefficients(C_lst_Yset, C_lst_xset, 'C')
C_intercept = generate_lm_intercept(C_lst_Yset, C_lst_xset, 'C')
#
D_coefficients_df = generate_lm_coefficients(D_lst_Yset, D_lst_xset, 'D')
D_intercept = generate_lm_intercept(D_lst_Yset, D_lst_xset, 'D')
#
E_coefficients_df = generate_lm_coefficients(E_lst_Yset, E_lst_xset, 'E')
E_intercept = generate_lm_intercept(E_lst_Yset, E_lst_xset, 'E')
#
F_coefficients_df = generate_lm_coefficients(F_lst_Yset, F_lst_xset, 'F')
F_intercept = generate_lm_intercept(F_lst_Yset, F_lst_xset, 'F')
#
# Join the per-pump coefficient tables side by side into a single dataframe
ab = A_coefficients_df.join(B_coefficients_df, how='outer')
cd = C_coefficients_df.join(D_coefficients_df, how='outer')
ef = E_coefficients_df.join(F_coefficients_df, how='outer')
abcd = ab.join(cd, how='outer')
abcdef = abcd.join(ef, how='outer')
#
# Create a list of the intercepts
# This list contains the y intercept value of each pump's flow regression model, in A..F order
lst_intercepts = [A_intercept, B_intercept, C_intercept, D_intercept, E_intercept, F_intercept]
#
#
# Create a new column called 'Param' with values 'CUR', 'PDI', 'LVL'
# (same order as the feature columns of every x-set: current, PDI, tank level)
abcdef['Param'] = ['CUR', 'PDI', 'LVL']
# Set the index to Param.
# Fix: set_index returns a new dataframe; the result used to be discarded, so the
# index was never actually changed.
abcdef = abcdef.set_index(abcdef.Param)
# Drop the PU01{}_Param.format(A to F) columns
abcdef = abcdef.drop(['PU01A_Param', 'PU01B_Param', 'PU01C_Param', 'PU01D_Param', 'PU01E_Param', 'PU01F_Param'], axis=1)
#
# Coefficients only: one column per pump, one row per parameter
coeff_abcdef = abcdef.drop(['Param'], axis=1)

In [142]:
# Display the assembled coefficient table (one column per pump flow model).
coeff_abcdef

Unnamed: 0_level_0,105PU01A_FLOW,105PU01B_FLOW,105PU01C_FLOW,105PU01D_FLOW,105PU01E_FLOW,105PU01F_FLOW
Param,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CUR,4.095293,3.049953,4.099076,4.168788,2.903212,3.946955
PDI,-0.430348,-0.102109,-0.31084,-0.33484,-0.3079,-0.380643
LVL,0.835992,0.627547,1.144482,1.207951,1.396891,0.853874


#### Loading new data to apply the fitted linear model

In [116]:
#Sheet From Pi
#Step 1: Load the data for the month.
# This plays the role of the test set: the recent data whose flow-rate
# accuracy you want to confirm.
df_Pi_FT01_PU01 = pd.read_excel(r'C:\Users\v.t.flores\Documents\POLISHING_PUMP_FLOW_DATAGET_PRESENT.xlsx', sheet_name='PI_DATA', index_col=False)
# Drop the first row of the sheet, then use the DATETIME column as the index
df_Pi_FT01_PU01 = (
    df_Pi_FT01_PU01
    .drop(df_Pi_FT01_PU01.index[0])
    .set_index('DATETIME')
)

In [138]:
# Columns that are cleaned with the replace-with-zero helper functions
PUMP_parameters = ['105TK05_LVL','105PU01A_CUR','105FT01A_PDI',
                   '105PU01B_CUR','105FT01B_PDI','105PU01C_CUR',
                   '105FT01C_PDI','105PU01D_CUR','105FT01D_PDI',
                   '105PU01E_CUR','105FT01E_PDI','105PU01F_CUR',
                   '105FT01F_PDI']
# Clean the data: replace unparsable entries ('No good values' etc.) with zero,
# then replace negative values with zero.
for param in PUMP_parameters:
    df_Pi_FT01_PU01[param] = replace_neg_data_with_Zero(
        replace_no_good_data_with_Zero(df_Pi_FT01_PU01[param])
    )
####----------------------------------------------------------------------------------------
# NOTE(review): the original cell called `df_Pi_FT01_PU01.dropna()` here without
# using the result, so it had no effect and has been removed. If rows with NaN
# were meant to be dropped, assign the result explicitly (this would also drop
# rows with NaN in columns that are not cleaned above).
df_Pi_FT01_PU01 = df_Pi_FT01_PU01.drop(df_Pi_FT01_PU01.index[-1]) #remove the NaT value at the end of the index
print('Done. All param_df_generated', datetime.datetime.now())
#------------------------------------------------

Done. All param_df_generated 2020-08-14 15:11:55.105300


In [131]:
# Build the feature frame of each pump from the newly loaded data.
# The _lst_xset_PRESENT suffix marks data taken from the test set, i.e. most likely
# the recent readings for which the flow is to be predicted.
#
def _present_features(pump_letter):
    """Return the current, PDI and tank-level columns of one pump from df_Pi_FT01_PU01."""
    wanted = ['105PU01' + pump_letter + '_CUR', '105FT01' + pump_letter + '_PDI', '105TK05_LVL']
    return df_Pi_FT01_PU01[wanted]


A_lst_xset_PRESENT = _present_features('A')
B_lst_xset_PRESENT = _present_features('B')
C_lst_xset_PRESENT = _present_features('C')
D_lst_xset_PRESENT = _present_features('D')
E_lst_xset_PRESENT = _present_features('E')
F_lst_xset_PRESENT = _present_features('F')
#
#

In [161]:
# Preview the first two rows of pump A's feature frame.
A_lst_xset_PRESENT.head(2)

Unnamed: 0_level_0,105PU01A_CUR,105FT01A_PDI,105TK05_LVL
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-05-28 00:00:00,60.724003,0.0,88.919998
2020-05-28 00:01:00,63.226402,0.0,88.82


In [165]:
# Print the three feature values (current, PDI, tank level) of the second row.
# A single positional .iloc[row, col] lookup replaces the chained .iloc[1][k],
# which indexed the label-indexed row Series with an integer.
print(A_lst_xset_PRESENT.iloc[1, 0])
print(A_lst_xset_PRESENT.iloc[1, 1])
print(A_lst_xset_PRESENT.iloc[1, 2])

63.226402282714844
0.0
88.81999969482422


In [166]:
# Display the coefficient table again for reference next to the feature values above.
coeff_abcdef

Unnamed: 0_level_0,105PU01A_FLOW,105PU01B_FLOW,105PU01C_FLOW,105PU01D_FLOW,105PU01E_FLOW,105PU01F_FLOW
Param,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CUR,4.095293,3.049953,4.099076,4.168788,2.903212,3.946955
PDI,-0.430348,-0.102109,-0.31084,-0.33484,-0.3079,-0.380643
LVL,0.835992,0.627547,1.144482,1.207951,1.396891,0.853874


In [227]:
def generate_pump_flow(pump_pi_param, pumplttr, ntrcpt_num, coefficients=None, intercepts=None):
    """Predict a pump's flow rate from its feature frame with the fitted linear model.

    For every row the prediction is ``sum(feature_k * coefficient_k) + intercept``.

    Parameters
    ----------
    pump_pi_param : pandas.DataFrame
        The ``_lst_xset_PRESENT`` frame of the pump. Its columns must be in the
        same order as the rows of the coefficient table (CUR, PDI, LVL).
    pumplttr : str
        Pump letter; selects the coefficient column '105PU01<letter>_FLOW'.
    ntrcpt_num : int
        Position of the pump's intercept in ``intercepts``.
    coefficients : pandas.DataFrame, optional
        Coefficient table; defaults to the notebook-level ``coeff_abcdef`` dataframe.
    intercepts : sequence of float, optional
        Intercepts per pump; defaults to the notebook-level ``lst_intercepts`` list.

    Returns
    -------
    pandas.DataFrame
        A single column '105PU01<letter>_FLOWpred', indexed by the input index
        renamed to 'Datetime'. An empty input gives an empty frame.
    """
    if coefficients is None:
        coefficients = coeff_abcdef
    if intercepts is None:
        intercepts = lst_intercepts

    coefname = '105PU01' + str(pumplttr) + '_FLOW'
    colName = coefname + str('pred')

    # Positional access, so this works whether the coefficient table is indexed
    # 0..n or by parameter name.
    coef_values = coefficients[coefname].to_numpy()

    # Accumulate column by column, vectorised over the rows. The summation order
    # matches the former per-row loop, so the results are numerically identical.
    pu01_flow = np.zeros(len(pump_pi_param), dtype=float)
    for i in range(pump_pi_param.shape[1]):
        pu01_flow = pu01_flow + pump_pi_param.iloc[:, i].to_numpy(dtype=float) * coef_values[i]
    pu01_flow = pu01_flow + intercepts[ntrcpt_num]

    df_pu01_flow_and_date = pd.DataFrame(
        {colName: pu01_flow},
        index=pump_pi_param.index.rename('Datetime'),
    )

    return df_pu01_flow_and_date

#------------------------------------------------------------function convert df to numeric--------------------------------------------
#This function converts the df to numeric. Returns nothing. Just executes the script
def convert_to_numeric(df):
    """Convert every column of ``df`` to numeric, in place.

    Values that cannot be parsed become NaN (``errors='coerce'``). A column
    that cannot be converted at all is reported and left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert; it is modified in place.

    Returns
    -------
    None
    """
    for i in df.columns:
        try:
            df[i] = pd.to_numeric(df[i], errors='coerce')
        except (TypeError, ValueError):
            # Narrowed from a bare `except:` so that KeyboardInterrupt and
            # genuine programming errors are no longer swallowed.
            print(i, " has error.")
#            
#--------------------------------------------------------------End-----------------------------------------------------------------------  

In [228]:
# Predict the flow of each pump; the position in this sequence doubles as the
# pump's index into the intercept list (A=0 ... F=5).
present_feature_sets = (
    ('A', A_lst_xset_PRESENT),
    ('B', B_lst_xset_PRESENT),
    ('C', C_lst_xset_PRESENT),
    ('D', D_lst_xset_PRESENT),
    ('E', E_lst_xset_PRESENT),
    ('F', F_lst_xset_PRESENT),
)
PU01A, PU01B, PU01C, PU01D, PU01E, PU01F = [
    generate_pump_flow(feature_set, letter, position)
    for position, (letter, feature_set) in enumerate(present_feature_sets)
]

# Make sure the measured data and every prediction frame are numeric
for frame in (df_Pi_FT01_PU01, PU01A, PU01B, PU01C, PU01D, PU01E, PU01F):
    convert_to_numeric(frame)




In [215]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Pump A: measured flow and predicted flow on the primary axis,
# absolute prediction error as bars on the secondary axis.
measured_flow_A = df_Pi_FT01_PU01['105PU01A_FLOW']
predicted_flow_A = PU01A['105PU01A_FLOWpred']
abs_error_A = abs(predicted_flow_A - measured_flow_A)

fig_test = make_subplots(specs=[[{"type": "xy", "secondary_y": True}]])
fig_test.add_trace(go.Scatter(x=df_Pi_FT01_PU01.index, y=measured_flow_A))
fig_test.add_trace(go.Bar(x=df_Pi_FT01_PU01.index, y=abs_error_A), secondary_y=True)
fig_test.add_trace(go.Scatter(x=PU01A.index, y=predicted_flow_A), secondary_y=False)

In [229]:
# Pump C: measured flow and predicted flow on the primary axis,
# absolute prediction error as bars on the secondary axis.
measured_flow_C = df_Pi_FT01_PU01['105PU01C_FLOW']
predicted_flow_C = PU01C['105PU01C_FLOWpred']
abs_error_C = abs(predicted_flow_C - measured_flow_C)

fig3_test = make_subplots(specs=[[{"type": "xy", "secondary_y": True}]])
fig3_test.add_trace(go.Scatter(x=df_Pi_FT01_PU01.index, y=measured_flow_C))
fig3_test.add_trace(go.Bar(x=df_Pi_FT01_PU01.index, y=abs_error_C), secondary_y=True)
fig3_test.add_trace(go.Scatter(x=PU01C.index, y=predicted_flow_C), secondary_y=False)

In [135]:
# Print the most recent predicted flow of every pump.
# Fix: generate_pump_flow names its column '105PU01<letter>_FLOWpred', so the old
# '105PU01<letter>_FLOW' lookups raise KeyError on a fresh run; .iloc[-1] is used
# because the frames are indexed by datetime, not by position.
print(PU01A['105PU01A_FLOWpred'].iloc[-1])
print(PU01B['105PU01B_FLOWpred'].iloc[-1])
print(PU01C['105PU01C_FLOWpred'].iloc[-1])
print(PU01D['105PU01D_FLOWpred'].iloc[-1])
print(PU01E['105PU01E_FLOWpred'].iloc[-1])
print(PU01F['105PU01F_FLOWpred'].iloc[-1])

243.82797992580578
208.02003944269964
237.94989533608035
116.46551768086022
234.7281875124813
209.4767357939657


In [109]:
#Spare code for creating linear regression------------------------------------
# IMPORT THE MACHINE LEARNING LIBRARIES
#import numpy as np
#from matplotlib import pyplot as plt
#from sklearn.model_selection import train_test_split
#from sklearn.linear_model import LinearRegression
#
#X1A = A_lst_xset.dropna()
#y1A = A_lst_Yset.dropna()
#X1_train, X1_test, y1_train, y1_test = train_test_split(X1A, y1A, test_size=0.20, random_state=0)
#
#from sklearn.linear_model import LinearRegression
#lm_A = LinearRegression()
#lm_A.fit(X1_train,y1_train)
#
#coeff_df_FLOW_PU01A = pd.DataFrame(zip(lm_A.coef_, X1A.columns), columns=['105PU01A_FLOW','PU01A_Param'])
#A = lm_A.intercept_
#


# Spare code for how the generate_pump_flow function works
#
#
#ft01A_lst = list()
#ft01A_datelist = list()
#for j in range(0, len(A_lst_xset_PRESENT)):
#    pu01_flow = 0
#    for i in range(0, len(A_lst_xset_PRESENT.iloc[j])):
#        temp_product = A_lst_xset_PRESENT.iloc[j][i] * coeff_abcdef['105PU01A_FLOW'][i]
#        pu01_flow = pu01_flow + temp_product
#    pu01_flow_new = pu01_flow + lst_intercepts[1]
#    ft01A_lst.append(pu01_flow_new)
#    ft01A_datelist.append(A_lst_xset_PRESENT.index[j])
#    
#pu01_flow_and_date_lst = list(zip(ft01A_datelist, ft01A_lst))
#df_pu01_flow_and_date = pd.DataFrame(pu01_flow_and_date_lst)
#df_pu01_flow_and_date = df_pu01_flow_and_date.set_index(0)
#df_pu01_flow_and_date['105PU01A_FLOW'] = df_pu01_flow_and_date[1]
#df_pu01_flow_and_date = df_pu01_flow_and_date.drop(1, axis=1)