In [184]:
import pandas as pd

def process_data(filepath):
    df = pd.read_csv(filepath, usecols=[0], header=None, encoding='unicode_escape')
    df = df.iloc[1:,:]

    # Reset the index of the DataFrame
    df = df.reset_index(drop=True)

    # Name the first column
    df.columns = ["Raw"]
    
    # Split the 'Raw' column by multiple spaces
    split_data = df["Raw"].str.split(r'\s+', expand=True)
    
    # Extract Date and Time
    df['Date'] = split_data[0] + ' ' + split_data[2]
    
    # Extract Data value
    df['Data'] = split_data[3]
    
    # Drop the original 'Raw' column
    df = df[['Date', 'Data']]
    
    # Convert 'Data' to numeric
    df['Data'] = pd.to_numeric(df['Data'], errors='coerce')

    df['Date'] = df['Date'].apply(pd.to_datetime, format='%d/%m/%Y %H:%M:%S', errors='coerce')
    
    # Drop rows with NaT in 'Date' column
    df.dropna(subset='Date', inplace=True)

    return df

def process_inflow_data(filepath):
    df = pd.read_csv(filepath, header=None, encoding='unicode_escape')

    # Reset the index of the DataFrame
    df = df.reset_index(drop=True)

    # Extract Date and Time
    df['Date'] = df[0] + ' ' + df[2]
    
    # Extract Data value
    df['Data'] = df[3]

    df = df[['Date', 'Data']]
    
    # Convert 'Data' to numeric
    df['Data'] = pd.to_numeric(df['Data'], errors='coerce')

    df['Date'] = df['Date'].apply(pd.to_datetime, format='%d/%m/%Y %H:%M:%S', errors='coerce')

    df = df.sort_values(by='Date')
    
    # Drop rows with NaT in 'Date' column
    df.dropna(subset='Date', inplace=True)

    return df

# flow_df = process_data("../../data-lake/HT/Hydrology/Arthurs Lake Spillway (418.1)/Inflow.csv")
#print(flow_df)
flow_df = process_inflow_data("../../data-lake/HT/Hydrology/Arthurs Lake Spillway (418.1)/Flow into LakeRiver Arthurs.csv")
print(flow_df)

temp_df = process_data("../../data-lake/HT/Hydrology/Arthurs Lake Spillway (418.1)/WQ at Morass Bay (418.24)/Continuous/Water Temp.csv")
#print(temp_df)

sal_df = process_data("../../data-lake/HT/Hydrology/Arthurs Lake Spillway (418.1)/WQ at Morass Bay (418.24)/Continuous/salinity.csv")
#print(sal_df)

chloa_df = process_data("../../data-lake/HT/Hydrology/Arthurs Lake Spillway (418.1)/WQ at Morass Bay (418.24)/Continuous/chloro_a.csv")
#print(chloa_df)

                      Date      Data
1      1998-01-01 01:00:00  0.000000
2      1998-01-01 02:00:00  0.000000
3      1998-01-01 03:00:00  0.000000
4      1998-01-01 04:00:00  0.000000
5      1998-01-01 05:00:00  0.000000
...                    ...       ...
232545 2024-12-07 09:00:00  0.001830
232546 2024-12-07 10:00:00  0.001772
232547 2024-12-07 11:00:00  0.001763
232548 2024-12-07 12:00:00  0.001725
232549 2024-12-07 13:00:00  0.001725

[232549 rows x 2 columns]


In [185]:
# print(flow_df.iloc[162144:162174,:])
# print(flow_df_processed.iloc[648612:648632,:])
# print(flow_df_processed[flow_df_processed["Date"]=='2016-07-01'])

In [186]:
# import matplotlib.pyplot as plt
# flow_df["Date"] = pd.to_datetime(flow_df["Date"])
# plt.figure(figsize=(14,7))
# plt.plot(flow_df['Date'], flow_df['Data'])
# plt.xlabel('Date')
# plt.ylabel('Discharge/Flow')
# plt.show()

In [187]:
import pandas as pd

def process_processed_data(filepath):
    df = pd.read_csv(filepath, encoding='unicode_escape')

    # Reset the index of the DataFrame
    df = df.reset_index(drop=True)

    df = df.loc[:,["Date","Data"]]

    df.dropna(subset='Date', inplace=True)

    return df

ss_df = process_processed_data("../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_TotalSuspendedSolids_profile_Data.csv")

nh4_df = process_processed_data("../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_Ammonium_profile_Data.csv")

no3_df = process_processed_data("../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_Nitrate_profile_Data.csv")

oxy_df = process_processed_data("../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_DissolvedOxygen_profile_Data.csv")

frp_df = process_processed_data("../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_FilterableReactivePhosphorus_profile_Data.csv")

tn_df = process_processed_data("../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_TotalNitrogen_profile_Data.csv")

tp_df = process_processed_data("../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_TotalPhosphorus_profile_Data.csv")

print(tp_df)

                   Date   Data
0   2015-05-15 12:20:00  0.010
1   2015-07-15 11:35:00  0.010
2   2015-09-10 10:32:00  0.010
3   2015-11-11 10:30:00  0.010
4   2016-01-28 10:35:00  0.010
5   2016-03-31 15:40:00  0.010
6   2016-05-26 12:30:00  0.010
7   2016-07-21 12:20:00  0.010
8   2016-09-14 12:40:00  0.010
9   2016-11-24 13:00:00  0.010
10  2017-01-27 15:15:00  0.010
11  2017-02-22 12:40:00  0.010
12  2017-04-28 09:20:00  0.010
13  2017-06-21 16:30:00  0.010
14  2017-07-25 17:00:00  0.010
15  2017-09-11 16:40:00  0.010
16  2017-10-18 10:15:00  0.010
17  2017-11-23 17:05:00  0.010
18  2018-01-11 18:10:00  0.010
19  2018-03-22 15:10:00  0.010
20  2018-05-08 14:20:00  0.010
21  2018-07-26 14:30:00  0.010
22  2018-09-26 12:30:00  0.010
23  2018-11-28 10:45:00  0.010
24  2019-01-09 16:35:00  0.010
25  2019-02-13 17:25:00  0.010
26  2019-03-12 16:20:00  0.010
27  2019-04-16 14:04:00  0.010
28  2019-05-22 09:30:00  0.010
29  2019-06-20 12:00:00  0.010
30  2019-07-23 17:00:00  0.010
31  2019

In [188]:
def process_dataframe(df, clip_data=True):

    # Ensure 'Date' column is in datetime format
    df['Date'] = pd.to_datetime(df['Date'])

    # Group by 'Date' and average the 'Data' values for duplicate timestamps
    df = df.groupby('Date').mean().reset_index()
    
    # Set 'Date' as index for resampling
    df.set_index('Date', inplace=True)

    # Resample the DataFrame to 15-minute intervals and interpolate
    df = df.resample('15T').interpolate(method='linear')

    # Forward fill and backward fill to handle NaNs at the boundaries
    df = df.ffill().bfill()
    
    # Reset the index to convert it back to a column
    df = df.reset_index()
    
    # Apply clipping only if specified and if the DataFrame is flow_df
    if clip_data:
        df['Data'] = df['Data'].clip(lower=0)
    
    return df

# Assuming flow_df and temp_df are already defined DataFrames
flow_df_processed = process_dataframe(flow_df.copy())
temp_df_processed = process_dataframe(temp_df.copy(), clip_data=False)
sal_df_processed = process_dataframe(sal_df.copy())
chloa_df_processed = process_dataframe(chloa_df.copy())

ss_df_processed = process_dataframe(ss_df.copy())
nh4_df_processed = process_dataframe(nh4_df.copy())
no3_df_processed = process_dataframe(no3_df.copy())
oxy_df_processed = process_dataframe(oxy_df.copy())
frp_df_processed = process_dataframe(frp_df.copy())
tn_df_processed = process_dataframe(tn_df.copy())
tp_df_processed = process_dataframe(tp_df.copy())

In [189]:
on_df_processed = tn_df_processed
on_df_processed["Data"] = tn_df_processed["Data"] - no3_df_processed["Data"] - nh4_df_processed["Data"]

op_df_processed = tp_df_processed
op_df_processed["Data"] = tp_df_processed["Data"] - frp_df_processed["Data"]

chloa_df_processed["Data"] *= 0.001
print(chloa_df_processed)

                      Date      Data
0      2016-01-07 12:00:00  0.000865
1      2016-01-07 12:15:00  0.000842
2      2016-01-07 12:30:00  0.000819
3      2016-01-07 12:45:00  0.000796
4      2016-01-07 13:00:00  0.000772
...                    ...       ...
292748 2024-05-13 23:00:00  0.000999
292749 2024-05-13 23:15:00  0.000985
292750 2024-05-13 23:30:00  0.000971
292751 2024-05-13 23:45:00  0.000956
292752 2024-05-14 00:00:00  0.000942

[292753 rows x 2 columns]


In [190]:
oxy_df_processed["Data"] *= (1000/32)
nh4_df_processed["Data"] *= (1000/14)
no3_df_processed["Data"] *= (1000/14)
frp_df_processed["Data"] *= (1000/31)
on_df_processed["Data"] *= (1000/14)
op_df_processed["Data"] *= (1000/31)
chloa_df_processed["Data"] *= (1000/893.51)

print(chloa_df_processed)

                      Date      Data
0      2016-01-07 12:00:00  0.000968
1      2016-01-07 12:15:00  0.000942
2      2016-01-07 12:30:00  0.000916
3      2016-01-07 12:45:00  0.000890
4      2016-01-07 13:00:00  0.000865
...                    ...       ...
292748 2024-05-13 23:00:00  0.001118
292749 2024-05-13 23:15:00  0.001102
292750 2024-05-13 23:30:00  0.001086
292751 2024-05-13 23:45:00  0.001070
292752 2024-05-14 00:00:00  0.001054

[292753 rows x 2 columns]


In [191]:
# Convert Date columns to datetime format
flow_df_processed['Date'] = pd.to_datetime(flow_df_processed['Date'])
# print(flow_df_processed.iloc[648612:648632,:])

temp_df_processed['Date'] = pd.to_datetime(temp_df_processed['Date'])
sal_df_processed['Date'] = pd.to_datetime(sal_df_processed['Date'])
chloa_df_processed['Date'] = pd.to_datetime(chloa_df_processed['Date'])

# Merge based on Date
merged_df = pd.merge(flow_df_processed, temp_df_processed, on='Date', suffixes=('_flow', '_temp'), how='inner')
# print(merged_df.iloc[648612:648632,:])

# Merge the result with sal_df_processed based on Date, and rename the 'Data' column to 'Data_sal'
merged_df = pd.merge(merged_df, sal_df_processed.rename(columns={'Data': 'Data_sal'}), on='Date', how='inner')

merged_df = pd.merge(merged_df, chloa_df_processed.rename(columns={'Data': 'CHLA'}), on='Date', how='inner')

# Display the merged DataFrame with required columns
# print(merged_df)

In [192]:
ss_df_processed['Date'] = pd.to_datetime(ss_df_processed['Date'])
nh4_df_processed['Date'] = pd.to_datetime(nh4_df_processed['Date'])
no3_df_processed['Date'] = pd.to_datetime(no3_df_processed['Date'])
oxy_df_processed['Date'] = pd.to_datetime(oxy_df_processed['Date'])
frp_df_processed['Date'] = pd.to_datetime(frp_df_processed['Date'])
on_df_processed['Date'] = pd.to_datetime(on_df_processed['Date'])
op_df_processed['Date'] = pd.to_datetime(op_df_processed['Date'])

ss_mean = ss_df['Data'].mean()
nh4_mean = nh4_df['Data'].mean() * (1000/14)
no3_mean = no3_df['Data'].mean() * (1000/14)
oxy_mean = oxy_df['Data'].mean() * (1000/32)
frp_mean = frp_df['Data'].mean() * (1000/31)

on_mean = on_df_processed['Data'].mean()
op_mean = op_df_processed['Data'].mean()
print(op_mean)

merged_df = pd.merge(merged_df, ss_df_processed.rename(columns={'Data': 'SS'}), on='Date', how='left')
merged_df = pd.merge(merged_df, nh4_df_processed.rename(columns={'Data': 'NH4'}), on='Date', how='left')
merged_df = pd.merge(merged_df, no3_df_processed.rename(columns={'Data': 'NO3'}), on='Date', how='left')
merged_df = pd.merge(merged_df, oxy_df_processed.rename(columns={'Data': 'OXY'}), on='Date', how='left')
merged_df = pd.merge(merged_df, frp_df_processed.rename(columns={'Data': 'FRP'}), on='Date', how='left')
merged_df = pd.merge(merged_df, on_df_processed.rename(columns={'Data': 'ON'}), on='Date', how='left')
merged_df = pd.merge(merged_df, op_df_processed.rename(columns={'Data': 'OP'}), on='Date', how='left')

merged_df =merged_df.fillna({'SS': ss_mean, 'NH4': nh4_mean, 'NO3': no3_mean, 'OXY': oxy_mean, 'FRP': frp_mean, 'ON': on_mean, 'OP': op_mean})
print(merged_df)

0.2051200007473771
                      Date  Data_flow  Data_temp  Data_sal      CHLA  \
0      2016-01-07 12:00:00   0.001500  17.131250  0.020000  0.000968   
1      2016-01-07 12:15:00   0.001499  17.142500  0.020000  0.000942   
2      2016-01-07 12:30:00   0.001498  17.153750  0.020000  0.000916   
3      2016-01-07 12:45:00   0.001496  17.165000  0.020000  0.000890   
4      2016-01-07 13:00:00   0.001495  17.176250  0.020000  0.000865   
...                    ...        ...        ...       ...       ...   
292748 2024-05-13 23:00:00   0.506915   8.563319  0.012611  0.001118   
292749 2024-05-13 23:15:00   0.506914   8.556447  0.012608  0.001102   
292750 2024-05-13 23:30:00   0.506912   8.549575  0.012605  0.001086   
292751 2024-05-13 23:45:00   0.506911   8.542702  0.012602  0.001070   
292752 2024-05-14 00:00:00   0.506910   8.535830  0.012599  0.001054   

               SS       NH4       NO3         OXY       FRP         ON  \
0       10.342105  0.357143  0.193605  295

In [193]:
merged_df.isnull().any().any()

False

In [194]:
# Observed mean of DOC, NPOC - dis, NPOC - tot (Woods POC DOC.png), assumed
merged_df['OC'] = 6 * (1000/12)

merged_df['ZEROS'] = 0
merged_df['ONES'] = 1

print(merged_df)

                      Date  Data_flow  Data_temp  Data_sal      CHLA  \
0      2016-01-07 12:00:00   0.001500  17.131250  0.020000  0.000968   
1      2016-01-07 12:15:00   0.001499  17.142500  0.020000  0.000942   
2      2016-01-07 12:30:00   0.001498  17.153750  0.020000  0.000916   
3      2016-01-07 12:45:00   0.001496  17.165000  0.020000  0.000890   
4      2016-01-07 13:00:00   0.001495  17.176250  0.020000  0.000865   
...                    ...        ...        ...       ...       ...   
292748 2024-05-13 23:00:00   0.506915   8.563319  0.012611  0.001118   
292749 2024-05-13 23:15:00   0.506914   8.556447  0.012608  0.001102   
292750 2024-05-13 23:30:00   0.506912   8.549575  0.012605  0.001086   
292751 2024-05-13 23:45:00   0.506911   8.542702  0.012602  0.001070   
292752 2024-05-14 00:00:00   0.506910   8.535830  0.012599  0.001054   

               SS       NH4       NO3         OXY       FRP         ON  \
0       10.342105  0.357143  0.193605  295.625000  0.151950  

In [195]:
merged_df["Date"].min()

Timestamp('2016-01-07 12:00:00')

In [196]:
merged_df.rename(columns = {'Data_flow':'FLOW', 'Data_temp':'TEMP',
                            'Data_sal':'SAL'}, inplace = True)

# Convert the 'Date' column to datetime format with the correct format string
merged_df['Date'] = pd.to_datetime(merged_df['Date'], format='%d/%m/%Y %H:%M:%S', errors='coerce')

# Convert date to desired format
# merged_df['Date'] = merged_df['Date'].dt.strftime('%d/%m/%Y %H:%M:%S')

merged_df = merged_df.sort_values(by='Date')

print(merged_df)

                      Date      FLOW       TEMP       SAL      CHLA  \
0      2016-01-07 12:00:00  0.001500  17.131250  0.020000  0.000968   
1      2016-01-07 12:15:00  0.001499  17.142500  0.020000  0.000942   
2      2016-01-07 12:30:00  0.001498  17.153750  0.020000  0.000916   
3      2016-01-07 12:45:00  0.001496  17.165000  0.020000  0.000890   
4      2016-01-07 13:00:00  0.001495  17.176250  0.020000  0.000865   
...                    ...       ...        ...       ...       ...   
292748 2024-05-13 23:00:00  0.506915   8.563319  0.012611  0.001118   
292749 2024-05-13 23:15:00  0.506914   8.556447  0.012608  0.001102   
292750 2024-05-13 23:30:00  0.506912   8.549575  0.012605  0.001086   
292751 2024-05-13 23:45:00  0.506911   8.542702  0.012602  0.001070   
292752 2024-05-14 00:00:00  0.506910   8.535830  0.012599  0.001054   

               SS       NH4       NO3         OXY       FRP         ON  \
0       10.342105  0.357143  0.193605  295.625000  0.151950  21.705066   

In [197]:
# Convert the 'Date' column to datetime
merged_df['Date'] = pd.to_datetime(merged_df['Date'])

# Format the 'Date' column to 'dd/mm/yyyy HH:MM:SS'
merged_df['Date'] = merged_df['Date'].dt.strftime('%d/%m/%Y %H:%M:%S')

print(merged_df)


                       Date      FLOW       TEMP       SAL      CHLA  \
0       07/01/2016 12:00:00  0.001500  17.131250  0.020000  0.000968   
1       07/01/2016 12:15:00  0.001499  17.142500  0.020000  0.000942   
2       07/01/2016 12:30:00  0.001498  17.153750  0.020000  0.000916   
3       07/01/2016 12:45:00  0.001496  17.165000  0.020000  0.000890   
4       07/01/2016 13:00:00  0.001495  17.176250  0.020000  0.000865   
...                     ...       ...        ...       ...       ...   
292748  13/05/2024 23:00:00  0.506915   8.563319  0.012611  0.001118   
292749  13/05/2024 23:15:00  0.506914   8.556447  0.012608  0.001102   
292750  13/05/2024 23:30:00  0.506912   8.549575  0.012605  0.001086   
292751  13/05/2024 23:45:00  0.506911   8.542702  0.012602  0.001070   
292752  14/05/2024 00:00:00  0.506910   8.535830  0.012599  0.001054   

               SS       NH4       NO3         OXY       FRP         ON  \
0       10.342105  0.357143  0.193605  295.625000  0.151950  

In [198]:
print(merged_df.tail())

                       Date      FLOW      TEMP       SAL      CHLA  \
292748  13/05/2024 23:00:00  0.506915  8.563319  0.012611  0.001118   
292749  13/05/2024 23:15:00  0.506914  8.556447  0.012608  0.001102   
292750  13/05/2024 23:30:00  0.506912  8.549575  0.012605  0.001086   
292751  13/05/2024 23:45:00  0.506911  8.542702  0.012602  0.001070   
292752  14/05/2024 00:00:00  0.506910  8.535830  0.012599  0.001054   

               SS       NH4       NO3         OXY       FRP         ON  \
292748  10.342105  0.371118  0.156832  329.959677  0.117812  20.975578   
292749  10.342105  0.371118  0.156832  329.959677  0.117812  20.975578   
292750  10.342105  0.371118  0.156832  329.959677  0.117812  20.975578   
292751  10.342105  0.371118  0.156832  329.959677  0.117812  20.975578   
292752  10.342105  0.371118  0.156832  329.959677  0.117812  20.975578   

             OP     OC  ZEROS  ONES  
292748  0.20512  500.0      0     1  
292749  0.20512  500.0      0     1  
292750  0.2051

In [199]:
# Write the DataFrame to a CSV file
merged_df.to_csv("inflow_woods_dam_20160107-20240513.csv", index=False)