In [29]:
import pandas as pd

def process_data(filepath):
    """Load a raw single-column export and return a tidy Date/Data frame.

    Only the first CSV column is read and the first row is discarded. Each
    remaining cell is split on runs of whitespace; token 0 and token 2 are
    joined into a ``'%d/%m/%Y %H:%M:%S'`` timestamp and token 3 is taken as
    the measurement (token 1 is ignored).

    Parameters
    ----------
    filepath : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` (datetime64) and ``Data`` (float). Rows whose
        timestamp cannot be parsed are dropped; values that cannot be parsed
        become NaN but the row is kept. The index is the row position before
        unparseable timestamps were dropped (it is not renumbered).
    """
    raw = pd.read_csv(filepath, usecols=[0], header=None, encoding='unicode_escape')

    # Skip the first row and renumber from zero.
    lines = raw.iloc[1:, 0].reset_index(drop=True)

    # One column per whitespace-separated token.
    tokens = lines.str.split(r'\s+', expand=True)

    # Parse the whole column at once; calling pd.to_datetime per row through
    # Series.apply yields the same values but is far slower on long series.
    parsed = pd.DataFrame({
        'Date': pd.to_datetime(
            tokens[0] + ' ' + tokens[2],
            format='%d/%m/%Y %H:%M:%S',
            errors='coerce',
        ),
        'Data': pd.to_numeric(tokens[3], errors='coerce'),
    })

    # Rows without a usable timestamp cannot be placed on the time axis.
    return parsed.dropna(subset=['Date'])

# Raw exports under the "Arthurs Lake Spillway (418.1)" hydrology folder:
# the inflow series plus the continuous series stored under
# "WQ at Morass Bay (418.24)". Files are parsed in the order listed.
flow_df, temp_df, sal_df, chloa_df = (
    process_data(raw_csv)
    for raw_csv in (
        "../../data-lake/HT/Hydrology/Arthurs Lake Spillway (418.1)/Inflow.csv",
        "../../data-lake/HT/Hydrology/Arthurs Lake Spillway (418.1)/WQ at Morass Bay (418.24)/Continuous/Water Temp.csv",
        "../../data-lake/HT/Hydrology/Arthurs Lake Spillway (418.1)/WQ at Morass Bay (418.24)/Continuous/salinity.csv",
        "../../data-lake/HT/Hydrology/Arthurs Lake Spillway (418.1)/WQ at Morass Bay (418.24)/Continuous/chloro_a.csv",
    )
)

In [30]:
import pandas as pd

def process_processed_data(filepath):
    """Read an already-processed CSV and keep its ``Date`` and ``Data`` columns.

    The file is read with ``pandas.read_csv``, the index is replaced by a
    fresh 0..n-1 range, every column other than ``Date`` and ``Data`` is
    discarded, and rows with a missing ``Date`` are removed. ``Date`` is
    returned exactly as read (no datetime conversion happens here).
    """
    loaded = pd.read_csv(filepath, encoding='unicode_escape').reset_index(drop=True)
    return loaded[["Date", "Data"]].dropna(subset='Date')

# Profile series from the data warehouse, loaded in the order listed:
# total suspended solids, ammonium, nitrate, dissolved oxygen, filtered
# reactive phosphorus, total nitrogen, total phosphorus.
ss_df, nh4_df, no3_df, oxy_df, frp_df, tn_df, tp_df = (
    process_processed_data(profile_csv)
    for profile_csv in (
        "../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_TotalSuspendedSolids_profile_Data.csv",
        "../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_Ammonium_profile_Data.csv",
        "../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_Nitrate_profile_Data.csv",
        "../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_DissolvedOxygen_profile_Data.csv",
        "../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_FilteredReactivePhosphorus_profile_Data.csv",
        "../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_TotalNitrogen_profile_Data.csv",
        "../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_TotalPhosphorus_profile_Data.csv",
    )
)

print(tp_df)

                   Date   Data
0   2015-05-15 12:20:00  0.010
1   2015-07-15 11:35:00  0.010
2   2015-09-10 10:32:00  0.010
3   2015-11-11 10:30:00  0.010
4   2016-01-28 10:35:00  0.010
5   2016-03-31 15:40:00  0.010
6   2016-05-26 12:30:00  0.010
7   2016-07-21 12:20:00  0.010
8   2016-09-14 12:40:00  0.010
9   2016-11-24 13:00:00  0.010
10  2017-01-27 15:15:00  0.010
11  2017-02-22 12:40:00  0.010
12  2017-04-28 09:20:00  0.010
13  2017-06-21 16:30:00  0.010
14  2017-07-25 17:00:00  0.010
15  2017-09-11 16:40:00  0.010
16  2017-10-18 10:15:00  0.010
17  2017-11-23 17:05:00  0.010
18  2018-01-11 18:10:00  0.010
19  2018-03-22 15:10:00  0.010
20  2018-05-08 14:20:00  0.010
21  2018-07-26 14:30:00  0.010
22  2018-09-26 12:30:00  0.010
23  2018-11-28 10:45:00  0.010
24  2019-01-09 16:35:00  0.010
25  2019-02-13 17:25:00  0.010
26  2019-03-12 16:20:00  0.010
27  2019-04-16 14:04:00  0.010
28  2019-05-22 09:30:00  0.010
29  2019-06-20 12:00:00  0.010
30  2019-07-23 17:00:00  0.010
31  2019

In [35]:
def process_dataframe(df, clip_data=True):

    # Ensure 'Date' column is in datetime format
    df['Date'] = pd.to_datetime(df['Date'])

    # Group by 'Date' and average the 'Data' values for duplicate timestamps
    df = df.groupby('Date').mean().reset_index()
    
    # Set 'Date' as index for resampling
    df.set_index('Date', inplace=True)

    # Resample the DataFrame to 15-minute intervals and interpolate
    df = df.resample('15T').interpolate(method='linear')

    # Forward fill and backward fill to handle NaNs at the boundaries
    df = df.ffill().bfill()
    
    # Reset the index to convert it back to a column
    df = df.reset_index()
    
    # Apply clipping only if specified and if the DataFrame is flow_df
    if clip_data:
        df['Data'] = df['Data'].clip(lower=0)
    
    return df

# Requires the raw frames loaded by the earlier cells. Each frame is copied
# before processing; only the temperature series is left unclipped.
(
    flow_df_processed,
    temp_df_processed,
    sal_df_processed,
    chloa_df_processed,
    ss_df_processed,
    nh4_df_processed,
    no3_df_processed,
    oxy_df_processed,
    frp_df_processed,
    tn_df_processed,
    tp_df_processed,
) = (
    process_dataframe(source.copy(), clip_data=clip)
    for source, clip in (
        (flow_df, True),
        (temp_df, False),
        (sal_df, True),
        (chloa_df, True),
        (ss_df, True),
        (nh4_df, True),
        (no3_df, True),
        (oxy_df, True),
        (frp_df, True),
        (tn_df, True),
        (tp_df, True),
    )
)

print(chloa_df_processed)

                      Date      Data
0      2016-01-07 12:00:00  0.865000
1      2016-01-07 12:15:00  0.841875
2      2016-01-07 12:30:00  0.818750
3      2016-01-07 12:45:00  0.795625
4      2016-01-07 13:00:00  0.772500
...                    ...       ...
292748 2024-05-13 23:00:00  0.998928
292749 2024-05-13 23:15:00  0.984716
292750 2024-05-13 23:30:00  0.970504
292751 2024-05-13 23:45:00  0.956292
292752 2024-05-14 00:00:00  0.942080

[292753 rows x 2 columns]


In [36]:
def _subtract_on_date(total, *parts):
    """Return a new Date/Data frame holding ``total - sum(parts)``.

    Values are matched by ``Date`` rather than by row position, so frames
    that start or end on different dates are still subtracted at the same
    timestamp. The result covers the dates of ``total``; where a part has no
    value for a date the result is NaN. None of the inputs is modified.
    """
    remainder = total.set_index("Date")["Data"]
    for part in parts:
        remainder = remainder - part.set_index("Date")["Data"].reindex(remainder.index)
    return remainder.rename("Data").reset_index()


# ON = TN - NO3 - NH4 and OP = TP - FRP.
# Build new frames instead of aliasing tn_df_processed / tp_df_processed:
# with a plain alias the assignment overwrote the TN/TP data in place, and
# re-running the cell subtracted the parts a second time.
on_df_processed = _subtract_on_date(tn_df_processed, no3_df_processed, nh4_df_processed)
op_df_processed = _subtract_on_date(tp_df_processed, frp_df_processed)

# NOTE(review): this scales in place, so running the cell twice scales twice;
# restart and run all cells in order. The 0.001 factor is presumably a unit
# conversion - confirm against the source data.
chloa_df_processed["Data"] *= 0.001
print(chloa_df_processed)

                      Date      Data
0      2016-01-07 12:00:00  0.000865
1      2016-01-07 12:15:00  0.000842
2      2016-01-07 12:30:00  0.000819
3      2016-01-07 12:45:00  0.000796
4      2016-01-07 13:00:00  0.000772
...                    ...       ...
292748 2024-05-13 23:00:00  0.000999
292749 2024-05-13 23:15:00  0.000985
292750 2024-05-13 23:30:00  0.000971
292751 2024-05-13 23:45:00  0.000956
292752 2024-05-14 00:00:00  0.000942

[292753 rows x 2 columns]


In [37]:
# Make sure every continuous series carries a datetime 'Date' column.
for continuous in (flow_df_processed, temp_df_processed, sal_df_processed, chloa_df_processed):
    continuous['Date'] = pd.to_datetime(continuous['Date'])

# Inner-join the four series on 'Date' so only timestamps present in all of
# them survive. Flow and temperature get the '_flow' / '_temp' suffixes;
# salinity and chlorophyll-a are renamed to 'Data_sal' and 'CHLA'.
merged_df = (
    flow_df_processed
    .merge(temp_df_processed, on='Date', suffixes=('_flow', '_temp'), how='inner')
    .merge(sal_df_processed.rename(columns={'Data': 'Data_sal'}), on='Date', how='inner')
    .merge(chloa_df_processed.rename(columns={'Data': 'CHLA'}), on='Date', how='inner')
)

# Display the merged DataFrame with required columns
# print(merged_df)

In [38]:
# Output column name -> resampled frame, in the order the columns are added.
nutrient_frames = {
    'SS': ss_df_processed,
    'NH4': nh4_df_processed,
    'NO3': no3_df_processed,
    'OXY': oxy_df_processed,
    'FRP': frp_df_processed,
    'ON': on_df_processed,
    'OP': op_df_processed,
}

# Make sure every frame carries a datetime 'Date' column before joining.
for nutrient_frame in nutrient_frames.values():
    nutrient_frame['Date'] = pd.to_datetime(nutrient_frame['Date'])

# Fill values: the first five are means of the series as loaded (before
# resampling); ON and OP only exist as resampled frames, so their means are
# taken from those.
ss_mean = ss_df['Data'].mean()
nh4_mean = nh4_df['Data'].mean()
no3_mean = no3_df['Data'].mean()
oxy_mean = oxy_df['Data'].mean()
frp_mean = frp_df['Data'].mean()

on_mean = on_df_processed['Data'].mean()
op_mean = op_df_processed['Data'].mean()
print(op_mean)

# Left joins keep every row of merged_df; timestamps a series does not cover
# come through as NaN and are filled with that series' mean just below.
for column_name, nutrient_frame in nutrient_frames.items():
    merged_df = merged_df.merge(
        nutrient_frame.rename(columns={'Data': column_name}),
        on='Date',
        how='left',
    )

fill_values = {
    'SS': ss_mean,
    'NH4': nh4_mean,
    'NO3': no3_mean,
    'OXY': oxy_mean,
    'FRP': frp_mean,
    'ON': on_mean,
    'OP': op_mean,
}
merged_df = merged_df.fillna(fill_values)
print(merged_df)

0.006358720023168696
                      Date  Data_flow  Data_temp  Data_sal      CHLA  \
0      2017-09-08 13:00:00   1.957236   3.610125  0.010000  0.000726   
1      2017-09-08 13:15:00   4.253566   3.611094  0.010000  0.000722   
2      2017-09-08 13:30:00   6.549896   3.612062  0.010000  0.000719   
3      2017-09-08 13:45:00   8.846226   3.613031  0.010000  0.000715   
4      2017-09-08 14:00:00  11.142556   3.614000  0.010000  0.000711   
...                    ...        ...        ...       ...       ...   
234184 2024-05-13 23:00:00   0.000000   8.563319  0.012611  0.000999   
234185 2024-05-13 23:15:00   0.000000   8.556447  0.012608  0.000985   
234186 2024-05-13 23:30:00   0.000000   8.549575  0.012605  0.000971   
234187 2024-05-13 23:45:00   0.000000   8.542702  0.012602  0.000956   
234188 2024-05-14 00:00:00   0.000000   8.535830  0.012599  0.000942   

               SS       NH4       NO3        OXY       FRP        ON        OP  
0        6.353990  0.005000  0.00

In [None]:
# Constant columns added to every row. OC = 6 is an assumed value: the
# observed mean of DOC, NPOC - dis, NPOC - tot (Woods POC DOC.png).
# ZEROS and ONES are all-0 and all-1 columns.
for constant_name, constant_value in (('OC', 6), ('ZEROS', 0), ('ONES', 1)):
    merged_df[constant_name] = constant_value

In [22]:
# Save the merged table next to the notebook, leaving out the row index.
output_csv = "inflow_woods_dam_20170908-20240514.csv"
merged_df.to_csv(output_csv, index=False)