In [58]:
import pandas as pd

def process_data(filepath):
    df = pd.read_csv(filepath, usecols=[0], header=None, encoding='unicode_escape')
    df = df.iloc[1:,:]

    # Reset the index of the DataFrame
    df = df.reset_index(drop=True)

    # Name the first column
    df.columns = ["Raw"]
    
    # Split the 'Raw' column by multiple spaces
    split_data = df["Raw"].str.split(r'\s+', expand=True)
    
    # Extract Date and Time
    df['Date'] = split_data[0] + ' ' + split_data[2]
    
    # Extract Data value
    df['Data'] = split_data[3]
    
    # Drop the original 'Raw' column
    df = df[['Date', 'Data']]
    
    # Convert 'Data' to numeric
    df['Data'] = pd.to_numeric(df['Data'], errors='coerce')

    df['Date'] = df['Date'].apply(pd.to_datetime, format='%d/%m/%Y %H:%M:%S', errors='coerce')
    
    # Drop rows with NaT in 'Date' column
    df.dropna(subset='Date', inplace=True)

    return df

flow_df = process_data("../../data-lake/HT/Hydrology/Arthurs Lake Spillway (418.1)/Inflow.csv")
#print(flow_df)

temp_df = process_data("../../data-lake/HT/Hydrology/Arthurs Lake Spillway (418.1)/WQ at Morass Bay (418.24)/Continuous/Water Temp.csv")
#print(temp_df)

sal_df = process_data("../../data-lake/HT/Hydrology/Arthurs Lake Spillway (418.1)/WQ at Morass Bay (418.24)/Continuous/salinity.csv")
#print(sal_df)

chloa_df = process_data("../../data-lake/HT/Hydrology/Arthurs Lake Spillway (418.1)/WQ at Morass Bay (418.24)/Continuous/chloro_a.csv")
#print(chloa_df)

In [63]:
import pandas as pd

def process_processed_data(filepath):
    df = pd.read_csv(filepath, encoding='unicode_escape')

    # Reset the index of the DataFrame
    df = df.reset_index(drop=True)

    df = df.loc[:,["Date","Data"]]

    df["Date"] = df["Date"].apply(pd.to_datetime, format='%d/%m/%Y %H:%M', errors='coerce')

    df = df.sort_values(by='Date')

    df.dropna(subset='Date', inplace=True)

    return df

ss_df = process_processed_data("../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_TotalSuspendedSolids_profile_Data.csv")

nh4_df = process_processed_data("../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_Ammonium_profile_Data.csv")

no3_df = process_processed_data("../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_Nitrate_profile_Data.csv")

oxy_df = process_processed_data("../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_DissolvedOxygen_profile_Data.csv")

# frp_df = process_processed_data("../../data-warehouse/csv/ht/alwq/ArthursLakeSpillway_FilteredReactivePhosphorus_profile_Data.csv")
# print(frp_df)

                  Date   Data
45 2015-05-15 12:20:00  0.004
44 2015-07-15 11:35:00  0.003
43 2015-09-10 10:32:00  0.005
42 2015-11-11 10:30:00  0.005
41 2016-01-28 10:35:00  0.003
40 2016-03-31 15:40:00  0.004
39 2016-05-26 12:30:00  0.004
38 2016-07-21 12:20:00  0.003
35 2016-09-14 12:40:00  0.003
36 2016-11-24 13:00:00  0.003
31 2017-01-27 15:15:00  0.003
37 2017-02-22 12:40:00  0.004
34 2017-04-28 09:20:00  0.003
33 2017-06-21 16:30:00  0.003
32 2017-07-25 17:00:00  0.003
30 2017-09-11 16:40:00  0.003
29 2017-10-18 10:15:00  0.003
28 2017-11-23 17:05:00  0.003
27 2018-01-11 18:10:00  0.004
26 2018-03-22 15:10:00  0.003
25 2018-05-08 14:20:00  0.003
24 2018-07-26 14:30:00  0.003
12 2018-09-26 12:30:00  0.003
20 2018-11-28 10:45:00  0.008
23 2019-01-09 16:35:00  0.006
21 2019-02-13 17:25:00  0.007
19 2019-03-12 16:20:00  0.003
18 2019-04-16 14:04:00  0.006
22 2019-05-22 09:30:00  0.003
17 2019-06-20 12:00:00  0.003
16 2019-07-23 17:00:00  0.003
15 2019-08-27 14:50:00  0.004
14 2019-09

In [60]:
def process_dataframe(df, clip_data=True):
    # Group by 'Date' and average the 'Data' values for duplicate timestamps
    df = df.groupby('Date').mean().reset_index()
    
    # Set 'Date' as index for resampling
    df.set_index('Date', inplace=True)
    
    # Resample the DataFrame to 15-minute intervals and interpolate
    df = df.resample('15T').interpolate(method='linear')

    # Forward fill and backward fill to handle NaNs at the boundaries
    df = df.ffill().bfill()
    
    # Reset the index to convert it back to a column
    df = df.reset_index()
    
    # Apply clipping only if specified and if the DataFrame is flow_df
    if clip_data:
        df['Data'] = df['Data'].clip(lower=0)
    
    return df

# Assuming flow_df and temp_df are already defined DataFrames
flow_df_processed = process_dataframe(flow_df.copy())
temp_df_processed = process_dataframe(temp_df.copy(), clip_data=False)
sal_df_processed = process_dataframe(sal_df.copy())
chloa_df_processed = process_dataframe(chloa_df.copy())

ss_df_processed = process_dataframe(ss_df.copy())
nh4_df_processed = process_dataframe(nh4_df.copy())
no3_df_processed = process_dataframe(no3_df.copy())
oxy_df_processed = process_dataframe(oxy_df.copy())

print(oxy_df_processed)

                      Date  Data
0      2015-05-15 12:15:00  9.46
1      2015-05-15 12:30:00  9.46
2      2015-05-15 12:45:00  9.46
3      2015-05-15 13:00:00  9.46
4      2015-05-15 13:15:00  9.46
...                    ...   ...
214077 2021-06-22 11:30:00  9.33
214078 2021-06-22 11:45:00  9.33
214079 2021-06-22 12:00:00  9.33
214080 2021-06-22 12:15:00  9.33
214081 2021-06-22 12:30:00  9.33

[214082 rows x 2 columns]


In [61]:
# Convert Date columns to datetime format
flow_df_processed['Date'] = pd.to_datetime(flow_df_processed['Date'])
temp_df_processed['Date'] = pd.to_datetime(temp_df_processed['Date'])
sal_df_processed['Date'] = pd.to_datetime(sal_df_processed['Date'])
chloa_df_processed['Date'] = pd.to_datetime(chloa_df_processed['Date'])

# Merge based on Date
merged_df = pd.merge(flow_df_processed, temp_df_processed, on='Date', suffixes=('_flow', '_temp'), how='inner')

# Merge the result with sal_df_processed based on Date, and rename the 'Data' column to 'Data_sal'
merged_df = pd.merge(merged_df, sal_df_processed.rename(columns={'Data': 'Data_sal'}), on='Date', how='inner')

merged_df = pd.merge(merged_df, chloa_df_processed.rename(columns={'Data': 'CHLA'}), on='Date', how='inner')

# Display the merged DataFrame with required columns
print(merged_df)

                      Date  Data_flow  Data_temp  Data_sal      CHLA
0      2017-09-08 13:00:00   1.957236   3.610125  0.010000  0.726250
1      2017-09-08 13:15:00   4.253566   3.611094  0.010000  0.722500
2      2017-09-08 13:30:00   6.549896   3.612062  0.010000  0.718750
3      2017-09-08 13:45:00   8.846226   3.613031  0.010000  0.715000
4      2017-09-08 14:00:00  11.142556   3.614000  0.010000  0.711250
...                    ...        ...        ...       ...       ...
234184 2024-05-13 23:00:00   0.000000   8.563319  0.012611  0.998928
234185 2024-05-13 23:15:00   0.000000   8.556447  0.012608  0.984716
234186 2024-05-13 23:30:00   0.000000   8.549575  0.012605  0.970504
234187 2024-05-13 23:45:00   0.000000   8.542702  0.012602  0.956292
234188 2024-05-14 00:00:00   0.000000   8.535830  0.012599  0.942080

[234189 rows x 5 columns]


In [62]:
ss_df_processed['Date'] = pd.to_datetime(ss_df_processed['Date'])
nh4_df_processed['Date'] = pd.to_datetime(nh4_df_processed['Date'])
no3_df_processed['Date'] = pd.to_datetime(no3_df_processed['Date'])
oxy_df_processed['Date'] = pd.to_datetime(oxy_df_processed['Date'])

ss_mean = ss_df['Data'].mean()
nh4_mean = nh4_df['Data'].mean()
no3_mean = no3_df['Data'].mean()
oxy_mean = oxy_df['Data'].mean()
print(oxy_mean)

merged_df = pd.merge(merged_df, ss_df_processed.rename(columns={'Data': 'SS'}), on='Date', how='left')
merged_df = pd.merge(merged_df, nh4_df_processed.rename(columns={'Data': 'NH4'}), on='Date', how='left')
merged_df = pd.merge(merged_df, no3_df_processed.rename(columns={'Data': 'NO3'}), on='Date', how='left')
merged_df = pd.merge(merged_df, oxy_df_processed.rename(columns={'Data': 'OXY'}), on='Date', how='left')

merged_df =merged_df.fillna({'SS': ss_mean, 'NH4': nh4_mean, 'NO3': no3_mean, 'OXY': oxy_mean})
print(merged_df)

10.558709677419355
                      Date  Data_flow  Data_temp  Data_sal      CHLA  \
0      2017-09-08 13:00:00   1.957236   3.610125  0.010000  0.726250   
1      2017-09-08 13:15:00   4.253566   3.611094  0.010000  0.722500   
2      2017-09-08 13:30:00   6.549896   3.612062  0.010000  0.718750   
3      2017-09-08 13:45:00   8.846226   3.613031  0.010000  0.715000   
4      2017-09-08 14:00:00  11.142556   3.614000  0.010000  0.711250   
...                    ...        ...        ...       ...       ...   
234184 2024-05-13 23:00:00   0.000000   8.563319  0.012611  0.998928   
234185 2024-05-13 23:15:00   0.000000   8.556447  0.012608  0.984716   
234186 2024-05-13 23:30:00   0.000000   8.549575  0.012605  0.970504   
234187 2024-05-13 23:45:00   0.000000   8.542702  0.012602  0.956292   
234188 2024-05-14 00:00:00   0.000000   8.535830  0.012599  0.942080   

               SS       NH4       NO3        OXY  
0        6.353990  0.005000  0.002000  11.142027  
1        6.353

In [None]:
# Observed mean of DOC, NPOC - dis, NPOC - tot (Woods POC DOC.png), assumed
merged_df['OC'] = 6

merged_df['ZEROS'] = 0
merged_df['ONES'] = 1

In [22]:
# Write the DataFrame to a CSV file
merged_df.to_csv("inflow_woods_dam_20170908-20240514.csv", index=False)