In [33]:
import pandas as pd

# Load the raw export as headerless text, skip the first 42 rows
# (presumably the export's metadata preamble -- TODO confirm), keep the
# first two columns, and name them "Date" and "Data".
df = (
    pd.read_csv(
        "../../data-lake/HT/Hydrology/Woods Lake Derived Inflows/Hydro Tasmania_WOODS LAKE AT DAM_Inflow Net (m³.s)_exported 2024-07-24T143653.csv",
        header=None,
        encoding='utf-8',
        dtype=str,
    )
    .iloc[42:, :2]
    # Renumber the remaining rows from zero
    .reset_index(drop=True)
    .set_axis(["Date", "Data"], axis=1)
    # Discard rows where either the timestamp or the value is missing
    .dropna()
    # Values were read as strings; anything non-numeric becomes NaN
    .assign(Data=lambda frame: pd.to_numeric(frame['Data'], errors='coerce'))
)
print(df)

                       Date   Data
0       2007-01-01 05:00:00 -2.181
1       2007-01-01 06:00:00  0.116
2       2007-01-01 07:00:00 -0.277
3       2007-01-01 08:00:00 -0.656
4       2007-01-01 09:00:00  1.640
...                     ...    ...
152402  2024-05-21 07:00:00 -1.002
152403  2024-05-21 08:00:00 -0.273
152404  2024-05-21 09:00:00  0.780
152405  2024-05-21 10:00:00  0.782
152406  2024-05-21 11:00:00  0.379

[152390 rows x 2 columns]


In [30]:
# Report the dtype of every column as a sanity check on the load step
column_dtypes = df.dtypes
print(column_dtypes)

Date     object
Data    float64
dtype: object


In [34]:
import numpy as np

# Parse the timestamp strings; values that do not match the expected
# layout are turned into NaT instead of raising
parsed_dates = pd.to_datetime(df['Date'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
df['Date'] = parsed_dates

# Remove rows whose timestamp could not be parsed, then order chronologically
df = df.dropna(subset=['Date']).sort_values(by='Date')

# Normalise any empty-string cells to NaN
df = df.replace("", np.nan)

print(df)

                      Date   Data
0      2007-01-01 05:00:00 -2.181
1      2007-01-01 06:00:00  0.116
2      2007-01-01 07:00:00 -0.277
3      2007-01-01 08:00:00 -0.656
4      2007-01-01 09:00:00  1.640
...                    ...    ...
152402 2024-05-21 07:00:00 -1.002
152403 2024-05-21 08:00:00 -0.273
152404 2024-05-21 09:00:00  0.780
152405 2024-05-21 10:00:00  0.782
152406 2024-05-21 11:00:00  0.379

[152390 rows x 2 columns]


In [35]:
# Upsample the series to a regular 15-minute grid.
#
# 1. groupby('Date').mean() averages the 'Data' values of rows that share a
#    timestamp and leaves 'Date' as the (sorted) index, which is what
#    resample() requires -- no reset_index()/set_index() round-trip needed.
# 2. resample('15min') builds the 15-minute grid. The 'min' alias is used
#    because the older 'T' alias is deprecated in recent pandas releases.
# 3. interpolate(method='linear') fills the new grid points linearly between
#    the surrounding observations.
#    NOTE(review): interpolation operates on the resampled grid, so any
#    observation not on a 15-minute boundary would not contribute -- the
#    printed sample is on the hour, but confirm for the full data set.
# 4. reset_index() turns 'Date' back into an ordinary column.
#
# Negative values are kept as-is; no clipping is applied.
df = (
    df.groupby('Date')
    .mean()
    .resample('15min')
    .interpolate(method='linear')
    .reset_index()
)

print(df)

                      Date     Data
0      2007-01-01 05:00:00 -2.18100
1      2007-01-01 05:15:00 -1.60675
2      2007-01-01 05:30:00 -1.03250
3      2007-01-01 05:45:00 -0.45825
4      2007-01-01 06:00:00  0.11600
...                    ...      ...
609620 2024-05-21 10:00:00  0.78200
609621 2024-05-21 10:15:00  0.68125
609622 2024-05-21 10:30:00  0.58050
609623 2024-05-21 10:45:00  0.47975
609624 2024-05-21 11:00:00  0.37900

[609625 rows x 2 columns]


In [36]:
# Persist the interpolated series as CSV, omitting the positional row index
output_csv = "Interpolated_storageNet_Inflow.csv"
df.to_csv(output_csv, index=False)