In [1]:
import pandas as pd

# Location of the raw spillway discharge export
SPILLWAY_CSV = "../../data-lake/HT/Hydrology/Arthurs Lake Spillway (418.1)/SpillwayDischarge.csv"

# Read only the first column, without treating any row as a header
raw = pd.read_csv(SPILLWAY_CSV, usecols=[0], header=None, encoding='unicode_escape')

# Discard the first row and renumber the remaining rows from 0
raw_lines = raw.iloc[1:, 0].reset_index(drop=True)

# Split each line on runs of whitespace (done once); every token gets its own column
split_data = raw_lines.str.split(r'\s+', expand=True)

# Assemble the result directly from the tokens:
#   'Date' - token 0 and token 2 joined by a single space
#   'Data' - token 3 converted to a number (values that cannot be parsed become NaN)
df = pd.DataFrame({
    'Date': split_data[0] + ' ' + split_data[2],
    'Data': pd.to_numeric(split_data[3], errors='coerce'),
})
print(df)


                      Date  Data
0      01/01/2015 01:00:00   0.0
1      01/01/2015 02:00:00   0.0
2      01/01/2015 03:00:00   0.0
3      01/01/2015 04:00:00   0.0
4      01/01/2015 05:00:00   0.0
...                    ...   ...
82099  13/05/2024 20:00:00   0.0
82100  13/05/2024 21:00:00   0.0
82101  13/05/2024 22:00:00   0.0
82102  13/05/2024 23:00:00   0.0
82103  14/05/2024 00:00:00   0.0

[82104 rows x 2 columns]


In [2]:
# Parse the 'Date' strings as day/month/year hour:minute:second.
# Strings that do not match the format are coerced to NaT.
parsed_dates = pd.to_datetime(df['Date'], format='%d/%m/%Y %H:%M:%S', errors='coerce')

# Swap in the parsed column and discard every row whose 'Date' is NaT
df = df.assign(Date=parsed_dates).dropna(subset=['Date'])

print(df)

                     Date  Data
0     2015-01-01 01:00:00   0.0
1     2015-01-01 02:00:00   0.0
2     2015-01-01 03:00:00   0.0
3     2015-01-01 04:00:00   0.0
4     2015-01-01 05:00:00   0.0
...                   ...   ...
82099 2024-05-13 20:00:00   0.0
82100 2024-05-13 21:00:00   0.0
82101 2024-05-13 22:00:00   0.0
82102 2024-05-13 23:00:00   0.0
82103 2024-05-14 00:00:00   0.0

[82104 rows x 2 columns]


In [3]:
# Spacing of the output time grid. 'min' is the supported spelling of the
# minute alias; the older 'T' spelling is deprecated in recent pandas releases.
# Change this value to resample at a different interval.
RESAMPLE_FREQ = '15min'

# Average the 'Data' values of rows that share a timestamp. The grouped result
# is already indexed by 'Date', which is what resampling requires, so no
# reset_index/set_index round-trip is needed.
df = df.groupby('Date').mean()

# Resample onto the regular grid and fill the new slots by linear interpolation
df = df.resample(RESAMPLE_FREQ).interpolate(method='linear')

# Move 'Date' from the index back to an ordinary column
df = df.reset_index()

print(df)

                      Date  Data
0      2015-01-01 01:00:00   0.0
1      2015-01-01 01:15:00   0.0
2      2015-01-01 01:30:00   0.0
3      2015-01-01 01:45:00   0.0
4      2015-01-01 02:00:00   0.0
...                    ...   ...
328408 2024-05-13 23:00:00   0.0
328409 2024-05-13 23:15:00   0.0
328410 2024-05-13 23:30:00   0.0
328411 2024-05-13 23:45:00   0.0
328412 2024-05-14 00:00:00   0.0

[328413 rows x 2 columns]


In [4]:
# Save the frame as CSV in the working directory, leaving out the index column
output_csv = "Interpolated_418.1_ArthursSpillway.csv"
df.to_csv(output_csv, index=False)