In [1]:
import pandas as pd

# Load only the first CSV column. Each row arrives as one whitespace-delimited
# string, so no header is parsed here (header=None).
# NOTE(review): the 'unicode_escape' codec also interprets backslash escape
# sequences in the file contents - confirm that is intended rather than a
# plain single-byte encoding such as 'latin-1'.
df = pd.read_csv("../../data-lake/HT/Hydrology/Arthurs Lake Spillway (418.1)/Inflow.csv", usecols=[0], header=None, encoding='unicode_escape')

# Discard the first row (presumably the file's header line - TODO confirm)
# and renumber the remaining rows from zero.
df = df.iloc[1:, :].reset_index(drop=True)

# Name the single column
df.columns = ["Raw"]

# Split each raw line on runs of whitespace, once. Token 0 and token 2 are
# joined into the timestamp string, token 3 is the reading; token 1 is unused.
split_data = df["Raw"].str.split(r'\s+', expand=True)

# Build the two-column result directly instead of adding columns to the raw
# frame and then sub-selecting. Values that cannot be parsed as numbers become
# NaN (errors='coerce').
df = pd.DataFrame({
    'Date': split_data[0] + ' ' + split_data[2],
    'Data': pd.to_numeric(split_data[3], errors='coerce'),
})
print(df)


                      Date       Data
0      08/09/2017 13:00:00   1.957236
1      08/09/2017 14:00:00  11.142556
2      08/09/2017 15:00:00   8.770267
3      08/09/2017 16:00:00   1.654670
4      08/09/2017 17:00:00  -0.717614
...                    ...        ...
58543  13/05/2024 20:00:00  -0.652542
58544  13/05/2024 21:00:00  -0.652593
58545  13/05/2024 22:00:00  -0.652644
58546  13/05/2024 23:00:00  -0.652694
58547  14/05/2024 00:00:00  -0.643066

[58548 rows x 2 columns]


In [2]:
# Parse the day/month/year timestamp strings. Values that do not match the
# format become NaT (errors='coerce') and those rows are removed right after.
parsed_dates = pd.to_datetime(df['Date'], format='%d/%m/%Y %H:%M:%S', errors='coerce')
df = df.assign(Date=parsed_dates).dropna(subset=['Date'])

print(df)

                     Date       Data
0     2017-09-08 13:00:00   1.957236
1     2017-09-08 14:00:00  11.142556
2     2017-09-08 15:00:00   8.770267
3     2017-09-08 16:00:00   1.654670
4     2017-09-08 17:00:00  -0.717614
...                   ...        ...
58543 2024-05-13 20:00:00  -0.652542
58544 2024-05-13 21:00:00  -0.652593
58545 2024-05-13 22:00:00  -0.652644
58546 2024-05-13 23:00:00  -0.652694
58547 2024-05-14 00:00:00  -0.643066

[58548 rows x 2 columns]


In [3]:
# Collapse duplicate timestamps to the mean of their 'Data' values. groupby
# leaves 'Date' as the (sorted) index, which is what resample operates on, so
# no reset_index/set_index round trip is needed.
df = df.groupby('Date').mean()

# Upsample to a 15-minute grid and fill the new slots by linear interpolation.
# '15min' is the supported spelling of the offset; the older '15T' alias is
# deprecated in recent pandas releases.
df = df.resample('15min').interpolate(method='linear')

# Turn the 'Date' index back into a regular column
df = df.reset_index()

# Floor negative values at zero. This runs after interpolation, so points
# interpolated between a negative and a positive reading are computed from the
# unclipped negative value.
df['Data'] = df['Data'].clip(lower=0)

print(df)

                      Date       Data
0      2017-09-08 13:00:00   1.957236
1      2017-09-08 13:15:00   4.253566
2      2017-09-08 13:30:00   6.549896
3      2017-09-08 13:45:00   8.846226
4      2017-09-08 14:00:00  11.142556
...                    ...        ...
234184 2024-05-13 23:00:00   0.000000
234185 2024-05-13 23:15:00   0.000000
234186 2024-05-13 23:30:00   0.000000
234187 2024-05-13 23:45:00   0.000000
234188 2024-05-14 00:00:00   0.000000

[234189 rows x 2 columns]


In [4]:
# Save the frame as CSV at a path relative to the working directory; the row
# index is left out of the file.
df.to_csv(path_or_buf="Interpolated_418.1_ArthursInflow.csv", index=False)