In [15]:
import pandas as pd

# Read the CSV file and select only the first column
df = pd.read_csv("raw/SWIFT Natural Pickup Simulation.csv", header=None, encoding='unicode_escape')

df = df.iloc[23:,:]

# Reset the index of the DataFrame
df = df.reset_index(drop=True)

df.columns = ["Date", "Data"]

# Convert 'Data' to numeric
df['Data'] = pd.to_numeric(df['Data'], errors='coerce')
print(df)


                   Date   Data
0      18/09/2015 18:00  0.810
1      18/09/2015 19:00  0.807
2      18/09/2015 20:00  0.803
3      18/09/2015 21:00  0.799
4      18/09/2015 22:00  0.796
...                 ...    ...
76191   28/05/2024 9:00  0.274
76192  28/05/2024 10:00  0.274
76193  28/05/2024 11:00  0.274
76194  28/05/2024 12:00  0.274
76195  28/05/2024 13:00  0.274

[76196 rows x 2 columns]


In [16]:
import numpy as np

# Convert the 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y %H:%M', errors='coerce')

# Convert date to desired format
# df['Date'] = df['Date'].dt.strftime('%Y-%m-%d %H:%M:%S')

# Drop rows with NaT in 'Date' column
df = df.dropna(subset=['Date'])

df = df.sort_values(by='Date')

# Replace empty cells with NaN
df.replace("", np.nan, inplace=True)

print(df)

                     Date   Data
294   2015-01-10 00:00:00  0.564
295   2015-01-10 01:00:00  0.562
296   2015-01-10 02:00:00  0.561
297   2015-01-10 03:00:00  0.559
298   2015-01-10 04:00:00  0.557
...                   ...    ...
75817 2024-12-05 19:00:00  0.272
75818 2024-12-05 20:00:00  0.272
75819 2024-12-05 21:00:00  0.272
75820 2024-12-05 22:00:00  0.272
75821 2024-12-05 23:00:00  0.272

[76196 rows x 2 columns]


In [17]:
# Group by 'Date' and average the 'Data' values for duplicate timestamps
df = df.groupby('Date').mean().reset_index()

# Set 'Date' as index for resampling
df.set_index('Date', inplace=True)

# Resample the DataFrame to 15-minute intervals and interpolate
df = df.resample('15T').interpolate(method='linear')

# Reset the index to convert it back to a column
df = df.reset_index()

# Assuming your DataFrame is named df
df['Data'] = df['Data'].clip(lower=0)

# Resample the DataFrame to hourly intervals and interpolate
# df = df.resample('H').interpolate(method='linear')

print(df)

                      Date    Data
0      2015-01-10 00:00:00  0.5640
1      2015-01-10 00:15:00  0.5635
2      2015-01-10 00:30:00  0.5630
3      2015-01-10 00:45:00  0.5625
4      2015-01-10 01:00:00  0.5620
...                    ...     ...
347320 2024-12-05 22:00:00  0.2720
347321 2024-12-05 22:15:00  0.2720
347322 2024-12-05 22:30:00  0.2720
347323 2024-12-05 22:45:00  0.2720
347324 2024-12-05 23:00:00  0.2720

[347325 rows x 2 columns]


In [18]:
# Write the DataFrame to a CSV file
df.to_csv("Interpolated_SWIFTnatural_Inflow.csv", index=False)