In [1]:
import pandas as pd

# Load only the first CSV column. The file is read without a header row, so its
# first line arrives as an ordinary data row and is discarded just below.
df = pd.read_csv("raw/212-1-140.csv", usecols=[0], header=None, encoding='unicode_escape')

# Drop that first line, renumber the remaining rows from 0, and name the column.
df = df.iloc[1:, :].reset_index(drop=True)
df.columns = ["Raw"]

# Split every record on runs of whitespace -- once (the step used to be repeated).
split_data = df["Raw"].str.split(r'\s+', expand=True)

# Build the result as a brand-new frame instead of selecting columns from the old
# one and then assigning into that selection; this avoids SettingWithCopyWarning.
#   field 0 + ' ' + field 2 -> date and time text
#   field 3                 -> data value, converted to a number; anything that
#                              cannot be parsed becomes NaN (errors='coerce')
df = pd.DataFrame({
    'Date': split_data[0] + ' ' + split_data[2],
    'Data': pd.to_numeric(split_data[3], errors='coerce'),
})
print(df)


                     Date      Data
0     02/01/2000 00:00:00  0.000000
1     03/01/2000 00:00:00  0.000000
2     04/01/2000 00:00:00  0.000000
3     05/01/2000 00:00:00  0.000000
4     06/01/2000 00:00:00  0.000000
...                   ...       ...
8912  27/05/2024 00:00:00  0.489494
8913  28/05/2024 00:00:00  0.488203
8914  29/05/2024 00:00:00  0.486987
8915  30/05/2024 00:00:00  0.487166
8916  31/05/2024 00:00:00  0.508655

[8917 rows x 2 columns]


In [2]:
# Parse the 'Date' strings (day/month/year hour:minute:second) into timestamps and
# keep only the rows that parsed. errors='coerce' turns unparseable text into NaT,
# which dropna() then removes. assign() returns a new frame rather than writing
# into the existing one, so no SettingWithCopyWarning can be raised here.
df = (
    df.assign(Date=pd.to_datetime(df['Date'], format='%d/%m/%Y %H:%M:%S', errors='coerce'))
      .dropna(subset=['Date'])
)

print(df)

           Date      Data
0    2000-01-02  0.000000
1    2000-01-03  0.000000
2    2000-01-04  0.000000
3    2000-01-05  0.000000
4    2000-01-06  0.000000
...         ...       ...
8912 2024-05-27  0.489494
8913 2024-05-28  0.488203
8914 2024-05-29  0.486987
8915 2024-05-30  0.487166
8916 2024-05-31  0.508655

[8917 rows x 2 columns]


In [3]:
# 1. Average the 'Data' values of rows that share a timestamp. The grouped result
#    is already indexed by 'Date', which is what resample() requires, so no
#    reset_index()/set_index() round trip is needed.
# 2. Upsample to a 15-minute grid and fill the new slots by linear interpolation.
#    '15min' is used because the older '15T' alias is deprecated in recent pandas.
# 3. Floor the values at zero.
# 4. Turn the 'Date' index back into an ordinary column.
df = (
    df.groupby('Date').mean()
      .resample('15min').interpolate(method='linear')
      .assign(Data=lambda frame: frame['Data'].clip(lower=0))
      .reset_index()
)

print(df)

                      Date      Data
0      2000-01-02 00:00:00  0.000000
1      2000-01-02 00:15:00  0.000000
2      2000-01-02 00:30:00  0.000000
3      2000-01-02 00:45:00  0.000000
4      2000-01-02 01:00:00  0.000000
...                    ...       ...
855932 2024-05-30 23:00:00  0.507760
855933 2024-05-30 23:15:00  0.507983
855934 2024-05-30 23:30:00  0.508207
855935 2024-05-30 23:45:00  0.508431
855936 2024-05-31 00:00:00  0.508655

[855937 rows x 2 columns]


In [4]:
# Save the interpolated series as CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="Interpolated_212-1-140_Inflow.csv", index=False)