In [None]:
import pandas as pd
import os
import pytz
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import itertools

from datetime import datetime, timedelta
from dateutil import rrule
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from scipy import stats



In [None]:
# Convert every raw sensor CSV in `folder_path` (ammonium, nitrate, oxygen A/B,
# phosphate) into a single-column time series CSV.
#
# For each input file the measurement start time becomes the index and only the
# `hstWaarde` column is kept. Nothing is returned: one CSV is written per input.

# Change the relative path accordingly; it drives both the listing and the reads.
folder_path = "../data/important/"
# Only CSV inputs are converted: the same folder also holds parquet data, which
# `read_csv` cannot parse. Sorting makes the processing order deterministic.
files = sorted(
    name for name in os.listdir(folder_path) if name.lower().endswith(".csv")
)

for file_name in files:
    df = pd.read_csv(os.path.join(folder_path, file_name))

    # Index by measurement start; the index name is cleared so the written CSV
    # has an unnamed first column.
    df.index = pd.to_datetime(df['datumBeginMeting'])
    df.index.name = None
    df = df.drop(columns=['datumEindeMeting', 'datumBeginMeting'])

    df_to_series = df["hstWaarde"]
    # Output name = text before the first '.' of the tag number + original file name.
    filename = df['historianTagnummer'].iloc[0].split('.')[0] + f'_{file_name}'

    # NOTE(review): this writes into the current working directory, while later
    # cells read from ../data/converted/ -- confirm the files are moved there.
    df_to_series.to_csv(filename, header=True)

In [None]:
# Put every converted series on a regular one-minute grid and store the result
# under ../data/resampled/ using the same file name. Nothing is returned.
folder_path = "../data/converted/"
output_path = "../data/resampled/"
files = os.listdir(folder_path)

# `to_csv` does not create missing directories, so make sure the target exists.
os.makedirs(output_path, exist_ok=True)

for file_name in files:
    df = pd.read_csv(os.path.join(folder_path, file_name), parse_dates=True, index_col=0)

    df.index = pd.to_datetime(df.index)

    # 'min' is the supported minute alias; the older 'T' spelling is deprecated
    # in recent pandas releases.
    minutely_index = pd.date_range(start=df.index.min(), end=df.index.max(), freq='min')
    new_df = pd.DataFrame(index=minutely_index)

    # The left join keeps only readings whose timestamp lies exactly on the grid;
    # remaining grid points are filled by time-weighted linear interpolation.
    # NOTE(review): off-grid readings are dropped by the join -- confirm intended.
    merged_df = new_df.join(df, how='left').interpolate(method='time')

    # Repeated timestamps in the source produce repeated rows after the join;
    # average them so each minute appears once.
    merged_df = merged_df.groupby(merged_df.index).mean()

    merged_df.to_csv(f"{output_path}{file_name}")

In [None]:
# Load the converted ammonium series, interpret its naive ISO timestamps as
# Europe/Amsterdam wall-clock time, then express the index in UTC.
amsterdam_tz = pytz.timezone('Europe/Amsterdam')

df = pd.read_csv("../data/converted/EDE_B121069913_K600_ammonium.csv", index_col=0)

localized_stamps = []
for raw_stamp in df.index:
    naive_stamp = datetime.fromisoformat(raw_stamp)
    localized_stamps.append(amsterdam_tz.localize(naive_stamp))

df.index = localized_stamps
df.index = df.index.tz_convert('UTC')

In [None]:
# Downsample every minute-level series to 15-minute means, plot each one, and
# combine them into one wide table (one column per input file) saved as
# ../data/merged.csv.
folder_path = "../data/resampled/"
files = os.listdir(folder_path)

# Collect the per-file frames and join them once at the end instead of growing
# a DataFrame inside the loop.
frames = []

for file_name in files:
    df = pd.read_csv(os.path.join(folder_path, file_name), index_col=0)
    df.index = pd.to_datetime(df.index)
    df = df.resample('15min').mean()
    # Name the value column after its source file so columns stay distinguishable.
    df = df.rename(columns={"hstWaarde": file_name})
    df.plot()
    frames.append(df)

# axis=1 concat is an outer join on the timestamp index. The list is reversed so
# the column order stays "last file first", as the merge-in-loop produced.
merged = pd.concat(frames[::-1], axis=1) if frames else pd.DataFrame()

merged.to_csv("../data/merged.csv")

In [None]:
# Pairwise correlation between the merged series, rendered as a heatmap.
df = pd.read_csv("../data/merged.csv", index_col=0)

df_corr = df.corr()

heatmap = go.Heatmap(
    x=df_corr.columns,
    y=df_corr.index,
    z=df_corr.to_numpy(),
)
fig = go.Figure(data=[heatmap])
# Leave the figure as the cell's last expression so the notebook renders it.
fig


In [None]:
# For every converted series, print the timestamps that follow a gap of more
# than 60 seconds since the previous reading.
folder_path = "../data/converted/"
files = os.listdir(folder_path)

for file_name in files:
    print(file_name)

    df = pd.read_csv(os.path.join(folder_path, file_name), index_col=0)
    df.index = pd.to_datetime(df.index)

    # Seconds elapsed since the previous row; the first row has no predecessor
    # (NaN) and therefore never counts as a gap.
    seconds_since_previous = df.index.to_series().diff().dt.total_seconds()
    follows_gap = seconds_since_previous > 60

    missing_timestamps = df.index[follows_gap].tolist()
    print(missing_timestamps)

In [None]:
# Load the total-volume export (semicolon separated, day-first timestamps),
# clean the first value column and plot its 3-day mean.
volume_raw = pd.read_csv("../data/total_volume.csv", delimiter=";", index_col=0)
volume_raw.index = pd.to_datetime(volume_raw.index, format="%d-%m-%Y %H:%M")

df = (
    volume_raw.iloc[:, 0]
    .replace('(null)', np.nan)   # the export marks missing values as '(null)'
    .str.replace(',', '.')       # decimal comma -> decimal point
    .astype(float)
    .resample('3D')
    .mean()
)
df.plot()

# TODO: zoom into a single day with a 15min resample
# TODO: residual / trend decomposition
# TODO: identify the daily maximum after a decrease post discharge
# TODO: lags, autoregressors

In [None]:
# Dry matter content: load the measurements and parse the timestamp columns.
df = pd.read_parquet('../data/important/dry_matter_content.parquet')

# Parse each timestamp column from its own values, so the measurement end times
# are not replaced by the start times.
for column in ("datumBeginMeting", "datumEindeMeting"):
    # Guarded because the schema of this parquet file is not visible here.
    if column in df.columns:
        df[column] = pd.to_datetime(df[column])



In [None]:
# Load each tank-1 measurement file as a float series indexed by measurement
# start time, keyed by measurement name.
list_of_files = ["ammonium", "nitrate", "oxygen_a", "oxygen_b", "phosphate"]

# NOTE(review): `all` shadows the Python builtin; the name is kept because the
# following cell iterates over it.
all = {}

for f in list_of_files:
    raw_frame = pd.read_parquet(f'../data/tank1_new/{f}.parquet')

    raw_frame.index = pd.to_datetime(raw_frame['datumBeginMeting'])
    raw_frame.index.name = None

    without_timestamps = raw_frame.drop(columns=['datumEindeMeting', 'datumBeginMeting'])
    df = without_timestamps["hstWaarde"].astype(float)

    all[f] = df

In [None]:
# Grid-search ARIMA(p, d, q) orders for every loaded series and report the order
# with the lowest AIC.

# The candidate grid is the same for every series, so it is built once.
p_values = range(0, 5)  # AR order
d_values = range(0, 2)  # Differencing order
q_values = range(0, 5)  # MA order
pdq_combinations = list(itertools.product(p_values, d_values, q_values))

for key, df in all.items():

    # result = seasonal_decompose(df, model='additive', period=4 * 30 * 24 * 60)
    # result.plot()
    # plt.show()

    # Label the output so the results can be attributed to a series.
    print(f"Series: {key}")

    # Fit ARIMA models with different orders and select the best model
    best_aic = float("inf")
    best_order = None
    for order in pdq_combinations:
        try:
            model = ARIMA(df, order=order)
            results = model.fit()
            aic = results.aic
        except Exception as exc:
            # Catch only Exception: a bare `except:` would also swallow
            # KeyboardInterrupt, making this long search impossible to stop.
            print(f"Oops: ARIMA{order} failed for {key}: {exc}")
            continue

        # A NaN AIC compares False here and is therefore never selected.
        if aic < best_aic:
            best_aic = aic
            best_order = order

    print(f"Best AIC: {best_aic}")
    print(f"Best Order (p, d, q): {best_order}")


In [None]:
# Display whatever `df` is currently bound to; when the cells run top to bottom
# this is the last series iterated by the ARIMA search loop above.
df