### Import the dataset (from 24th April 2022 to 17th May 2022)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
# Default size (width, height) for figures created after this point.
plt.rcParams['figure.figsize'] = (13, 7)
import pandas as pd
import numpy as np

In [None]:
# Load the labelled water-level records and parse the 'time' column into
# pandas datetimes so it can be used on a date axis.
df = pd.read_csv('curved_labelled_data.csv')
df['time'] = pd.to_datetime(df['time'])

# 'Unnamed: 0' is the name pandas assigns to a header-less column
# (presumably a row index written by an earlier export -- confirm).
df = df.rename(columns={'Unnamed: 0': 'x_column'})

### Plotting the dataset 

In [None]:
# Scatter plot of water level against time, coloured by `data_label`,
# written to 'sample_photo_curved.png' and then shown.
plt.rcParams['figure.figsize'] = (13, 7)
fig, ax = plt.subplots(1, 1)
fig.patch.set_facecolor('white')
ax.set(facecolor="white")

ax.scatter(df['time'], df['height'], c=df['data_label'], cmap='rainbow')
ax.grid(color='gray', linestyle='--', linewidth=0.8)
ax.set_title('Outliers and the main trend from 24th April to 17th May 2022', fontsize=18, weight='bold')
ax.set_xlabel('Time', fontsize=18, weight='bold')
ax.set_ylabel('Water level (m)', fontsize=18, weight='bold')

# Pass real timestamps as the limits instead of one-element lists of
# strings, so the date axis converts them directly.
ax.set_xlim(pd.Timestamp('2022-04-24 00:00:00'), pd.Timestamp('2022-05-17 23:59:59'))
ax.set_ylim(0, 1.9)

## Define the date format (day-month) for the x tick labels
date_form = DateFormatter("%d-%m")
ax.xaxis.set_major_formatter(date_form)

ax.tick_params(axis='y', labelsize=16)
ax.tick_params(axis='x', which='major', labelsize=16)
# Make the x tick labels bold by styling the existing label objects.
# Calling set_xticklabels(df['time'], ...) for this would replace the label
# text with the first rows of the column and install a fixed formatter.
plt.setp(ax.get_xticklabels(), fontweight='bold')

plt.savefig('sample_photo_curved.png', dpi=450, orientation='portrait', bbox_inches='tight', facecolor='w',edgecolor='b',)
plt.show()

### ARIMA
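- Auto-Regression (AR) — each observation is regressed on its own lagged (i.e., prior) values.
- Integrated (I) — data values are replaced by the difference between consecutive values (differencing), which helps make the series stationary.
- Moving Average (MA) — each observation is modelled as depending on the forecast errors (residuals) of lagged observations.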

In [None]:
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import datetime as datetime
# ARIMA(p, d, q) order used by the model below:
#   p -- number of autoregressive lags
#   d -- order of differencing
#   q -- order of the moving-average term
p, d, q = 5, 1, 0

# Column vectors of shape (n_samples, 1): the series to forecast and the
# label attached to each record.
water_level = df['height'].to_numpy(copy=True).reshape(-1, 1)
target_clusters = df['data_label'].to_numpy(copy=True).reshape(-1, 1)

### Train-test split 

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20 % of the series for testing. shuffle=False keeps the rows in
# their original (chronological) order, so the test set is the tail.
train, test = train_test_split(
    water_level,
    shuffle=False,
    test_size=0.20,
)

# `predictions` collects one forecast per test row; `history` starts as the
# training rows (plain Python lists) and grows during walk-forward fitting.
predictions = []
history = train.tolist()

### Water level data fitting

In [None]:
# Walk-forward evaluation: for every test observation, refit ARIMA(p, d, q)
# on everything seen so far, forecast one step ahead, then reveal the true
# value to the model's history before moving on.
import warnings

# NOTE: this silences *all* warnings for the rest of the session, not only
# the convergence warnings raised while fitting.
warnings.filterwarnings('ignore')

# NOTE(review): `history` and `predictions` are initialised in another cell;
# re-running only this cell keeps appending to them.
for observed in test:
    model = sm.tsa.arima.ARIMA(history, order=(p, d, q))
    fit = model.fit()
    pred = fit.forecast()[0]  # forecast() defaults to a single step ahead

    predictions.append(pred)
    # Store the row as a plain list so `history` stays a homogeneous
    # list of lists instead of mixing in numpy arrays.
    history.append(observed.tolist())

print(f'MSE: {mean_squared_error(test, predictions):.3f}')

In [None]:
# Overlay the held-out observations (default colour) and the one-step-ahead
# forecasts (red) on the current axes.
forecast_axes = plt.gca()
forecast_axes.plot(test)
forecast_axes.plot(predictions, color='red')
plt.show()

### Performance Metrics 

In [None]:
from sklearn.metrics import accuracy_score, f1_score,precision_score, recall_score, confusion_matrix, roc_curve, auc, ConfusionMatrixDisplay
# Confusion matrix of true vs. predicted labels, drawn with a blue colour
# map and saved to 'con1.png'.
# NOTE(review): y_test and y_pred are not defined in the cells visible here;
# confirm where they are created before running this cell.
c_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=c_matrix)
disp.plot(values_format='g', cmap=plt.cm.Blues)

# Replace the default axis captions with bold ones.
plt.xlabel('Predicted label', weight='bold')
plt.ylabel('True Label', weight='bold')

plt.savefig('con1.png', dpi=450, orientation='portrait', bbox_inches='tight', facecolor='w',edgecolor='b',)
plt.show()

In [None]:
# Accuracy of y_pred against y_test; the bare name displays the value.
# NOTE(review): y_test and y_pred are not defined in the cells visible here.
data_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
data_accuracy

In [None]:
# Precision of y_pred against y_test; the bare name displays the value.
# NOTE(review): y_test and y_pred are not defined in the cells visible here.
data_precision = precision_score(y_true=y_test, y_pred=y_pred)
data_precision

In [None]:
# Recall of y_pred against y_test; the bare name displays the value.
# NOTE(review): y_test and y_pred are not defined in the cells visible here.
data_recall_score = recall_score(y_true=y_test, y_pred=y_pred)
data_recall_score

In [None]:
# F1 score of y_pred against y_test; the bare name displays the value.
# NOTE(review): y_test and y_pred are not defined in the cells visible here.
data_f1_score = f1_score(y_true=y_test, y_pred=y_pred)
data_f1_score