In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import ast
import warnings
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from itertools import groupby
from datetime import datetime, timedelta
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Suppress every warning emitted from this point on (applies to all libraries).
warnings.filterwarnings("ignore")
# Disable pandas' chained-assignment (SettingWithCopy) warning.
# NOTE(review): later cells assign through chained indexing; with this warning
# off, such assignments fail silently if pandas does not write through.
pd.options.mode.chained_assignment = None  # default='warn'
%matplotlib inline

## Converting JSON Data into CSV

In [3]:
full_data_json = pd.read_json('response1.json')  # Output data returned by the GET API

In [4]:
# Collect every record stored in the "data" column, in row order.
# .tolist() is purely positional, so it does not depend on the frame having
# a default 0..n-1 index the way label lookups with a loop counter do.
res_list = full_data_json["data"].tolist()

In [5]:
# Keep only the fields needed to build the CPU-usage series.
json_full = pd.DataFrame.from_dict(res_list)[['date', 'hour', 'cpu_used']]

# Observation timestamp = calendar date shifted by the 'hour' column (in hours).
hour_offset = pd.to_timedelta(json_full['hour'], unit='h')
json_full['created_at'] = pd.to_datetime(json_full['date']) + hour_offset

# Rescale cpu_used by 1/100 (presumably percent -> fraction; confirm against the API).
json_full['cpu_used'] = json_full['cpu_used'].div(100)

# Two-column frame used by the rest of the notebook: timestamp first, value last.
full_data = json_full.loc[:, ['created_at', 'cpu_used']]

### Train-Test Split

In [6]:
# Order-preserving 80/20 split (shuffle=False keeps the rows in their original order).
features = full_data.iloc[:, :-1]  # every column except the last
target = full_data.iloc[:, -1]     # last column
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, shuffle=False
)

## Modeling: ARIMA Univariate Time Series Forecasting

### ARIMA Model Hyperparameter Tuning

In [8]:
# Evaluate an ARIMA model for a given order (p,d,q)
def evaluate_arima_model(df_train_y, df_test_y, arima_order):
    """Fit one ARIMA order on the training series and score it on the test series.

    Parameters
    ----------
    df_train_y : series the ARIMA model is fitted on.
    df_test_y : held-out series; its length is the number of steps forecast.
    arima_order : (p, d, q) tuple passed to ARIMA as ``order``.

    Returns
    -------
    (rmse, mae) : out-of-sample root mean squared error and mean absolute
        error of the forecast against ``df_test_y``.
    """
    # Forecast exactly as many steps as there are held-out observations.
    horizon = len(df_test_y)
    fitted = ARIMA(df_train_y, order=arima_order).fit()
    predicted = fitted.forecast(horizon)

    # Out-of-sample errors.
    rmse = mean_squared_error(df_test_y, predicted) ** 0.5
    mae = mean_absolute_error(df_test_y, predicted)
    return rmse, mae
 
# Evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(df_train_y, df_test_y, p_values, d_values, q_values):
    """Grid-search ARIMA orders and return the one with the lowest RMSE.

    Every (p, d, q) combination from the three value lists is scored with
    ``evaluate_arima_model``. A line is printed per successfully scored order,
    and a line is printed for every order that could not be fitted.

    Parameters
    ----------
    df_train_y, df_test_y : series forwarded to ``evaluate_arima_model``.
    p_values, d_values, q_values : iterables of candidate values.

    Returns
    -------
    The best (p, d, q) tuple, or None when no combination could be scored.
    """
    best_score, best_cfg, best_mae = float("inf"), None, float("inf")
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p,d,q)
                try:
                    rmse, mae = evaluate_arima_model(df_train_y, df_test_y, order)
                except Exception as exc:
                    # Best-effort search: skip orders that fail, but report
                    # them. Catching Exception (not a bare except) lets
                    # KeyboardInterrupt stop this long-running loop.
                    print('ARIMA%s skipped: %s' % (order, exc))
                    continue
                if rmse < best_score:
                    best_score, best_cfg, best_mae = rmse, order, mae
                print('ARIMA%s RMSE=%.7f MAE=%.7f' % (order,rmse,mae))
    if best_cfg is not None:
        print('Best ARIMA%s RMSE=%.7f MAE=%.7f' % (best_cfg, best_score, best_mae))
    return best_cfg
    

In [9]:
# WARNING: slow cell — one ARIMA model is fitted per (p, d, q) combination.
# Evaluate parameters

p_values = [1, 2, 3, 4]
d_values = [0, 1]
q_values = [1, 2, 3, 4]

best_order = evaluate_models(y_train, y_test, p_values, d_values, q_values)
# evaluate_models returns None when no candidate could be scored; stop here
# with a clear message instead of an opaque tuple-unpacking TypeError.
if best_order is None:
    raise RuntimeError("No ARIMA order could be fitted; check y_train/y_test and the parameter grid.")
p, d, q = best_order

ARIMA(1, 0, 1) RMSE=0.0010363 MAE=0.0009478
ARIMA(1, 0, 2) RMSE=0.0010172 MAE=0.0009194
ARIMA(1, 0, 3) RMSE=0.0009979 MAE=0.0007824
ARIMA(1, 0, 4) RMSE=0.0010923 MAE=0.0010166
ARIMA(1, 1, 1) RMSE=0.0009585 MAE=0.0007975
ARIMA(1, 1, 2) RMSE=0.0009706 MAE=0.0008469
ARIMA(1, 1, 3) RMSE=0.0009654 MAE=0.0007928
ARIMA(1, 1, 4) RMSE=0.0009077 MAE=0.0007126
ARIMA(2, 0, 1) RMSE=0.0011053 MAE=0.0010119
ARIMA(2, 0, 2) RMSE=0.0012490 MAE=0.0011525
ARIMA(2, 0, 3) RMSE=0.0008586 MAE=0.0007955
ARIMA(2, 0, 4) RMSE=0.0010473 MAE=0.0009767
ARIMA(2, 1, 1) RMSE=0.0009965 MAE=0.0008620
ARIMA(2, 1, 2) RMSE=0.0009734 MAE=0.0008572
ARIMA(2, 1, 3) RMSE=0.0009738 MAE=0.0008163
ARIMA(2, 1, 4) RMSE=0.0009818 MAE=0.0007930
ARIMA(3, 0, 1) RMSE=0.0011258 MAE=0.0010360
ARIMA(3, 0, 2) RMSE=0.0010825 MAE=0.0010047
ARIMA(3, 0, 3) RMSE=0.0010081 MAE=0.0009340
ARIMA(3, 0, 4) RMSE=0.0010469 MAE=0.0009547
ARIMA(3, 1, 1) RMSE=0.0009603 MAE=0.0007205
ARIMA(3, 1, 2) RMSE=0.0009692 MAE=0.0007690
ARIMA(3, 1, 3) RMSE=0.0009649 MA

#### Forecasting Future Values of CPU Usage

In [10]:
# Refit on the complete cpu_used series with the (p, d, q) chosen above.
final_order = (p, d, q)
model = ARIMA(full_data['cpu_used'], order=final_order)
model_fit = model.fit()

In [11]:
# Set the number of days to be forecasted
forecasted_days = 1

# Spacing between consecutive forecast timestamps
interval = timedelta(minutes=60)

# Starting point: the last timestamp in the data (created_at is the first column)
latest_timestamp = full_data['created_at'].iloc[-1]

# Number of forecast steps, derived from the interval so the two cannot
# drift apart when the interval is changed
num_times = int(forecasted_days * timedelta(days=1) / interval)

# One timestamp per forecast step, starting one interval after the last observation
timestamps = [latest_timestamp + step * interval for step in range(1, num_times + 1)]

In [12]:
# In-sample predictions of the existing data with the trained model
fitting = model_fit.predict(start=0, end=len(full_data)-1)

# Forecast one value per generated timestamp, so both columns of the table
# below always have the same length
forecast_values = model_fit.forecast(len(timestamps))

# Forecast table: one row per future timestamp (probably the data consumers need)
forecasts = pd.DataFrame({'Timestamp': timestamps, 'Forecasts': forecast_values})

In [None]:
# Serialize the forecast table as a JSON array with one object per row.
# NOTE(review): no date_format is passed, so pandas' default date encoding is
# applied to the 'Timestamp' column — confirm consumers expect that format.
forecasts_json=forecasts.to_json(orient='records')

## Anomaly Detection

In [14]:
# New frame = full_data plus the in-sample predictions and their residuals
# (Error = observed cpu_used minus fitted value).
df_dummy = full_data.assign(
    fittings=fitting,
    Error=lambda frame: frame['cpu_used'] - frame['fittings'],
)

### Dynamic Thresholding + Consecutive Occurrences

In [15]:
data_copy = df_dummy.copy()
std_coef = 1.5  # Number of rolling standard deviations away from the rolling mean
window = 6  # Rolling window length, in rows
consecutive = 3  # Minimum number of consecutive outliers to count as an anomaly

# Rolling statistics of the residuals, computed once and reused for both bands.
rolling_error = data_copy['Error'].rolling(window=window)
data_copy['mean'] = rolling_error.mean()
data_copy['std'] = rolling_error.std()
data_copy['up_thres'] = data_copy['mean'] + std_coef * data_copy['std']
data_copy['down_thres'] = data_copy['mean'] - std_coef * data_copy['std']

# A row is an outlier when its residual leaves the [down_thres, up_thres] band.
# Rows whose thresholds are NaN (incomplete window) compare False.
outlier_mask = (data_copy['Error'] > data_copy['up_thres']) | (data_copy['Error'] < data_copy['down_thres'])
out_index = data_copy.index[outlier_mask]

# Assign the mask directly: a chained data_copy[col][idx] = True write is
# silently dropped when pandas runs with Copy-on-Write.
data_copy['outliers_bool'] = outlier_mask

In [16]:
# Walk the outlier flags run by run (a run = consecutive equal values) and
# clear every run containing fewer than `consecutive` True flags.
filtered_flags = []
for _, run_members in groupby(data_copy['outliers_bool']==True):
    run = list(run_members)
    if run.count(True) < consecutive:
        run = [False] * len(run)
    filtered_flags.extend(run)

# Positional index (0..n-1) of the rows that remain flagged.
greater_th = pd.DataFrame({"Outliers": filtered_flags})
updated_out_index = greater_th.index[greater_th["Outliers"]==True]

In [17]:
# Explicit copy: new columns are added to an independent frame, not to a
# slice of data_copy.
data_anomaly_labeled = data_copy[['created_at', 'cpu_used']].copy()

# updated_out_index holds row positions; turn them into a boolean column.
# Building the array first avoids a chained ['is_anomaly'].iloc[...] write,
# which is silently dropped when pandas runs with Copy-on-Write.
is_anomaly = np.zeros(len(data_anomaly_labeled), dtype=bool)
is_anomaly[np.asarray(updated_out_index, dtype=int)] = True
data_anomaly_labeled['is_anomaly'] = is_anomaly

# Render cpu_used as a string: value * 100, rounded to 4 decimals, with "%".
data_anomaly_labeled['cpu_used'] = (data_anomaly_labeled['cpu_used']*100).round(4).astype(str) + "%"

In [None]:
# Save the anomaly-labelled data to 'anomaly_detection.csv' without the
# DataFrame index column.
data_anomaly_labeled.to_csv('anomaly_detection.csv', index=False)

In [None]:
# JSON string of the anomaly-labelled data: an array with one object per row.
data_anomaly_labeled_json = data_anomaly_labeled.to_json(orient="records")