<a href="https://colab.research.google.com/github/ANIZAI/Machine-Learning-based-Anomaly-Detection/blob/main/arima_anomaly_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#imports_part_1.py
from pathlib import Path # convenient way to deal w/ paths
import plotly.graph_objects as go # creates plots
import numpy as np # standard for data processing
import pandas as pd # standard for data processing
import json # we have anomalies' timestamps in json format

In [None]:
# Colab-only: mount Google Drive; the data paths used by this notebook live under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#path_specification.py
# All inputs are read from absolute paths on the mounted Google Drive.
# The original layout based on a local NAB checkout is kept for reference only:
#nab = Path.cwd()/'NAB'
#data_path = nab/'data'
 
# JSON file w/ the timestamps of the labelled anomalies for each metrics file
labels_filepath = '/content/drive/MyDrive/combined_labels.json'
 
# Training file.
# NOTE: this exact string is also used as the key into the labels JSON below,
# so the JSON must be keyed by the same path.
training_filename = '/content/drive/MyDrive/rds_cpu_utilization_cc0c53.csv'
 
# Validation file (same remark: the string doubles as the labels key)
valid_filename = '/content/drive/MyDrive/rds_cpu_utilization_e47b3b.csv'

In [None]:
#labels_loading.py
# Parse the labels file; the resulting object is later indexed by data-file name
# to obtain that file's anomaly timestamps.
with open(labels_filepath, 'r') as f:
    anomalies_timestamps = json.load(f)


In [None]:
#read_data.py 
# Load both series; the rest of the notebook relies on their 'timestamp' and 'value' columns.
train = pd.read_csv(training_filename)
valid = pd.read_csv(valid_filename)


In [None]:
train.head()

Unnamed: 0,timestamp,value
0,2014-02-14 14:30:00,6.456
1,2014-02-14 14:35:00,5.816
2,2014-02-14 14:40:00,6.268
3,2014-02-14 14:45:00,5.816
4,2014-02-14 14:50:00,5.862


In [None]:
valid.head()

Unnamed: 0,timestamp,value
0,2014-04-10 00:02:00,14.012
1,2014-04-10 00:07:00,13.334
2,2014-04-10 00:12:00,15.0
3,2014-04-10 00:17:00,13.998
4,2014-04-10 00:22:00,14.332


In [None]:
from sklearn.preprocessing import StandardScaler

def parse_and_standardize(df: pd.DataFrame, scaler: "StandardScaler | None" = None):
    """Parse timestamps and add a standardized copy of the metric, in place.

    Converts ``df['timestamp']`` to datetimes and writes ``df['stand_value']``,
    the scaler-transformed version of ``df['value']``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with 'timestamp' and 'value' columns. It is modified in place.
    scaler : StandardScaler, optional
        An already fitted scaler to reuse. When omitted (``None``), a new
        ``StandardScaler`` is fitted on ``df['value']``.

    Returns
    -------
    The scaler that was used, so it can be passed to later calls.
    """
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    # Scalers expect a 2-D (n_samples, n_features) array
    raw_values = df['value'].to_numpy().reshape(-1, 1)
    # Test identity, not truthiness: a supplied scaler must be used even if
    # the object happens to evaluate as falsy.
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(raw_values)
    # transform() returns an (n, 1) array; flatten it to a plain column
    df['stand_value'] = scaler.transform(raw_values).ravel()
    return scaler

# Fit the scaler on the training data only ...
data_scaler = parse_and_standardize(train)
# ... and reuse it for the validation data so both sets share the same scale
parse_and_standardize(valid, data_scaler)

StandardScaler(copy=True, with_mean=True, with_std=True)

## Get anomalies from the data

In [None]:
# Keep the rows whose timestamp is listed in the labels for the corresponding file.
# The labels are looked up by the same strings that were used to read the CSVs.
train_anomalies = train[train['timestamp'].isin(anomalies_timestamps[training_filename])]
valid_anomalies = valid[valid['timestamp'].isin(anomalies_timestamps[valid_filename])]

In [None]:
train_anomalies

Unnamed: 0,timestamp,value,stand_value
3080,2014-02-25 07:15:00,25.1033,4.652449
3579,2014-02-27 00:50:00,19.165,3.026441


In [None]:
valid_anomalies

Unnamed: 0,timestamp,value,stand_value
946,2014-04-13 06:52:00,76.23,18.651805
2585,2014-04-18 23:27:00,25.11,4.654283


## Plot data with anomalies

### Training data

In [None]:
import plotly.graph_objects as go

In [None]:
# Training series: every point in blue, labelled anomalies overlaid as larger green markers
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='CPU Utilization'))
fig = go.Figure(layout=layout)
fig.add_trace(go.Scatter(x=train['timestamp'], y=train['value'], 
                         mode='markers', name='Non-anomaly',
                         marker=dict(color='blue')))
fig.add_trace(go.Scatter(x=train_anomalies['timestamp'], y=train_anomalies['value'], 
                         mode='markers', name='Anomaly',
                         marker=dict(color='green', size=13)))

### Validation data

In [None]:
# Validation series: every point in blue, labelled anomalies overlaid as larger green markers.
# Axis titles are set here as well so the figure is readable on its own.
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='CPU Utilization'))
fig = go.Figure(layout=layout)
fig.add_trace(go.Scatter(x=valid['timestamp'], y=valid['value'], 
                         mode='markers', name='Non-anomaly',
                         marker=dict(color='blue')))
fig.add_trace(go.Scatter(x=valid_anomalies['timestamp'], y=valid_anomalies['value'], 
                         mode='markers', name='Anomaly',
                         marker=dict(color='green', size=13)))

## Label anomalies and non-anomalies accordingly

In [None]:
# Default every row to "normal" (0), then flag the labelled rows (1)
train['anomaly'] = 0
train.loc[train_anomalies.index, 'anomaly'] = 1
# Display the flagged rows. train_anomalies.index holds index *labels*, so use
# label-based .loc here too; .iloc would interpret them as positions.
train.loc[train_anomalies.index]

Unnamed: 0,timestamp,value,stand_value,anomaly
3080,2014-02-25 07:15:00,25.1033,4.652449,1
3579,2014-02-27 00:50:00,19.165,3.026441,1


In [None]:
# Default every row to "normal" (0), then flag the labelled rows (1)
valid['anomaly'] = 0
valid.loc[valid_anomalies.index, 'anomaly'] = 1
# Display the flagged rows. valid_anomalies.index holds index *labels*, so use
# label-based .loc here too; .iloc would interpret them as positions.
valid.loc[valid_anomalies.index]

Unnamed: 0,timestamp,value,stand_value,anomaly
946,2014-04-13 06:52:00,76.23,18.651805,1
2585,2014-04-18 23:27:00,25.11,4.654283,1


In [None]:
train.head()

Unnamed: 0,timestamp,value,stand_value,anomaly
0,2014-02-14 14:30:00,6.456,-0.453498,0
1,2014-02-14 14:35:00,5.816,-0.628741,0
2,2014-02-14 14:40:00,6.268,-0.504976,0
3,2014-02-14 14:45:00,5.816,-0.628741,0
4,2014-02-14 14:50:00,5.862,-0.616145,0


In [None]:

import statsmodels.api as sm

import statsmodels.formula.api as smf

from itertools import product


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



In [None]:
def write_predict(train_df: pd.DataFrame, valid_df: pd.DataFrame,
                  seasonal_period: int = 12, d: int = 1, D: int = 1):
    """Select a SARIMAX model by AIC and write in-sample predictions to both frames.

    A grid of (p, q, P, Q) orders is fitted on ``train_df.value``; the fit with
    the lowest AIC wins. A second model with the same orders is then fitted on
    ``valid_df.value`` (its coefficients are re-estimated on the validation
    data). Both frames get a new 'predict' column, with missing predictions
    replaced by 0.

    Parameters
    ----------
    train_df, valid_df : pd.DataFrame
        Frames with a 'value' column. Both are modified in place.
    seasonal_period : int, default 12
        Number of samples per season passed to ``seasonal_order``.
    d, D : int, default 1
        Non-seasonal and seasonal differencing orders.

    Raises
    ------
    RuntimeError
        If no candidate model could be fitted on the training data.
    """
    # Candidate orders for the grid search
    ps = range(0, 3)
    qs = range(0, 3)
    Ps = range(0, 3)
    Qs = range(0, 2)

    # Best model selection (lowest AIC)
    best_model = None
    best_param = None
    best_aic = float("inf")
    for param in product(ps, qs, Ps, Qs):
        try:
            model = sm.tsa.statespace.SARIMAX(
                train_df.value, order=(param[0], d, param[1]),
                seasonal_order=(param[2], D, param[3], seasonal_period),
                initialization='approximate_diffuse'
                ).fit()
        except ValueError:
            # Some order combinations are rejected by the model; skip them
            print('wrong parameters:', param)
            continue
        if model.aic < best_aic:
            best_model, best_aic, best_param = model, model.aic, param

    # Fail with a clear message instead of an UnboundLocalError further down
    if best_model is None:
        raise RuntimeError(
            'No SARIMAX candidate could be fitted on the training data'
        )

    # Predictions for the training data. Assign the filled Series back instead
    # of calling fillna(inplace=True) on a column selection, which does not
    # reliably update the frame (chained assignment).
    train_df['predict'] = best_model.predict().fillna(0)

    # Predictions for the validation data: same orders, refitted on valid_df
    best_model_valid = sm.tsa.statespace.SARIMAX(
        valid_df.value, order=(best_param[0], d, best_param[1]),
        seasonal_order=(best_param[2], D, best_param[3], seasonal_period),
        initialization='approximate_diffuse'
        ).fit()
    valid_df['predict'] = best_model_valid.predict().fillna(0)
    

# Run the model selection; this adds a 'predict' column to both train and valid in place
write_predict(train, valid)


Maximum Likelihood optimization failed to converge. Check mle_retvals


Maximum Likelihood optimization failed to converge. Check mle_retvals


Maximum Likelihood optimization failed to converge. Check mle_retvals


Maximum Likelihood optimization failed to converge. Check mle_retvals


Maximum Likelihood optimization failed to converge. Check mle_retvals



In [None]:
import plotly.graph_objects as go

In [None]:
# Training data: observed values (blue) against the model's predictions (orange)
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='CPU Utilization'))
fig = go.Figure(layout=layout)
fig.add_trace(go.Scatter(x=train['timestamp'], y=train['value'], 
                        mode='markers', name='Ground Truth',
                        marker=dict(color='blue')))
fig.add_trace(go.Scatter(x=train['timestamp'], y=train['predict'], 
                        mode='markers', name='Predicted Value', 
                        marker=dict(color='orange')))
fig.show()

In [None]:
# Validation data: observed values (blue) against the model's predictions (orange)
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='CPU Utilization'))
fig = go.Figure(layout=layout)
fig.add_trace(go.Scatter(x=valid['timestamp'], y=valid['value'], 
                        mode='markers', name='Ground Truth',
                        marker=dict(color='blue')))
fig.add_trace(go.Scatter(x=valid['timestamp'], y=valid['predict'], 
                        mode='markers', name='Predicted Value', 
                        marker=dict(color='orange')))
fig.show()

In [None]:

def calculate_prediction_errors(input_data):
    """Return the absolute difference between 'value' and 'predict' per row.

    ``input_data`` must provide 'value' and 'predict' columns; the result is a
    NumPy array with one entry per row.
    """
    residuals = input_data['value'] - input_data['predict']
    return residuals.abs().to_numpy()

# Absolute prediction errors, one per row, for each data set
train_pred_errors = calculate_prediction_errors(train)
valid_pred_errors = calculate_prediction_errors(valid)

In [None]:
# Static threshold: mean plus 3 standard deviations of the training errors.
# The same value is later applied to the validation data as well.
pred_error_threshold = np.mean(train_pred_errors) + 3 * np.std(train_pred_errors)


In [None]:
# Parameters of the dynamic (rolling) threshold:
# number of consecutive samples in each rolling window
window=40
# number of rolling standard deviations added to the rolling mean
std_coef=5

In [None]:
# We use Series from pandas to calculate windowed errors
train_pred_errors_windowed = pd.Series(train_pred_errors).rolling(window=window, min_periods=1)
# Dynamic threshold for the training data
train_dynamic_threshold = train_pred_errors_windowed.mean() + std_coef * train_pred_errors_windowed.std()

valid_pred_errors_windowed = pd.Series(valid_pred_errors).rolling(window=window, min_periods=1)
# Dynamic threshold for the validation data
valid_dynamic_threshold = valid_pred_errors_windowed.mean() + std_coef * valid_pred_errors_windowed.std()


In [None]:
# We are going to use this handy function for all metrics at once 
# except confusion matrix
from sklearn.metrics import precision_recall_fscore_support

def calculate_metrics(
    ground_truth: pd.DataFrame, anomalies_idxs: list, beta: float = 2.
    ):
    """Compare detected anomalies with the labels in ``ground_truth['anomaly']``.

    Parameters
    ----------
    ground_truth : pd.DataFrame
        Frame with a 0/1 'anomaly' column (the column created by the labelling
        cells of this notebook).
    anomalies_idxs : list
        Positions of the rows that were detected as anomalies.
    beta : float, default 2.0
        Weight of recall relative to precision in the F-beta score.

    Returns
    -------
    tuple
        ``(confusion_matrix, precision, recall, f_beta)`` where
        ``confusion_matrix`` is a crosstab with margins and ``f_beta`` is the
        F-beta score (F2 with the default ``beta``).
    """
    # Share the ground truth's index so crosstab aligns row by row even when
    # the frame does not use a default 0..n-1 index.
    predictions = pd.Series(
        0, index=ground_truth.index, name='predicted_anomaly'
    )
    predictions.iloc[anomalies_idxs] = 1

    # Calculation of the confusion matrix can be done using pandas
    confusion_matrix = pd.crosstab(
        ground_truth.loc[:, 'anomaly'],
        predictions,
        margins=True
    )
    precision, recall, f_beta, _ = precision_recall_fscore_support(
        ground_truth.loc[:, 'anomaly'],
        predictions,
        beta=beta,
        average='binary'
    )
    return confusion_matrix, precision, recall, f_beta

In [None]:
def detect_anomalies(pred_error_threshold,df):
    """Flag rows whose prediction error exceeds a threshold.

    Writes a 0/1 'anomaly_predicted' column to ``df`` (in place) and returns
    the positions of the flagged rows as a list of ints.

    Parameters
    ----------
    pred_error_threshold
        Error value above which (strictly greater) a row counts as anomalous.
    df : pd.DataFrame
        Frame with 'value' and 'predict' columns.
    """
    # Calculate errors for the given data
    errors = calculate_prediction_errors(df)
    # Vectorised comparison instead of a Python-level loop over the array
    is_anomaly = errors > pred_error_threshold
    df['anomaly_predicted'] = is_anomaly.astype(np.int64)
    # Positions of the rows that passed the filter
    return np.flatnonzero(is_anomaly).tolist()



In [None]:
# Static-threshold detection on the training data.
# NOTE: a later cell redefines detect_anomalies with a different signature
# (df, errors, pred_error_thresholds); re-running this cell after that one will fail.
train_anomalies_idxs = detect_anomalies(
    pred_error_threshold, train
)
train_anomalies_idxs

[0,
 12,
 13,
 3080,
 3081,
 3085,
 3087,
 3088,
 3129,
 3158,
 3176,
 3200,
 3294,
 3296,
 3305,
 3320,
 3376,
 3419,
 3431,
 3441,
 3446,
 3462,
 3514,
 3578,
 3579,
 3580,
 3664,
 3668,
 3672,
 3681,
 3705,
 3711,
 3719,
 3733,
 3763,
 3843,
 3888,
 3893,
 3900,
 3938,
 3952,
 3971,
 3993,
 4022,
 4023]

In [None]:
# Training data: all points (blue), labelled anomalies (green) and the rows flagged
# by the static threshold (red)
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='CPU Utilization'))
fig = go.Figure(layout=layout)
fig.add_trace(go.Scatter(x=train['timestamp'], y=train['value'], 
                         mode='markers', name='Non-anomaly',
                         marker=dict(color='blue')))
fig.add_trace(go.Scatter(x=train_anomalies['timestamp'], y=train_anomalies['value'], 
                         mode='markers', name='Real Anomaly',
                         marker=dict(color='green', size=13)))
# NOTE(review): indexing a column with a list of ints selects by index label;
# this presumably matches the detected positions only on a default 0..n-1 index.
fig.add_trace(go.Scatter(x=train['timestamp'][train_anomalies_idxs],
                         y=train['value'][train_anomalies_idxs], 
                         mode='markers', name='Detected Anomaly',
                         marker=dict(color='red', size=7)))

In [None]:
# Static-threshold detection on the validation data, using the threshold
# derived from the training errors
valid_anomalies_idxs = detect_anomalies(
    pred_error_threshold, valid
)
valid_anomalies_idxs

[0,
 12,
 13,
 94,
 193,
 382,
 671,
 815,
 946,
 947,
 948,
 949,
 950,
 951,
 952,
 953,
 954,
 955,
 956,
 958,
 970,
 1006,
 1018,
 1042,
 1054,
 1066,
 1150,
 1162,
 1247,
 1480,
 1643,
 1823,
 1941,
 2062,
 2399,
 2543,
 2585,
 2586,
 2587,
 2588,
 2589,
 2590,
 2591,
 2592,
 2593,
 2594,
 2595,
 2598,
 2606,
 2608,
 2619,
 2641,
 2667,
 2670,
 2690,
 2709,
 2710,
 2742,
 2772,
 2802,
 2828,
 2845,
 2917,
 2924,
 2928,
 2930,
 2941,
 2975,
 2976,
 3001,
 3002,
 3065,
 3157,
 3169,
 3315,
 3322,
 3344,
 3413,
 3446,
 3447,
 3449,
 3480,
 3501,
 3502,
 3503,
 3551,
 3560,
 3576,
 3593,
 3594,
 3595,
 3596,
 3597,
 3598,
 3599,
 3600,
 3601,
 3602,
 3605,
 3606,
 3744,
 3745,
 3758,
 3839,
 3840]

In [None]:
# Validation data: all points (blue), labelled anomalies (green) and the rows flagged
# by the static threshold (red)
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='CPU Utilization'))
fig = go.Figure(layout=layout)
fig.add_trace(go.Scatter(x=valid['timestamp'], y=valid['value'], 
                         mode='markers', name='Non-anomaly',
                         marker=dict(color='blue')))
fig.add_trace(go.Scatter(x=valid_anomalies['timestamp'], y=valid_anomalies['value'], 
                         mode='markers', name='Real Anomaly',
                         marker=dict(color='green', size=13)))
fig.add_trace(go.Scatter(x=valid['timestamp'][valid_anomalies_idxs],
                         y=valid['value'][valid_anomalies_idxs], 
                         mode='markers', name='Detected Anomaly',
                         marker=dict(color='red', size=7)))

In [None]:
def calculate_metrics(ground_truth: pd.DataFrame, anomalies_idxs: list, beta: float = 2.):
    """Compare detected anomalies with the labels in ``ground_truth['anomaly']``.

    Parameters
    ----------
    ground_truth : pd.DataFrame
        Frame with a 0/1 'anomaly' column.
    anomalies_idxs : list
        Positions of the rows that were detected as anomalies.
    beta : float, default 2.0
        Weight of recall relative to precision in the F-beta score.

    Returns
    -------
    tuple
        ``(confusion_matrix, precision, recall, f_beta)`` where
        ``confusion_matrix`` is a crosstab with margins and ``f_beta`` is the
        F-beta score (F2 with the default ``beta``, not F1).
    """
    # Share the ground truth's index so crosstab aligns row by row even when
    # the frame does not use a default 0..n-1 index.
    predictions = pd.Series(0, index=ground_truth.index, name='predicted_anomaly')
    predictions.iloc[anomalies_idxs] = 1

    confusion_matrix = pd.crosstab(ground_truth.loc[:, 'anomaly'], predictions, margins=True)
    precision, recall, f_beta, _ = precision_recall_fscore_support(
        ground_truth.loc[:, 'anomaly'], predictions, beta=beta, average='binary'
    )
    return confusion_matrix, precision, recall, f_beta

In [None]:
# calculate_metrics returns (confusion matrix, precision, recall, F-score);
# the three scores are collected into the train_metrics list
train_conf_matrix, *train_metrics = calculate_metrics(
    train, train_anomalies_idxs
)
train_conf_matrix



predicted_anomaly,0,1,All
anomaly,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4028,2,4030
1,1,1,2
All,4029,3,4032


In [None]:
# Pretty printing of the metrics: [0] precision, [1] recall, [2] F-score (computed with beta=2)
print(f'Train:\n Precision: {train_metrics[0]:.3f}\n' 
      f'Recall: {train_metrics[1]:.3f}\n' 
      f'F2 score: {train_metrics[2]:.3f}')

Train:
 Precision: 0.333
Recall: 0.500
F2 score: 0.455


In [None]:
# Same evaluation for the validation data; the three scores go into valid_metrics
valid_conf_matrix, *valid_metrics = calculate_metrics(
    valid, valid_anomalies_idxs
)
valid_conf_matrix


predicted_anomaly,0,1,All
anomaly,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4029,1,4030
1,0,2,2
All,4029,3,4032


In [None]:
# [0] precision, [1] recall, [2] F-score (computed with beta=2)
print(f'Valid:\n Precision: {valid_metrics[0]:.3f}\n' 
      f'Recall: {valid_metrics[1]:.3f}\n' 
      f'F2 score: {valid_metrics[2]:.3f}')

Valid:
 Precision: 0.667
Recall: 1.000
F2 score: 0.909


In [None]:
def detect_anomalies(df, errors, pred_error_thresholds):
    """Flag rows whose error reaches its own, per-row threshold.

    Stores ``errors`` in ``df['error']`` and ``pred_error_thresholds`` in
    ``df['upper_bound']`` (both in place), then returns the index labels of
    the rows where ``error >= upper_bound`` as a list.

    Note: this definition replaces the earlier ``detect_anomalies`` of the
    notebook, which takes ``(pred_error_threshold, df)``.
    """
    df['error'] = errors
    df['upper_bound'] = pred_error_thresholds
    # Each row is compared against its own threshold
    exceeds_bound = df['error'] >= df['upper_bound']
    return df.index[exceeds_bound].to_numpy().tolist()

In [None]:
# Dynamic-threshold detection on the training data (per-row rolling threshold)
train_anomalies_dynamic_idxs = detect_anomalies(
    train, train_pred_errors, train_dynamic_threshold
)
train_anomalies_dynamic_idxs

[1649, 1937, 3080]

In [None]:
# Training data: all points (blue), labelled anomalies (green) and the rows flagged
# by the dynamic threshold (red)
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='CPU Utilization'))
fig = go.Figure(layout=layout)
fig.add_trace(go.Scatter(x=train['timestamp'], y=train['value'], 
                         mode='markers', name='Non-anomaly',
                         marker=dict(color='blue')))
fig.add_trace(go.Scatter(x=train_anomalies['timestamp'], y=train_anomalies['value'], 
                         mode='markers', name='Real Anomaly',
                         marker=dict(color='green', size=13)))
fig.add_trace(go.Scatter(x=train['timestamp'][train_anomalies_dynamic_idxs],
                         y=train['value'][train_anomalies_dynamic_idxs], 
                         mode='markers', name='Detected Anomaly',
                         marker=dict(color='red', size=7)))

In [None]:
# Dynamic-threshold detection on the validation data (threshold built from the validation errors)
valid_anomalies_dynamic_idxs = detect_anomalies(
    valid, valid_pred_errors, valid_dynamic_threshold
)
valid_anomalies_dynamic_idxs

[946, 2585, 3593]

In [None]:
# Validation data: all points (blue), labelled anomalies (green) and the rows flagged
# by the dynamic threshold (red)
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='CPU Utilization'))
fig = go.Figure(layout=layout)
fig.add_trace(go.Scatter(x=valid['timestamp'], y=valid['value'], 
                         mode='markers', name='Non-anomaly',
                         marker=dict(color='blue')))
fig.add_trace(go.Scatter(x=valid_anomalies['timestamp'], y=valid_anomalies['value'], 
                         mode='markers', name='Real Anomaly',
                         marker=dict(color='green', size=13)))
fig.add_trace(go.Scatter(x=valid['timestamp'][valid_anomalies_dynamic_idxs],
                         y=valid['value'][valid_anomalies_dynamic_idxs], 
                         mode='markers', name='Detected Anomaly',
                         marker=dict(color='red', size=7)))

In [None]:
# Evaluate the dynamic-threshold detections; this overwrites the static-threshold
# train_conf_matrix / train_metrics computed earlier
train_conf_matrix, *train_metrics = calculate_metrics(train, train_anomalies_dynamic_idxs)
train_conf_matrix

predicted_anomaly,0,1,All
anomaly,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4028,2,4030
1,1,1,2
All,4029,3,4032


In [None]:
# calculate_metrics computes the F-score with beta=2, so the third value is F2, not F1
print(f'Train:\n Precision: {train_metrics[0]:.3f}\n Recall: {train_metrics[1]:.3f}\n F2 score: {train_metrics[2]:.3f}')

Train:
 Precision: 0.333
 Recall: 0.500
 F1 score: 0.455


In [None]:
# Evaluate the dynamic-threshold detections; this overwrites the static-threshold
# valid_conf_matrix / valid_metrics computed earlier
valid_conf_matrix, *valid_metrics = calculate_metrics(valid, valid_anomalies_dynamic_idxs)
valid_conf_matrix

predicted_anomaly,0,1,All
anomaly,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4029,1,4030
1,0,2,2
All,4029,3,4032


In [None]:
# calculate_metrics computes the F-score with beta=2, so the third value is F2, not F1
print(f'Valid:\n Precision: {valid_metrics[0]:.3f}\n Recall: {valid_metrics[1]:.3f}\n F2 score: {valid_metrics[2]:.3f}')

Valid:
 Precision: 0.667
 Recall: 1.000
 F1 score: 0.909
