In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nab/README.md
/kaggle/input/nab/realKnownCause/realKnownCause/rogue_agent_key_updown.csv
/kaggle/input/nab/realKnownCause/realKnownCause/ec2_request_latency_system_failure.csv
/kaggle/input/nab/realKnownCause/realKnownCause/ambient_temperature_system_failure.csv
/kaggle/input/nab/realKnownCause/realKnownCause/nyc_taxi.csv
/kaggle/input/nab/realKnownCause/realKnownCause/rogue_agent_key_hold.csv
/kaggle/input/nab/realKnownCause/realKnownCause/machine_temperature_system_failure.csv
/kaggle/input/nab/realKnownCause/realKnownCause/cpu_utilization_asg_misconfiguration.csv
/kaggle/input/nab/realTraffic/realTraffic/TravelTime_387.csv
/kaggle/input/nab/realTraffic/realTraffic/speed_6005.csv
/kaggle/input/nab/realTraffic/realTraffic/speed_t4013.csv
/kaggle/input/nab/realTraffic/realTraffic/occupancy_t4013.csv
/kaggle/input/nab/realTraffic/realTraffic/speed_7578.csv
/kaggle/input/nab/realTraffic/realTraffic/occupancy_6005.csv
/kaggle/input/nab/realTraffic/realTraffic/TravelTime_451.

# 1. Importing relevant libraries

In [2]:
import plotly.express as px
import altair as alt
from sklearn.ensemble import IsolationForest
import plotly.graph_objects as go

# 2. Understanding the Data

In [3]:
cloudwatch_df = pd.read_csv("/kaggle/input/nab/realAWSCloudwatch/realAWSCloudwatch/ec2_cpu_utilization_53ea38.csv")
cloudwatch_df.head()

Unnamed: 0,timestamp,value
0,2014-02-14 14:30:00,1.732
1,2014-02-14 14:35:00,1.732
2,2014-02-14 14:40:00,1.96
3,2014-02-14 14:45:00,1.732
4,2014-02-14 14:50:00,1.706


In [4]:
cloudwatch_df.shape

(4032, 2)

In [5]:
cloudwatch_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4032 entries, 0 to 4031
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   timestamp  4032 non-null   object 
 1   value      4032 non-null   float64
dtypes: float64(1), object(1)
memory usage: 63.1+ KB


In [6]:
cloudwatch_df.describe()

Unnamed: 0,value
count,4032.0
mean,1.829555
std,0.101458
min,1.604
25%,1.766
50%,1.8
75%,1.866
max,2.656


# 3. Preprocessing/ feature engineering

In [7]:
cloudwatch_df['timestamp'] = pd.to_datetime(cloudwatch_df['timestamp'])
cloudwatch_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4032 entries, 0 to 4031
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   timestamp  4032 non-null   datetime64[ns]
 1   value      4032 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 63.1 KB


In [8]:
cloudwatch_df['year'] = cloudwatch_df['timestamp'].apply(lambda x: x.year)
cloudwatch_df['month'] = cloudwatch_df['timestamp'].apply(lambda x: x.month)
cloudwatch_df['day'] = cloudwatch_df['timestamp'].apply(lambda x: x.day)
cloudwatch_df['weekday'] = cloudwatch_df['timestamp'].apply(lambda x: x.weekday())
cloudwatch_df['hour'] = cloudwatch_df['timestamp'].apply(lambda x: x.hour)

cloudwatch_df = cloudwatch_df[['timestamp', 'year', 'month', 'day', 'weekday', 'hour', 'value']]

# Weekday starts from Monday
print(f'{cloudwatch_df.timestamp[0]} with weekday {cloudwatch_df.weekday[0]} is {cloudwatch_df.timestamp[0].strftime("%A")}.\n')

cloudwatch_df.head()

2014-02-14 14:30:00 with weekday 4 is Friday.



Unnamed: 0,timestamp,year,month,day,weekday,hour,value
0,2014-02-14 14:30:00,2014,2,14,4,14,1.732
1,2014-02-14 14:35:00,2014,2,14,4,14,1.732
2,2014-02-14 14:40:00,2014,2,14,4,14,1.96
3,2014-02-14 14:45:00,2014,2,14,4,14,1.732
4,2014-02-14 14:50:00,2014,2,14,4,14,1.706


In [9]:
cloudwatch_df.describe()

Unnamed: 0,year,month,day,weekday,hour,value
count,4032.0,4032.0,4032.0,4032.0,4032.0,4032.0
mean,2014.0,2.0,21.104167,3.0,11.5,1.829555
std,0.0,0.0,4.061187,2.000248,6.923045,0.101458
min,2014.0,2.0,14.0,0.0,0.0,1.604
25%,2014.0,2.0,18.0,1.0,5.75,1.766
50%,2014.0,2.0,21.0,3.0,11.5,1.8
75%,2014.0,2.0,25.0,5.0,17.25,1.866
max,2014.0,2.0,28.0,6.0,23.0,2.656


# 4. Exploratory Data Analysis

In [10]:
fig = px.line(cloudwatch_df, x='timestamp', y='value', title='Overview of time series data')

fig.update_xaxes(rangeslider_visible=True,)
fig

In [11]:
cloudwatch_df.head()

Unnamed: 0,timestamp,year,month,day,weekday,hour,value
0,2014-02-14 14:30:00,2014,2,14,4,14,1.732
1,2014-02-14 14:35:00,2014,2,14,4,14,1.732
2,2014-02-14 14:40:00,2014,2,14,4,14,1.96
3,2014-02-14 14:45:00,2014,2,14,4,14,1.732
4,2014-02-14 14:50:00,2014,2,14,4,14,1.706


In [12]:
alt.Chart(cloudwatch_df).mark_rect().encode(alt.X('hour:O', title='hour of day'),
                                      alt.Y('weekday:O', title='weekday'),
                                      alt.Color('value:Q', title='CPU usage')).properties(
                                            width=800,
                                            height=300)

In [13]:
alt.Chart(cloudwatch_df).mark_bar().encode(x = 'weekday:O',
                                     y = 'value:Q').properties(width=600)

# 5. Unsupervised Models

## 5.1 Isolation Forests

In [14]:
x = cloudwatch_df['value'].apply(lambda x: [x]).to_list()

iso_forest = IsolationForest(n_estimators = 100, 
                        max_samples = "auto",
                        contamination = 0.01, 
                        random_state = 42)
iso_forest.fit(x)
y_pred = iso_forest.predict(x)
y_pred = [1 if x == -1 else 0 for x in y_pred]
y_pred[:10]
# Points that are 1 are outliers

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [15]:
cloudwatch_df["anomaly"] = y_pred
cloudwatch_df.head()

Unnamed: 0,timestamp,year,month,day,weekday,hour,value,anomaly
0,2014-02-14 14:30:00,2014,2,14,4,14,1.732,0
1,2014-02-14 14:35:00,2014,2,14,4,14,1.732,0
2,2014-02-14 14:40:00,2014,2,14,4,14,1.96,0
3,2014-02-14 14:45:00,2014,2,14,4,14,1.732,0
4,2014-02-14 14:50:00,2014,2,14,4,14,1.706,0


In [16]:
iso_anomaly_df = pd.DataFrame(cloudwatch_df)
iso_anomaly_df = iso_anomaly_df.loc[iso_anomaly_df['anomaly'] == 1]
iso_anomaly_df.head()

Unnamed: 0,timestamp,year,month,day,weekday,hour,value,anomaly
98,2014-02-14 22:40:00,2014,2,14,4,22,2.162,1
156,2014-02-15 03:30:00,2014,2,15,5,3,2.466,1
250,2014-02-15 11:20:00,2014,2,15,5,11,1.636,1
446,2014-02-16 03:40:00,2014,2,16,6,3,2.57,1
507,2014-02-16 08:45:00,2014,2,16,6,8,1.638,1


In [17]:
fig = px.line(cloudwatch_df, x='timestamp', y='value', title='Unsupervised anomaly detection in CPU utilization')
fig.add_trace(go.Scatter(x=iso_anomaly_df["timestamp"].to_list(), y=iso_anomaly_df["value"].to_list(), mode='markers', name='anomalies'))
fig.update_xaxes(rangeslider_visible=True)
fig

Too bad we do not have labelled data to measure the amount of anomalies we manage to capture

## 5.2 Local Outlier Factor

In [18]:
from sklearn.neighbors import LocalOutlierFactor
lof = LocalOutlierFactor(n_neighbors=2)
y_pred = lof.fit_predict(x)
y_pred = [1 if x == -1 else 0 for x in y_pred]
y_pred[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [19]:
cloudwatch_df["anomaly"] = y_pred
cloudwatch_df.head()

Unnamed: 0,timestamp,year,month,day,weekday,hour,value,anomaly
0,2014-02-14 14:30:00,2014,2,14,4,14,1.732,0
1,2014-02-14 14:35:00,2014,2,14,4,14,1.732,0
2,2014-02-14 14:40:00,2014,2,14,4,14,1.96,0
3,2014-02-14 14:45:00,2014,2,14,4,14,1.732,0
4,2014-02-14 14:50:00,2014,2,14,4,14,1.706,0


In [20]:
lof_anomaly_df = pd.DataFrame(cloudwatch_df)
lof_anomaly_df = lof_anomaly_df.loc[lof_anomaly_df['anomaly'] == 1]
lof_anomaly_df.head()

Unnamed: 0,timestamp,year,month,day,weekday,hour,value,anomaly
65,2014-02-14 19:55:00,2014,2,14,4,19,1.954,1
84,2014-02-14 21:30:00,2014,2,14,4,21,1.788,1
176,2014-02-15 05:10:00,2014,2,15,5,5,2.102,1
179,2014-02-15 05:25:00,2014,2,15,5,5,1.888,1
218,2014-02-15 08:40:00,2014,2,15,5,8,2.008,1


In [21]:
fig = px.line(cloudwatch_df, x='timestamp', y='value', title='Unsupervised anomaly detection in CPU utilization')
fig.add_trace(go.Scatter(x=lof_anomaly_df["timestamp"].to_list(), y=lof_anomaly_df["value"].to_list(), mode='markers', name='anomalies'))
fig.update_xaxes(rangeslider_visible=True)
fig

# 6. Model comparison

In [22]:
fig = px.line(cloudwatch_df, x='timestamp', y='value', title='Unsupervised anomaly detection in CPU utilization')
fig.add_trace(go.Scatter(x=lof_anomaly_df["timestamp"].to_list(), y=lof_anomaly_df["value"].to_list(), mode='markers', name='Local Outlier Factor'))
fig.add_trace(go.Scatter(x=iso_anomaly_df["timestamp"].to_list(), y=iso_anomaly_df["value"].to_list(), mode='markers', name='Isolation Forests'))
fig.update_xaxes(rangeslider_visible=True)
fig

Just based on the two visualizations, it is easy to see that Isolation Forest provides is more robust as a model for capturing anomalies. Isolation Forest also allows more flexibility by defining a contamination parameter when defining the model.