[Reference](https://towardsdatascience.com/time-series-anomaly-detection-with-pycaret-706a6e2b2427)

In [1]:
# install slim version (default)
!pip install pycaret

Collecting pycaret
[?25l  Downloading https://files.pythonhosted.org/packages/da/99/18f151991b0f06107af9723417c64e304ae2133587f85ea734a90136b4ae/pycaret-2.3.1-py3-none-any.whl (261kB)
[K     |████████████████████████████████| 266kB 5.2MB/s 
[?25hCollecting pyod
[?25l  Downloading https://files.pythonhosted.org/packages/81/dd/1cef53031ad8926df628e78d7cc9d57bb2aee62bc2f66215b2bef4deae64/pyod-0.8.9.tar.gz (104kB)
[K     |████████████████████████████████| 112kB 40.7MB/s 
Collecting umap-learn
[?25l  Downloading https://files.pythonhosted.org/packages/75/69/85e7f950bb75792ad5d666d86c5f3e62eedbb942848e7e3126513af9999c/umap-learn-0.5.1.tar.gz (80kB)
[K     |████████████████████████████████| 81kB 9.6MB/s 
[?25hCollecting imbalanced-learn==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/c8/81/8db4d87b03b998fda7c6f835d807c9ae4e3b141f978597b8d7f31600be15/imbalanced_learn-0.7.0-py3-none-any.whl (167kB)
[K     |████████████████████████████████| 174kB 36.1MB/s 
Collecting 

In [2]:
import pandas as pd

# NYC taxi trip counts from the Numenta Anomaly Benchmark (NAB) repository.
NYC_TAXI_URL = 'https://raw.githubusercontent.com/numenta/NAB/master/data/realKnownCause/nyc_taxi.csv'

# Parse the `timestamp` column as datetimes while reading the file.
data = pd.read_csv(NYC_TAXI_URL, parse_dates=['timestamp'])
data.head()

Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844
1,2014-07-01 00:30:00,8127
2,2014-07-01 01:00:00,6210
3,2014-07-01 01:30:00,4656
4,2014-07-01 02:00:00,3820


In [3]:
# Preview the first five rows again (same view as the previous cell).
data.head(n=5)

Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844
1,2014-07-01 00:30:00,8127
2,2014-07-01 01:00:00,6210
3,2014-07-01 01:30:00,4656
4,2014-07-01 02:00:00,3820


In [4]:
!pip install plotly --upgrade

Collecting plotly
[?25l  Downloading https://files.pythonhosted.org/packages/95/8d/ac1560f7ccc2ace85cd1e9619bbec1975b5d2d92e6c6fdbbdaa994c6ab4d/plotly-5.1.0-py2.py3-none-any.whl (20.6MB)
[K     |████████████████████████████████| 20.6MB 56.9MB/s 
[?25hCollecting tenacity>=6.2.0
  Downloading https://files.pythonhosted.org/packages/41/ee/d6eddff86161c6a3a1753af4a66b06cbc508d3b77ca4698cd0374cd66531/tenacity-7.0.0-py2.py3-none-any.whl
Installing collected packages: tenacity, plotly
  Found existing installation: plotly 4.4.1
    Uninstalling plotly-4.4.1:
      Successfully uninstalled plotly-4.4.1
Successfully installed plotly-5.1.0 tenacity-7.0.0


In [5]:
import plotly.express as px

# The series is sampled every 30 minutes, so a 48-row window spans one day
# and a 336-row window spans one week.
for window in (48, 336):
    data[f'MA{window}'] = data['value'].rolling(window).mean()

# Plot the raw series together with both moving averages.
fig = px.line(
    data,
    x="timestamp",
    y=['value', 'MA48', 'MA336'],
    title='NYC Taxi Trips',
    template='plotly_dark',
)
fig.show()

  defaults = yaml.load(f)


In [6]:
# Drop the moving-average columns (only needed for the plot above), use the
# timestamp as the index, and aggregate the half-hourly counts into hourly
# totals. Done as one chain instead of several `inplace=True` mutations.
data = (
    data.drop(columns=['MA48', 'MA336'])
        .set_index('timestamp')
        .resample('H')
        .sum()
)

# create features from date
data['day'] = [i.day for i in data.index]
data['day_name'] = [i.day_name() for i in data.index]
data['day_of_year'] = [i.dayofyear for i in data.index]
# `Timestamp.weekofyear` is deprecated (removed in pandas 2.0); the ISO
# calendar week from `isocalendar()` is the same value.
data['week_of_year'] = [i.isocalendar()[1] for i in data.index]
data['hour'] = [i.hour for i in data.index]
# NOTE: despite its name, this column holds the ISO weekday number
# (1 = Monday ... 7 = Sunday), not a boolean weekday/weekend flag.
data['is_weekday'] = [i.isoweekday() for i in data.index]
data.head()

Unnamed: 0_level_0,value,day,day_name,day_of_year,week_of_year,hour,is_weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-07-01 00:00:00,18971,1,Tuesday,182,27,0,2
2014-07-01 01:00:00,10866,1,Tuesday,182,27,1,2
2014-07-01 02:00:00,6693,1,Tuesday,182,27,2,2
2014-07-01 03:00:00,4433,1,Tuesday,182,27,3,2
2014-07-01 04:00:00,4379,1,Tuesday,182,27,4,2


In [7]:
# init setup
# Import only the PyCaret anomaly functions this notebook uses rather than
# `import *`, so it is clear where each name comes from.
from pycaret.anomaly import assign_model, create_model, models, setup

# session_id fixes the random seed so the experiment is reproducible.
s = setup(data, session_id=123)

Unnamed: 0,Description,Value
0,session_id,123
1,Original Data,"(5160, 7)"
2,Missing Values,False
3,Numeric Features,5
4,Categorical Features,2
5,Ordinal Features,False
6,High Cardinality Features,False
7,High Cardinality Method,
8,Transformed Data,"(5160, 19)"
9,CPU Jobs,-1


In [8]:
# List the anomaly-detection models PyCaret can train (ID, name, reference).
available_models = models()
available_models

Unnamed: 0_level_0,Name,Reference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
abod,Angle-base Outlier Detection,pyod.models.abod.ABOD
cluster,Clustering-Based Local Outlier,pyod.models.cblof.CBLOF
cof,Connectivity-Based Local Outlier,pyod.models.cof.COF
iforest,Isolation Forest,pyod.models.iforest.IForest
histogram,Histogram-based Outlier Detection,pyod.models.hbos.HBOS
knn,K-Nearest Neighbors Detector,pyod.models.knn.KNN
lof,Local Outlier Factor,pyod.models.lof.LOF
svm,One-class SVM detector,pyod.models.ocsvm.OCSVM
pca,Principal Component Analysis,pyod.models.pca.PCA
mcd,Minimum Covariance Determinant,pyod.models.mcd.MCD


In [9]:
# Train an Isolation Forest. `fraction` is forwarded to create_model; per the
# PyCaret docs it is the expected proportion of outliers in the data.
outlier_fraction = 0.1
iforest = create_model('iforest', fraction=outlier_fraction)

# Attach the model's `Anomaly` flag and `Anomaly_Score` to the data.
iforest_results = assign_model(iforest)
iforest_results.head()

Unnamed: 0_level_0,value,day,day_name,day_of_year,week_of_year,hour,is_weekday,Anomaly,Anomaly_Score
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-07-01 00:00:00,18971,1,Tuesday,182,27,0,2,0,-0.01545
2014-07-01 01:00:00,10866,1,Tuesday,182,27,1,2,0,-0.006367
2014-07-01 02:00:00,6693,1,Tuesday,182,27,2,2,0,-0.010988
2014-07-01 03:00:00,4433,1,Tuesday,182,27,3,2,0,-0.017091
2014-07-01 04:00:00,4379,1,Tuesday,182,27,4,2,0,-0.017006


In [10]:
# Show the first rows that the model flagged as anomalous (Anomaly == 1).
is_anomaly = iforest_results['Anomaly'].eq(1)
iforest_results.loc[is_anomaly].head()

Unnamed: 0_level_0,value,day,day_name,day_of_year,week_of_year,hour,is_weekday,Anomaly,Anomaly_Score
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-07-13,50825,13,Sunday,194,28,0,7,1,0.002663
2014-07-27,50407,27,Sunday,208,30,0,7,1,0.009264
2014-08-03,48081,3,Sunday,215,31,0,7,1,0.003045
2014-09-28,53589,28,Sunday,271,39,0,7,1,0.00444
2014-10-05,48472,5,Sunday,278,40,0,7,1,0.000325


In [11]:
import plotly.express as px
import plotly.graph_objects as go

# plot value on y-axis and date on x-axis
fig = px.line(
    iforest_results,
    x=iforest_results.index,
    y="value",
    title='NYC TAXI TRIPS - UNSUPERVISED ANOMALY DETECTION',
    template='plotly_dark',
)

# Select the flagged rows once and read both the dates and the values from
# that selection, instead of one `.loc[i]['value']` lookup per anomaly.
anomalies = iforest_results[iforest_results['Anomaly'] == 1]
outlier_dates = anomalies.index
y_values = anomalies['value'].tolist()

# Overlay the anomalies as red markers on top of the line chart.
fig.add_trace(
    go.Scatter(
        x=outlier_dates,
        y=y_values,
        mode='markers',
        name='Anomaly',
        marker=dict(color='red', size=10),
    )
)

fig.show()