<a href="https://www.kaggle.com/code/drath10/outlier-detection-in-time-series-data-using-lstm?scriptVersionId=135059638" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import math
import numpy as np
import pandas_datareader as web
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout,RepeatVector, TimeDistributed
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import tensorflow as tf
import scipy.stats as stats
from sklearn.metrics import f1_score



In [2]:
csv_name='/kaggle/input/nab/realAdExchange/realAdExchange/exchange-4_cpc_results.csv'
df = pd.read_csv(csv_name)
df.dropna(inplace=True)

fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df['value'], name='Value'))
fig.update_layout(showlegend=True, title=csv_name)
fig.show()

anomaly_points=[
        [
            "2014-04-10 16:15:00.000000",
            "2014-04-12 01:45:00.000000"
        ]
    ]
df['ranomaly'] = 0
for start, end in anomaly_points:
  df.loc[((df['timestamp'] >= start) & (df['timestamp'] <= end)), 'ranomaly'] = 1

prec=[]
data = df.filter(['value'])
dataset = data.values
training_data_len = math.ceil(0.5 * len(dataset))




In [3]:
df.head()

Unnamed: 0,timestamp,value,ranomaly
0,2011-07-01 00:15:01,0.091795,0
1,2011-07-01 01:15:01,0.074414,0
2,2011-07-01 02:15:01,0.056984,0
3,2011-07-01 03:15:01,0.071225,0
4,2011-07-01 04:15:01,0.045466,0


In [4]:
scaler = StandardScaler() 
scaled_data = scaler.fit_transform(dataset)

train_data = scaled_data[0:training_data_len]
x_train=[]
y_train = []
Past=150
for i in range(Past,len(train_data)):
  x_train.append(train_data[i-Past:i,0])
  y_train.append(train_data[i,0])

x_train, y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0],x_train.shape[1],1))

In [5]:
test_data = scaled_data[training_data_len-Past: , : ]
x_test = []
y_test =  scaled_data[training_data_len : , : ]
for i in range(Past,len(test_data)):
  x_test.append(test_data[i-Past:i,0])

x_test = np.array(x_test)
x_test = np.reshape(x_test, (x_test.shape[0],x_test.shape[1],1))

In [6]:
model = Sequential()
model.add(LSTM(128, input_shape=(x_train.shape[1], x_train.shape[2])))
model.add(Dropout(rate=0.2))
model.add(RepeatVector(x_train.shape[1]))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(rate=0.2))
model.add(TimeDistributed(Dense(x_train.shape[2])))
model.compile(optimizer='adam', loss='mean_squared_error')
fname = "weights/"+csv_name+'.hdf5'
checkpoint = tf.keras.callbacks.ModelCheckpoint(fname, monitor="val_loss", mode="min",
save_best_only=True, verbose=0)
callbacks = [checkpoint]

In [7]:
model.fit(x_train, y_train, batch_size=64, epochs=20,validation_split=0.1,
                callbacks=callbacks, shuffle=False, verbose=0)

<keras.callbacks.History at 0x7f146c0e9ae0>

In [8]:
model.load_weights(fname)

In [9]:
x_train_pred = model.predict(x_train, verbose=0)
x_train_pred=np.reshape(x_train_pred,(x_train_pred.shape[0],x_train_pred.shape[1]))
# train_mae_loss = np.mean(np.abs(x_train_pred - x_train[:,0])/np.std(x_train_pred), axis=1)
train_mae_loss = (np.mean(np.abs(x_train_pred - x_train[:,0]), axis=1))


In [10]:
threshold = 3*np.min(train_mae_loss)
x_test_pred = model.predict(x_test, verbose=0)
x_test_pred=np.reshape(x_test_pred,(x_test_pred.shape[0],x_test_pred.shape[1]))
test_mae_loss = np.mean(np.abs(x_test_pred-x_test[:,0]), axis=1)

In [11]:
test_score_df = pd.DataFrame(df[training_data_len:])
test_score_df['loss'] = test_mae_loss
test_score_df['threshold'] = threshold
test_score_df['thresholdmax'] = threshold+0.5
test_score_df['anomaly'] = ((test_score_df['loss'] <=threshold))

for i in test_score_df.index:
    if test_score_df['anomaly'][i]:
      test_score_df['anomaly'][i-Past] = True
      test_score_df['anomaly'][i] = False

anomalies = test_score_df.loc[(test_score_df['anomaly'] == True)]
anomalies.shape
ranomal=test_score_df.loc[(test_score_df['ranomaly'] == True)]
#precision=np.count(np.where(test_score_df['ranomaly'] and test_score_df['anomaly']))/(np.count(np.where(test_score_df['ranomaly'] and test_score_df['anomaly']))+np.count(np.where(not test_score_df['ranomaly'] and test_score_df['anomaly'])))
score = f1_score(test_score_df['ranomaly'], test_score_df['anomaly'], average="binary")
prec.append(score)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [12]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=test_score_df.index, y=test_score_df['loss'], name='Test loss'))
fig.add_trace(go.Scatter(x=test_score_df.index, y=test_score_df['threshold'], name='Threshold'))
#fig.add_trace(go.Scatter(x=test_score_df.index, y=test_score_df['thresholdmax'], name='Threshold'))

fig.update_layout(showlegend=True, title='Test loss vs. Threshold')
fig.show()

In [13]:
fig_an = go.Figure()
fig_an.add_trace(go.Scatter(x=test_score_df.index, y=test_score_df['value'], name='Value'))
fig_an.add_trace(go.Scatter(x=anomalies.index, y=anomalies['value'], mode='markers', name='Anomaly'))
#fig_an.add_trace(go.Scatter(x=ranomal.index, y=ranomal['value'], mode='markers', name='Real_Anomaly'))
fig_an.update_layout(showlegend=True, title='Detected anomalies-'+csv_name)
fig_an.show()

In [14]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df['value'], name='Value'))
fig.update_layout(showlegend=True, title='')
fig.show()