In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.offline as of

from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
def AnomalyDetection(data, skip=60, n_sigma=2):
    """Replace missing values and outliers in a 1-D series with a reference mean.

    The reference mean and standard deviation are computed from
    ``data[skip:]`` using pandas semantics (NaNs ignored, sample standard
    deviation). Every NaN, and every point lying more than ``n_sigma``
    standard deviations away from that mean, is replaced by the mean.

    Parameters
    ----------
    data : array-like
        1-D numeric sequence. The input is not modified.
    skip : int, default 60
        Number of leading samples excluded when computing the statistics.
    n_sigma : float, default 2
        Outlier threshold, in multiples of the standard deviation.

    Returns
    -------
    tuple
        ``(cleaned, mean, std)`` where ``cleaned`` is a float ``ndarray`` of
        the same length as ``data``.
    """
    tail = pd.Series(data[skip:])
    mean = tail.mean()
    std = tail.std()

    # Take an owned float copy: writing the (generally fractional) mean into an
    # integer array would silently truncate it, and the array behind a Series
    # may be read-only or shared with the caller's data.
    cleaned = pd.Series(data).fillna(mean).to_numpy(dtype=float, copy=True)

    # Comparisons against NaN are False, so nothing is replaced when the
    # statistics are undefined (e.g. fewer than `skip` + 1 samples).
    outliers = np.abs(cleaned - mean) > n_sigma * std
    cleaned[outliers] = mean
    return cleaned, mean, std

In [3]:
def clean(df_train, df_test, label):
    """Clean and standardise every non-timestamp column of a train/test pair.

    For each column:

    1. The train series is passed through ``AnomalyDetection`` (NaNs and
       outliers replaced by the series mean).
    2. Test statistics are computed only on rows where ``label['label'] == 0``
       (presumably the non-anomalous rows -- confirm against the label file);
       test NaNs are filled with that mean and the series is shifted/rescaled
       so those rows match the train mean and standard deviation.
    3. A ``StandardScaler`` is fitted on the cleaned train series and applied
       to both train and test.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames with a ``'timestamp'`` column plus numeric columns. The columns
        processed are those of ``df_train``; ``df_test`` must contain them too.
    label : pandas.DataFrame
        Frame with a ``'label'`` column, index-aligned with ``df_test``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train_clean, df_test_clean)`` with ``'timestamp'`` first followed
        by the processed columns in their original order.
    """
    feature_cols = [c for c in df_train.columns if c != 'timestamp']

    # The mask does not depend on the column, so build it once.
    normal_mask = label['label'] == 0

    # Collect columns in dicts and build each frame once; inserting columns
    # one at a time into an empty DataFrame fragments it for wide inputs.
    train_out = {'timestamp': df_train['timestamp'].values}
    test_out = {'timestamp': df_test['timestamp'].values}

    for col in feature_cols:
        data_train, mean_train, std_train = AnomalyDetection(df_train[col].values)

        normal_test = df_test.loc[normal_mask, col]
        mean_test = normal_test.mean()
        std_test = normal_test.std()
        if not std_test > 0:
            # Constant (or too short) reference segment: dividing by 0/NaN
            # would turn the whole column into inf/NaN. Use a unit scale, the
            # same convention StandardScaler applies to zero-variance data.
            std_test = 1.0

        data_test = df_test[col].fillna(mean_test).values
        data_test = (data_test - mean_test) / std_test * std_train + mean_train

        # Fit on train only, then apply the same transform to test.
        scaler = StandardScaler()
        train_out[col] = scaler.fit_transform(data_train.reshape(-1, 1)).ravel()
        test_out[col] = scaler.transform(data_test.reshape(-1, 1)).ravel()

    return pd.DataFrame(train_out), pd.DataFrame(test_out)

In [4]:
# Load the test labels and widen every labelled window by up to 5 rows on each
# side: zeros are blanked out, non-zero labels are propagated 5 rows forward and
# then 5 rows backward, and whatever is still blank becomes 0 again.
label = pd.read_csv('../dataset/processed/test/label.csv')
# NOTE: fillna('ffill', ...) would insert the literal string 'ffill' as a value
# rather than forward-filling, so the dedicated ffill()/bfill() methods are used.
label['label'] = (
    label['label']
    .replace(0, np.nan)
    .ffill(limit=5)
    .bfill(limit=5)
    .fillna(0)
)

In [5]:
# Metrics: read the raw train/test tables, clean them, and write the cleaned
# copies next to the originals.
metrics_train_raw = pd.read_csv('../dataset/processed/train/metrics/metrics.csv')
metrics_test_raw = pd.read_csv('../dataset/processed/test/metrics/metrics.csv')

# df_train / df_test are rebound to the cleaned frames; the plotting cell that
# follows reads these names.
df_train, df_test = clean(metrics_train_raw, metrics_test_raw, label)

for cleaned_frame, csv_path in (
    (df_train, '../dataset/processed/train/metrics/metrics_clean.csv'),
    (df_test, '../dataset/processed/test/metrics/metrics_clean.csv'),
):
    cleaned_frame.to_csv(csv_path, index=False)

In [6]:
# Train metrics: one line trace per column after the first (clean() puts
# 'timestamp' first), written to a standalone HTML file.
train_traces = [
    go.Scatter(x=df_train['timestamp'].values, y=df_train[name].values, mode='lines', name=name)
    for name in df_train.columns.values[1:]
]
train_fig = go.Figure(train_traces, layout=go.Layout(title='Metrics_train'))
of.plot(train_fig, filename='../dataset/processed/train/metrics/metrics_clean.html', auto_play=False, auto_open=False)

# Test metrics: same chart for the test split.
test_traces = [
    go.Scatter(x=df_test['timestamp'].values, y=df_test[name].values, mode='lines', name=name)
    for name in df_test.columns.values[1:]
]
test_fig = go.Figure(test_traces, layout=go.Layout(title='Metrics_test'))
# Kept as the last expression so the cell still echoes the written file path.
of.plot(test_fig, filename='../dataset/processed/test/metrics/metrics_clean.html', auto_play=False, auto_open=False)

'../dataset/processed/test/metrics/metrics_clean.html'

In [7]:
# Traces: read the raw train/test tables, clean them, and write the cleaned
# copies next to the originals.
traces_train_raw = pd.read_csv('../dataset/processed/train/traces/traces.csv')
traces_test_raw = pd.read_csv('../dataset/processed/test/traces/traces.csv')

# df_train / df_test are rebound to the cleaned frames; the plotting cell that
# follows reads these names.
df_train, df_test = clean(traces_train_raw, traces_test_raw, label)

for cleaned_frame, csv_path in (
    (df_train, '../dataset/processed/train/traces/traces_clean.csv'),
    (df_test, '../dataset/processed/test/traces/traces_clean.csv'),
):
    cleaned_frame.to_csv(csv_path, index=False)

In [8]:
# Train traces: one line trace per column after the first (clean() puts
# 'timestamp' first), written to a standalone HTML file.
train_traces = [
    go.Scatter(x=df_train['timestamp'].values, y=df_train[name].values, mode='lines', name=name)
    for name in df_train.columns.values[1:]
]
train_fig = go.Figure(train_traces, layout=go.Layout(title='Traces_train'))
of.plot(train_fig, filename='../dataset/processed/train/traces/traces_clean.html', auto_play=False, auto_open=False)

# Test traces: same chart for the test split.
test_traces = [
    go.Scatter(x=df_test['timestamp'].values, y=df_test[name].values, mode='lines', name=name)
    for name in df_test.columns.values[1:]
]
test_fig = go.Figure(test_traces, layout=go.Layout(title='Traces_test'))
# Kept as the last expression so the cell still echoes the written file path.
of.plot(test_fig, filename='../dataset/processed/test/traces/traces_clean.html', auto_play=False, auto_open=False)

'../dataset/processed/test/traces/traces_clean.html'