In [218]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import GRU
from keras.layers import Dense
from matplotlib import pyplot
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
import os

def concat(dfs, concat_axis):
    return pd.concat(dfs, axis=concat_axis)

def repeatdata(start, times, df):
    return df.loc[df.index.repeat(times)].reset_index(drop=True)

In [219]:
shift = 4

# Params
data_directory = 'data'
dataset_files = ['cpu_parameters', 'disk_io_parameters', 'jvm_parameters', 'memory_parameters', 'network_io_parameters']
param_to_drop = []
model_type = 'gru'
model_epoches = 200
model_batch_size = 200

cols = ['@timestamp','system.cpu.user.pct', 'system.cpu.system.pct',
       'system.cpu.idle.pct', 'system.cpu.iowait.pct',
       'system.cpu.softirq.pct', 'system.cpu.total.pct',
       'system.memory.used.pct',
       'system.network.in.bytes', 'system.network.in.packets',
       'system.network.in.dropped', 'system.network.out.bytes',
       'system.network.out.packets', 'system.network.out.errors',
       'system.diskio.iostat.await',
       'system.diskio.iostat.queue.avg_size',
       'system.diskio.iostat.read.per_sec.bytes',
       'system.diskio.iostat.write.per_sec.bytes',
       'jvm.metrics.memory.heap_memory_usage.committed',
       'jvm.metrics.memory.heap_memory_usage.max',
       'jvm.metrics.memory.heap_memory_usage.used',
       'jvm.metrics.threading.thread_count',
       'jvm.metrics.gc.psms.collection_count',
       'jvm.metrics.gc.psms.collection_time',
       'jvm.metrics.gc.pss.collection_count',
       'jvm.metrics.gc.pss.collection_time', 'label']

In [220]:
cpu = pd.read_csv(data_directory+'/cpu_parameters.csv', header=0)
memory = pd.read_csv(data_directory+'/memory_parameters.csv', header=0).drop(["@timestamp", "label"], axis=1)
network = pd.read_csv(data_directory+'/network_io_parameters.csv', header=0).drop(["@timestamp", "label"], axis=1)
diskio = repeatdata(0, 6, pd.read_csv(data_directory+'/disk_io_parameters.csv', header=0)).drop(["@timestamp", "label"], axis=1)
jvm = repeatdata(0, 6, pd.read_csv(data_directory+'/jvm_parameters.csv', header=0)).drop(["@timestamp", "label"], axis=1)

dataset = pd.concat([cpu, memory, network, diskio, jvm], 1)
dataset = dataset[cols]
dataset.head()

Unnamed: 0,@timestamp,system.cpu.user.pct,system.cpu.system.pct,system.cpu.idle.pct,system.cpu.iowait.pct,system.cpu.softirq.pct,system.cpu.total.pct,system.memory.used.pct,system.network.in.bytes,system.network.in.packets,...,system.diskio.iostat.write.per_sec.bytes,jvm.metrics.memory.heap_memory_usage.committed,jvm.metrics.memory.heap_memory_usage.max,jvm.metrics.memory.heap_memory_usage.used,jvm.metrics.threading.thread_count,jvm.metrics.gc.psms.collection_count,jvm.metrics.gc.psms.collection_time,jvm.metrics.gc.pss.collection_count,jvm.metrics.gc.pss.collection_time,label
0,2018-01-01 00:00:00,0.711,0.0675,3.1702,0.0302,0.0211,0.8298,0.8429,3000000.0,13038.0,...,50516.0,1959264000.0,1959264000.0,536266920.0,961.0,0,0,2,125,0
1,2018-01-01 00:00:10,0.6695,0.0633,3.241,0.0101,0.0161,0.759,0.8471,3000000.0,13178.0,...,50516.0,1959264000.0,1959264000.0,536266920.0,961.0,0,0,2,125,0
2,2018-01-01 00:00:20,0.5851,0.0755,3.3162,0.002,0.0211,0.6838,0.8489,4000000.0,12945.0,...,50516.0,1959264000.0,1959264000.0,536266920.0,961.0,0,0,2,125,0
3,2018-01-01 00:00:30,0.535,0.0836,3.3552,0.005,0.0212,0.6448,0.8547,3000000.0,13850.0,...,50516.0,1959264000.0,1959264000.0,536266920.0,961.0,0,0,2,125,0
4,2018-01-01 00:00:40,0.6239,0.0584,3.2926,0.004,0.0211,0.7074,0.8558,3000000.0,12838.0,...,50516.0,1959264000.0,1959264000.0,536266920.0,961.0,0,0,2,125,0


## Set Multiple Model Parameters

In [221]:
model_epoches = 200
model_batch_size = 200

## Shifting the dataset
shifting only one minute

In [222]:
shift_param = shift*(-6)
dataset_shifted = dataset.shift(shift_param)
dataset = dataset.iloc[:shift_param]
dataset_shifted = dataset_shifted.iloc[:shift_param]

## Spliting the dataset 
Ratio - 80 : 20

In [223]:
train = dataset.shape[0] * 2 // 10
    
train_dataset = dataset.iloc[:-train]
train_dataset_shifted = dataset_shifted.iloc[:-train]

test_dataset = dataset.iloc[-train:]
test_dataset_shifted = dataset_shifted.iloc[-train:]

In [224]:
test_dataset.to_csv(data_directory+'/test_dataset.csv')
test_dataset.drop(["@timestamp", "label"], axis=1, inplace=True)
train_dataset.drop(["@timestamp", "label"], axis=1, inplace=True)
train_dataset_shifted.drop(["@timestamp", "label"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


## Normalizing Data

In [225]:
train_dataset[train_dataset.columns] = StandardScaler().fit_transform(train_dataset)
train_dataset_shifted[train_dataset_shifted.columns] = StandardScaler().fit_transform(train_dataset_shifted)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [226]:
train_dataset = train_dataset.values.reshape((train_dataset.shape[0], 1, train_dataset.shape[1]))
test_dataset_shaped = test_dataset.values.reshape((test_dataset.shape[0], 1, test_dataset.shape[1]))
print(train_dataset_shifted.shape)

(57572, 25)


## Training Models

### Feature Set 1
- system.cpu.iowait.pct'
- system.diskio_sda.iostat.await'
- system.diskio_sda.iostat.busy'
- system.diskio_sda.iostat.queue.avg_size'
- system.diskio_sda.iostat.read.request.merges_per_sec'
- system.diskio_sda.iostat.read.request.per_sec'
- system.diskio_sda.iostat.request.avg_size'
- system.diskio_sda.iostat.service_time'
- system.diskio_sda.iostat.write.request.merges_per_sec'
- system.diskio_sda.iostat.write.request.per_sec'
- system.diskio_sda2.iostat.await'
- system.diskio_sda2.iostat.busy'
- system.diskio_sda2.iostat.queue.avg_size'
- system.diskio_sda2.iostat.read.request.merges_per_sec'
- system.diskio_sda2.iostat.read.request.per_sec'
- system.diskio_sda2.iostat.request.avg_size'
- system.diskio_sda2.iostat.service_time'
- system.diskio_sda2.iostat.write.request.merges_per_sec'
- system.diskio_sda2.iostat.write.request.per_sec'

In [227]:
train_dataset_shifted_1 = train_dataset_shifted[[
    'system.cpu.user.pct',
    'system.cpu.system.pct',
    'system.cpu.idle.pct',
    'system.cpu.iowait.pct',
    'system.cpu.softirq.pct',
    'system.cpu.total.pct'
]]
train_dataset_shifted_1.head()

Unnamed: 0,system.cpu.user.pct,system.cpu.system.pct,system.cpu.idle.pct,system.cpu.iowait.pct,system.cpu.softirq.pct,system.cpu.total.pct
0,-0.215992,0.134323,0.210056,-0.236885,-0.733666,-0.210056
1,-0.219927,-0.168956,0.246287,-0.121152,-0.70595,-0.246287
2,-0.223487,-0.130272,0.247211,-0.236885,-0.456512,-0.247211
3,-0.169522,-0.034337,0.184177,-0.291995,-0.456512,-0.184177
4,-0.257777,-0.266438,0.302667,-0.291995,-1.010819,-0.302667


In [228]:
model_lstm_1 = Sequential()
model_lstm_1.add(GRU(10, input_shape=(1, train_dataset_shifted.shape[1])))
model_lstm_1.add(Dense(6))
model_lstm_1.compile(loss='mse', optimizer='adam')

# history = model_lstm_1.fit(train_dataset, train_dataset_shifted_1, verbose=2, shuffle=False, epochs=model_epoches, validation_data=(test_dataset_shaped ,test_dataset_shifted_1), batch_size= 200)
history = model_lstm_1.fit(train_dataset, train_dataset_shifted_1, verbose=2, shuffle=False, epochs=model_epoches)

Epoch 1/200
 - 11s - loss: 0.7694
Epoch 2/200
 - 8s - loss: 0.5082
Epoch 3/200
 - 7s - loss: 0.4011
Epoch 4/200
 - 7s - loss: 0.3602
Epoch 5/200
 - 7s - loss: 0.3434
Epoch 6/200
 - 8s - loss: 0.3361
Epoch 7/200
 - 9s - loss: 0.3331
Epoch 8/200
 - 9s - loss: 0.3295
Epoch 9/200
 - 8s - loss: 0.3279
Epoch 10/200
 - 9s - loss: 0.3274
Epoch 11/200
 - 9s - loss: 0.3272
Epoch 12/200
 - 8s - loss: 0.3259
Epoch 13/200
 - 7s - loss: 0.3258
Epoch 14/200
 - 7s - loss: 0.3254
Epoch 15/200
 - 7s - loss: 0.3253
Epoch 16/200
 - 7s - loss: 0.3250
Epoch 17/200
 - 7s - loss: 0.3249
Epoch 18/200
 - 7s - loss: 0.3242
Epoch 19/200
 - 7s - loss: 0.3242
Epoch 20/200
 - 7s - loss: 0.3237
Epoch 21/200
 - 7s - loss: 0.3234
Epoch 22/200
 - 7s - loss: 0.3232
Epoch 23/200
 - 7s - loss: 0.3229
Epoch 24/200
 - 7s - loss: 0.3228
Epoch 25/200
 - 7s - loss: 0.3226
Epoch 26/200
 - 7s - loss: 0.3224
Epoch 27/200
 - 8s - loss: 0.3222
Epoch 28/200
 - 8s - loss: 0.3221
Epoch 29/200
 - 8s - loss: 0.3219
Epoch 30/200
 - 8s - l

### Feature Set 2
- system.cpu.total.pct'
- system.cpu.user.pct'

In [None]:
train_dataset_shifted_2 = train_dataset_shifted[[
    'system.network.in.bytes',
    'system.network.out.bytes',
    'system.network.in.dropped',
    'system.network.out.errors'  
]]

train_dataset_shifted_2.head()

Unnamed: 0,system.network.in.bytes,system.network.out.bytes,system.network.in.dropped,system.network.out.errors
0,0.242874,0.43252,0.192984,0.265885
1,0.242874,0.43252,0.192984,0.651874
2,0.242874,0.43252,0.192984,-0.506092
3,0.242874,-0.907109,-0.999641,1.809841
4,0.242874,0.43252,-0.999641,-0.120104


In [None]:
model_lstm_2 = Sequential()
# model_lstm_2.add(GRU(10, input_shape=(1, train_dataset_shifted.shape[1]), activation='softplus', recurrent_activation='linear'))
model_lstm_2.add(GRU(10, input_shape=(1, train_dataset_shifted.shape[1])))
model_lstm_2.add(Dense(4))
model_lstm_2.compile(loss='mse', optimizer='adam')

model_lstm_2.fit(train_dataset, train_dataset_shifted_2, verbose=2, shuffle=False, epochs=model_epoches)

Epoch 1/200
 - 12s - loss: 0.9172
Epoch 2/200
 - 9s - loss: 0.6865
Epoch 3/200
 - 9s - loss: 0.6076
Epoch 4/200
 - 8s - loss: 0.5869
Epoch 5/200
 - 7s - loss: 0.5805
Epoch 6/200
 - 8s - loss: 0.5773
Epoch 7/200
 - 8s - loss: 0.5753
Epoch 8/200
 - 7s - loss: 0.5740
Epoch 9/200
 - 8s - loss: 0.5730
Epoch 10/200
 - 7s - loss: 0.5722
Epoch 11/200
 - 7s - loss: 0.5715
Epoch 12/200
 - 7s - loss: 0.5709
Epoch 13/200
 - 8s - loss: 0.5704
Epoch 14/200
 - 8s - loss: 0.5698
Epoch 15/200
 - 7s - loss: 0.5694
Epoch 16/200
 - 7s - loss: 0.5690
Epoch 17/200
 - 7s - loss: 0.5687
Epoch 18/200
 - 8s - loss: 0.5682
Epoch 19/200
 - 7s - loss: 0.5679
Epoch 20/200
 - 7s - loss: 0.5675
Epoch 21/200
 - 8s - loss: 0.5671
Epoch 22/200
 - 8s - loss: 0.5667
Epoch 23/200
 - 7s - loss: 0.5664
Epoch 24/200
 - 8s - loss: 0.5661
Epoch 25/200
 - 9s - loss: 0.5658
Epoch 26/200
 - 9s - loss: 0.5655
Epoch 27/200
 - 9s - loss: 0.5652
Epoch 28/200
 - 9s - loss: 0.5649
Epoch 29/200
 - 9s - loss: 0.5646
Epoch 30/200
 - 9s - l

### Feature Set 3
- jolokia.metrics.memory.heap_memory_usage.committed'
- jolokia.metrics.memory.heap_memory_usage.max'

In [None]:
train_dataset_shifted_3 = train_dataset_shifted[[
    'system.network.in.packets',
    'system.network.out.packets'
]]

train_dataset_shifted_3.head()

In [None]:
model_lstm_3 = Sequential()
# model_lstm_3.add(GRU(10, input_shape=(1, train_dataset_shifted.shape[1]), activation='softplus', recurrent_activation='linear'))
model_lstm_3.add(GRU(10, input_shape=(1, train_dataset_shifted.shape[1])))
model_lstm_3.add(Dense(2))
model_lstm_3.compile(loss='mse', optimizer='adam')

model_lstm_3.fit(train_dataset, train_dataset_shifted_3, verbose=2, shuffle=False, epochs=model_epoches)

### Feature Set 4
- system.load.1'
- system.load.15'
- system.load.5'
- system.load.norm.1'
- system.load.norm.15'
- system.load.norm.5'

In [None]:
train_dataset_shifted_4 = train_dataset_shifted[[
    'system.diskio.iostat.await',
    'system.diskio.iostat.queue.avg_size',
    'system.diskio.iostat.read.per_sec.bytes',
    'system.diskio.iostat.write.per_sec.bytes'
]]

train_dataset_shifted_4.head()

In [None]:
model_lstm_4 = Sequential()
# model_lstm_4.add(GRU(10, input_shape=(1, train_dataset_shifted.shape[1]), activation='softplus', recurrent_activation='linear'))
model_lstm_4.add(GRU(10, input_shape=(1, train_dataset_shifted.shape[1])))
model_lstm_4.add(Dense(4))
model_lstm_4.compile(loss='mse', optimizer='adam')

model_lstm_4.fit(train_dataset, train_dataset_shifted_4, verbose=2, shuffle=False, epochs=model_epoches)

### Feature Set 5
- system.cpu.idle.pct'
- system.cpu.softirq.pct'
- system.cpu.system.pct'
- jolokia.metrics.memory.heap_memory_usage.used'
- jolokia.metrics.memory.non_heap_memory_usage.used'
- jolokia.metrics.threading.daemon_thread_count'
- jolokia.metrics.threading.thread_count'
- system.memory.actual.used.pct'
- system.memory.swap.used.pct'
- system.memory.used.pct'

In [None]:
train_dataset_shifted_5 = train_dataset_shifted[[
    'jvm.metrics.memory.heap_memory_usage.committed',
    'jvm.metrics.memory.heap_memory_usage.max'
]]

train_dataset_shifted_5.head()

In [None]:
model_lstm_5 = Sequential()
# model_lstm_5.add(GRU(10, input_shape=(1, train_dataset_shifted.shape[1]), activation='softplus', recurrent_activation='linear'))
model_lstm_5.add(GRU(10, input_shape=(1, train_dataset_shifted.shape[1])))
model_lstm_5.add(Dense(2))
model_lstm_5.compile(loss='mse', optimizer='adam')

model_lstm_5.fit(train_dataset, train_dataset_shifted_5, verbose=2, shuffle=False, epochs=model_epoches)

In [None]:
train_dataset_shifted_6 = train_dataset_shifted[[
    'jvm.metrics.memory.heap_memory_usage.used',
    'jvm.metrics.gc.psms.collection_count',
    'jvm.metrics.gc.psms.collection_time',
    'system.memory.used.pct']]

train_dataset_shifted_6.head()

In [None]:
model_lstm_6 = Sequential()
# model_lstm_6.add(GRU(10, input_shape=(1, train_dataset_shifted.shape[1]), activation='softplus', recurrent_activation='linear'))
model_lstm_6.add(GRU(10, input_shape=(1, train_dataset_shifted.shape[1])))
model_lstm_6.add(Dense(4))
model_lstm_6.compile(loss='mse', optimizer='adam')

model_lstm_6.fit(train_dataset, train_dataset_shifted_6, verbose=2, shuffle=False, epochs=model_epoches)

In [None]:
train_dataset_shifted_7 = train_dataset_shifted[[
    'jvm.metrics.gc.pss.collection_count',
    'jvm.metrics.gc.pss.collection_time',
    'jvm.metrics.threading.thread_count'
]]

train_dataset_shifted_7.head()

In [None]:
model_lstm_7 = Sequential()
# model_lstm_7.add(GRU(10, input_shape=(1, train_dataset_shifted.shape[1]), activation='softplus', recurrent_activation='linear'))
model_lstm_7.add(GRU(10, input_shape=(1, train_dataset_shifted.shape[1])))
model_lstm_7.add(Dense(3))
model_lstm_7.compile(loss='mse', optimizer='adam')

model_lstm_7.fit(train_dataset, train_dataset_shifted_7, verbose=2, shuffle=False, epochs=model_epoches)

## Serializing Models

In [None]:
directory = data_directory+"/models/shift-"+str(shift)
models = [model_lstm_1, model_lstm_2, model_lstm_3, model_lstm_4, model_lstm_5, model_lstm_6, model_lstm_7]

if not os.path.exists(directory):
    os.makedirs(directory)

count = 0
for model_to_save in models:
    count += 1
    model_json = model_to_save.to_json()
    with open(directory+"/model_"+ model_type +"_"+str(count)+".json", "w") as json_file:
        json_file.write(model_json)

        # serialize weights to HDF5
    model_to_save.save_weights(directory+"/model_gru_"+str(count)+".h5")

print("Saved models to disk")