In [1]:
import os
import tensorflow as tf
from model import Model
from input_pipe import InputPipe
from feeder import VarFeeder
from tqdm import trange
import matplotlib.pyplot as plt
import collections
import pandas as pd
import numpy as np
# from trainer import predict
from hparams import build_hparams
import hparams
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

  from ._conv import register_converters as _register_converters


In [2]:
def smape(true, pred):
    summ = np.abs(true) + np.abs(pred)
    smape = np.where(summ == 0, 0, np.abs(true - pred) / summ)
    return smape

def mae(true, pred):
    return np.abs(np.abs(true) - np.abs(pred))

def mean_smape(true, pred):
    raw_smape = smape(true, pred)
    masked_smape = np.ma.array(raw_smape, mask=np.isnan(raw_smape))
    return masked_smape.mean()

def mean_mae(true, pred):
    raw_mae = mae(true, pred)
    masked_mae = np.ma.array(raw_mae, mask=np.isnan(raw_mae))
    return masked_mae.mean()

def predict_loss(prev, paths):
    # prev: true value
    # paths: paths to the model weights
    t_preds = []
    for tm in range(3):
        tf.reset_default_graph()
        t_preds.append(predict(paths[-1:], build_hparams(hparams.params_s32), back_offset=0, predict_window=288,
                        n_models=3, target_model=tm, seed=2, batch_size=50, asgd=True))
    preds=sum(t_preds) /3
    preds.index = [idx.decode('ascii') for idx in preds.index]
    # mean mae
    res = 0
    for idx in preds.index:
        res += np.abs(preds.loc[idx, :] - prev.loc[idx, -288:]).sum()
    res /= len(preds.index) * 288
    return preds, res

def split_data(df):
    bad_path = os.path.join('/nfs/project/lihaocheng/badcase', 'single_rnn_mae_beyond_1000_vm_uuids')
    bad_df = pd.DataFrame()
    normal_df = df.copy()
    with open(bad_path, 'r') as f:
        line = f.readline()
        while(line):
            line = line[:-1] + ".hdf5"
            if line in df.index:
                bad_df = bad_df.append(df.loc[line])
                normal_df = normal_df.drop(line)
            line = f.readline()
    return bad_df.sort_index(), normal_df.sort_index()

def show_single(preds, prev, vm, scope=288, bad_case=True):
    name = preds.index[vm]
    if bad_case:
        bad_path = os.path.join('/nfs/project/lihaocheng/badcase', 'single_rnn_mae_beyond_1000_vm_uuids')
        bad_list = []
        with open(bad_path, 'r') as f:
            line = f.readline()
            while(line):
                line = line[:-1] + ".hdf5"
                if line in preds.index:
                    bad_list.append(line)
                line = f.readline()
        name = bad_list[vm]
    
    # mean mae for each row
    print(f'vm name: {name}')
    prev.loc[name, ends[vm] - scope : ends[vm]].plot(logy=True)
    (preds.loc[name, :]).plot(logy=True)
    # mean loss
    print(mean_mae(prev.loc[name, ends[vm] - 288 : ends[vm]], preds.loc[name, :]))

In [3]:
from make_features import read_all, find_start_end
dfs = read_all('/nfs/isolation_project/intern/project/lihaocheng/vm/')
df = dfs[0].iloc[:, 0:-288]
df.sort_index()
starts, ends = find_start_end(df.values)
page_mask = (ends - starts) / df.shape[1] < 0.04
print("Masked %d vms from %d" % (page_mask.sum(), len(df)))
inv_mask = ~page_mask
df = dfs[0][inv_mask]
prev = df.apply(lambda x : np.exp(x) - 1)

Masked 630 vms from 4037


In [20]:
from feeder import VarFeeder
from input_pipe import InputPipe, ModelMode, FakeSplitter, vm_features
def predict(checkpoints, hparams, datadir="data", return_x=False, verbose=False, 
            predict_window=288, back_offset=0, n_models=1,
            target_model=0, asgd=False, seed=1, batch_size=1024):
    with tf.variable_scope('input') as inp_scope:
        with tf.device("/cpu:0"):
            inp = VarFeeder.read_vars(os.path.join(datadir, "vars"))
            pipe = InputPipe(datadir, inp, vm_features(inp), inp.n_vm, mode=ModelMode.PREDICT, batch_size=batch_size,
                             n_epoch=1, verbose=verbose,
                             train_completeness_threshold=0.01,
                             predict_window=predict_window,
                             predict_completeness_threshold=0.0, train_window=hparams.train_window,
                             back_offset=back_offset)
    asgd_decay = 0.99 if asgd else None
    if n_models == 1:
        model = Model(pipe, hparams, is_train=False, seed=seed, asgd_decay=asgd_decay)
    else:
        models = []
        for i in range(n_models):
            prefix = f"m_{i}"
            with tf.variable_scope(prefix) as scope:
                models.append(Model(pipe, hparams, is_train=False, seed=seed, asgd_decay=asgd_decay, graph_prefix=prefix))
        model = models[target_model]

    if asgd:
        var_list = model.ema.variables_to_restore()
        prefix = f"m_{target_model}"
        for var in list(var_list.keys()):
            if var.endswith('ExponentialMovingAverage') and not var.startswith(prefix):
                print(var)
                del var_list[var]
        print(var_list.keys())
    else:
        var_list = None
    saver = tf.train.Saver(name='eval_saver', var_list=var_list)
    x_buffer = []
    predictions = None
    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))) as sess:
        pipe.load_vars(sess)
        for checkpoint in checkpoints:
            pred_buffer = []
            pipe.init_iterator(sess)
            saver.restore(sess, checkpoint)
            cnt = 0
            while True:
                try:
                    if return_x:
                        pred, x, pname = sess.run([model.predictions, model.inp.true_x, model.inp.vm_ix])
                    else:
                        pred, pname = sess.run([model.predictions, model.inp.vm_ix])
                    # utf_names = [str(name, 'utf-8') for name in pname]
                    utf_names = pname
                    pred_df = pd.DataFrame(index=utf_names, data=np.expm1(pred))
                    pred_buffer.append(pred_df)
                    if return_x:
                        # noinspection PyUnboundLocalVariable
                        x_values = pd.DataFrame(index=utf_names, data=np.round(np.expm1(x)).astype(np.int64))
                        x_buffer.append(x_values)
                    newline = cnt % 80 == 0
                    if cnt > 0:
                        print('.', end='\n' if newline else '', flush=True)
                    if newline:
                        print(cnt, end='')
                    cnt += 1
                except tf.errors.OutOfRangeError:
                    print('🎉')
                    break
            cp_predictions = pd.concat(pred_buffer)
            if predictions is None:
                predictions = cp_predictions
            else:
                predictions += cp_predictions
    predictions /= len(checkpoints)
    offset = back_offset
    start_prediction = inp.data_end + 1 - offset
    end_prediction = start_prediction + predict_window - 1
    predictions.columns = np.arange(start_prediction, end_prediction + 1)
    if return_x:
        x = pd.concat(x_buffer)
        start_data = inp.data_end - hparams.train_window - 1 - back_offset
        end_data = inp.data_end - back_offset
        x.columns = np.arange(start_data, end_data - 1)
        return predictions, x
    else:
        return predictions


In [21]:
paths = [p for p in tf.train.get_checkpoint_state(os.path.join('data/cpt', 's32')).all_model_checkpoint_paths]
preds, loss = predict_loss(prev, paths)
print(f'Mean MAE = {loss}\n........Generate csv for each csv..........')

m_2/rnn/gru_cell/candidate/bias/ExponentialMovingAverage
m_2/decoder/gru_cell/w_ru/ExponentialMovingAverage
m_2/rnn/gru_cell/gates/kernel/ExponentialMovingAverage
m_2/decoder/gru_cell/b_c/ExponentialMovingAverage
m_2/rnn/gru_cell/gates/bias/ExponentialMovingAverage
m_2/embedding/fc1/ExponentialMovingAverage
m_1/decoder/decoder_output_proj/kernel/ExponentialMovingAverage
m_2/decoder/decoder_output_proj/kernel/ExponentialMovingAverage
m_2/rnn/gru_cell/candidate/kernel/ExponentialMovingAverage
m_2/decoder/gru_cell/w_c/ExponentialMovingAverage
m_1/rnn/gru_cell/candidate/bias/ExponentialMovingAverage
m_1/embedding/BatchNorm/beta/ExponentialMovingAverage
m_1/rnn/gru_cell/gates/kernel/ExponentialMovingAverage
m_2/embedding/BatchNorm/beta/ExponentialMovingAverage
m_1/decoder/decoder_output_proj/bias/ExponentialMovingAverage
m_1/decoder/gru_cell/b_c/ExponentialMovingAverage
m_2/decoder/decoder_output_proj/bias/ExponentialMovingAverage
m_1/decoder/gru_cell/w_c/ExponentialMovingAverage
m_2/decode

InvalidArgumentError: Assign requires shapes of both tensors to match. lhs shape= [316,552] rhs shape= [300,552]
	 [[Node: eval_saver/Assign_15 = Assign[T=DT_FLOAT, _class=["loc:@m_1/rnn/gru_cell/gates/kernel"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](m_1/rnn/gru_cell/gates/kernel, eval_saver/RestoreV2/_23)]]

Caused by op 'eval_saver/Assign_15', defined at:
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 127, in start
    self.asyncio_loop.run_forever()
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/asyncio/base_events.py", line 422, in run_forever
    self._run_once()
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/asyncio/base_events.py", line 1432, in _run_once
    handle._run()
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 117, in _handle_events
    handler_func(fileobj, events)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2903, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-21-a4600599f853>", line 2, in <module>
    preds, loss = predict_loss(prev, paths)
  File "<ipython-input-2-f5e5da08bacb>", line 26, in predict_loss
    n_models=3, target_model=tm, seed=2, batch_size=50, asgd=True))
  File "<ipython-input-20-297eab751fdb>", line 36, in predict
    saver = tf.train.Saver(name='eval_saver', var_list=var_list)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1338, in __init__
    self.build()
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1347, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1384, in _build
    build_save=build_save, build_restore=build_restore)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 835, in _build_internal
    restore_sequentially, reshape)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 494, in _AddRestoreOps
    assign_ops.append(saveable.restore(saveable_tensors, shapes))
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 185, in restore
    self.op.get_shape().is_fully_defined())
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/tensorflow/python/ops/state_ops.py", line 283, in assign
    validate_shape=validate_shape)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/tensorflow/python/ops/gen_state_ops.py", line 60, in assign
    use_locking=use_locking, name=name)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
    op_def=op_def)
  File "/home/lihaocheng_i/miniconda2/envs/py3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1718, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): Assign requires shapes of both tensors to match. lhs shape= [316,552] rhs shape= [300,552]
	 [[Node: eval_saver/Assign_15 = Assign[T=DT_FLOAT, _class=["loc:@m_1/rnn/gru_cell/gates/kernel"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](m_1/rnn/gru_cell/gates/kernel, eval_saver/RestoreV2/_23)]]


Visual sanity check

In [None]:
show_single(preds, prev, 110, 1288)