In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import datetime
import glob
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model, Model
from keras import Model, layers
from keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout, Input, Flatten, MaxPooling1D, GlobalAveragePooling1D, Reshape, Conv2D, MaxPool2D, Concatenate
import random
import logging
from importlib import reload
# Directory prefix (including the trailing slash) that is prepended to the CSV file names below.
path="data/"

# Load Data

In [5]:
# Read the training set; rows that cannot be parsed are skipped instead of raising.
# NOTE(review): the dtype key "mm:" has a trailing colon while "yy" and "dd" do not --
# this looks like a typo for "mm"; confirm against the CSV header.
data = pd.read_csv(f"{path}train.csv", on_bad_lines="skip", encoding="latin-1", lineterminator="\n",
                                dtype={"statement": str, "runtime": float, "resultsize": int, "yy": int,
                                       "mm:": int, "dd": int}, memory_map=True)
print(len(data))

# Tokenize input SQL statements
# char_level=True: every character of a statement becomes one token.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(data["statement"])

# Pad input sequences
# Statements are cut off / zero-padded at the end so every row has exactly max_len tokens.
max_len = 512
sequences = tokenizer.texts_to_sequences(data["statement"])
padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

# Log regression data
# Both targets are shifted so that their smallest value maps to 1 and are then
# log-transformed, i.e. the smallest target becomes 0. The offsets are kept in
# min_runtime / min_resultsize because the evaluation cells need them to undo the transform.
min_runtime = min(data["runtime"])
min_resultsize = min(data["resultsize"])

data_runtime = np.log(data["runtime"] + 1 - min_runtime)
data_resultsize = np.log(data["resultsize"] + 1 - min_resultsize)

24414


# Load Model

## CNN1 Full

In [None]:
# Single-convolution character CNN with two regression heads (runtime and result size).

# TensorBoard logs go into a timestamped sub-directory so runs do not overwrite each other.
log_dir = "runs/CNN1Full/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Hyperparameters (EPOCHS / BATCH_SIZE are read by the training cells).
vocab_size = len(tokenizer.word_index) + 1  # +1 because pad_sequences fills with index 0
embedding_dim = 32
num_filters = 32
filter_size = 9
hidden_units = 128
EPOCHS=200
BATCH_SIZE = 1024
OPTIMIZER = "adamax"

# Embedding -> Conv1D -> global max pooling -> dense -> one linear output per target.
il = Input(shape=(max_len,))
el = Embedding(vocab_size, embedding_dim, input_length=max_len)(il)
cl = Conv1D(num_filters, filter_size, activation='relu')(el)
pl = GlobalMaxPooling1D()(cl)
hl = Dense(hidden_units, activation='relu')(pl)
outTime = Dense(1, name="outTime")(hl)
outSize = Dense(1, name="outSize")(hl)
model = Model(inputs=il, outputs=[outTime, outSize])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the cell output.
model.summary()
# The loss/metric dictionaries are keyed by the names of the two output layers.
model.compile(optimizer=OPTIMIZER, loss={"outTime": 'huber', "outSize": "huber"}, metrics={'outTime': ['mae','mse'], 'outSize':['mae','mse']})

## CNN1 Single

In [None]:
# Single-convolution character CNN with one regression head; which target it learns
# is decided by the training cell that is run afterwards.

# TensorBoard logs go into a timestamped sub-directory so runs do not overwrite each other.
log_dir = "runs/CNN1Single/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Hyperparameters (EPOCHS / BATCH_SIZE are read by the training cells).
vocab_size = len(tokenizer.word_index) + 1  # +1 because pad_sequences fills with index 0
embedding_dim = 32
num_filters = 32
filter_size = 9
hidden_units = 128
EPOCHS=200
BATCH_SIZE = 1024
OPTIMIZER = "adamax"

# Embedding -> Conv1D -> global max pooling -> dense -> single linear output.
il = Input(shape=(max_len,))
el = Embedding(vocab_size, embedding_dim, input_length=max_len)(il)
cl = Conv1D(num_filters, filter_size, activation='relu')(el)
pl = GlobalMaxPooling1D()(cl)
hl = Dense(hidden_units, activation='relu')(pl)
out = Dense(1, name="out")(hl)
model = Model(inputs=il, outputs=out)
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the cell output.
model.summary()
model.compile(optimizer=OPTIMIZER, loss='huber', metrics=['mae','mse'])

## CCN3 Full

In [None]:
# Three parallel convolutions (window sizes 3, 4 and 5) over the character embeddings,
# max-pooled over the whole sequence and concatenated, with two regression heads.

# TensorBoard logs go into a timestamped sub-directory so runs do not overwrite each other.
log_dir = "runs/ccnn3/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

EMBEDDING_DIM = 100
filter_sizes = [3,4,5]
num_filters = 512
# The Reshape below uses EMBEDDING_DIM and the Conv2D kernels use embedding_dim; both
# must be the same number, so derive one from the other instead of repeating the literal.
embedding_dim = EMBEDDING_DIM

vocab_size = len(tokenizer.word_index) + 1  # +1 because pad_sequences fills with index 0
dropout_rate = 0.5
EPOCHS=25
BATCH_SIZE = 30
OPTIMIZER = "adamax"

inputs = Input(shape=(max_len,))
embedding_layer = Embedding(vocab_size,
                            EMBEDDING_DIM,
                            # weights=[embedding_matrix],
                            # trainable=False,
                            input_length=max_len)(inputs)
# Add a trailing channel axis so the embedded sequence can be fed to Conv2D.
reshape = Reshape((max_len,EMBEDDING_DIM,1))(embedding_layer)

# Each kernel spans the full embedding width, so it slides along the sequence axis only.
conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)

# With 'valid' padding a window of size f leaves max_len - f + 1 positions; pooling over
# all of them keeps one value per filter.
maxpool_0 = MaxPool2D(pool_size=(max_len - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(max_len - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(max_len - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(dropout_rate)(flatten)
outTime = Dense(1, name="outTime")(dropout)
outSize = Dense(1, name="outSize")(dropout)

model = Model(inputs=inputs, outputs=[outTime, outSize])

# The loss/metric dictionaries are keyed by the names of the two output layers.
model.compile(optimizer=OPTIMIZER, loss={"outTime": 'huber', "outSize": "huber"}, metrics={'outTime': ['mae','mse'], 'outSize':['mae','mse']})
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the cell output.
model.summary()

## CCN3 Single

In [None]:
# Three parallel convolutions (window sizes 3, 4 and 5) over the character embeddings,
# max-pooled over the whole sequence and concatenated, with a single regression head.

# TensorBoard logs go into a timestamped sub-directory so runs do not overwrite each other.
log_dir = "runs/ccnn3Single/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

EMBEDDING_DIM = 100
filter_sizes = [3,4,5]
num_filters = 512
# The Reshape below uses EMBEDDING_DIM and the Conv2D kernels use embedding_dim; both
# must be the same number, so derive one from the other instead of repeating the literal.
embedding_dim = EMBEDDING_DIM

vocab_size = len(tokenizer.word_index) + 1  # +1 because pad_sequences fills with index 0
dropout_rate = 0.5
EPOCHS=250
BATCH_SIZE = 512
OPTIMIZER = "adamax"

inputs = Input(shape=(max_len,))
embedding_layer = Embedding(vocab_size,
                            EMBEDDING_DIM,
                            # weights=[embedding_matrix],
                            # trainable=False,
                            input_length=max_len)(inputs)
# Add a trailing channel axis so the embedded sequence can be fed to Conv2D.
reshape = Reshape((max_len,EMBEDDING_DIM,1))(embedding_layer)

# Each kernel spans the full embedding width, so it slides along the sequence axis only.
conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)

# With 'valid' padding a window of size f leaves max_len - f + 1 positions; pooling over
# all of them keeps one value per filter.
maxpool_0 = MaxPool2D(pool_size=(max_len - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(max_len - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(max_len - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(dropout_rate)(flatten)
out = Dense(1, name="out")(dropout)

model = Model(inputs=inputs, outputs=out)

model.compile(optimizer=OPTIMIZER, loss='huber', metrics=['mae','mse'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the cell output.
model.summary()

## VDCNN

### Models

#### Full

In [None]:
# Maps a VDCNN depth to the number of ConvBlocks in each of the four stages, in the
# order (64, 128, 256, 512 filters); VDCNN.__init__ looks the tuple up by `depth`.
N_BLOCKS = {9: (1, 1, 1, 1),
            17: (2, 2, 2, 2),
            29: (5, 5, 2, 2),
            49: (8, 8, 5, 3)}


class KMaxPooling(layers.Layer):
    """
    K-max pooling layer that extracts the k-highest activations from a sequence (2nd dimension).
    TensorFlow backend.

    Args:
        k: number of activations to keep per channel. If None, half of the input
            sequence length (rounded half-to-even) is kept.
        sorted: forwarded to `tf.nn.top_k`; if True the kept values are returned in
            descending order.
    """

    def __init__(self,
                 k=None,
                 sorted=False):
        super(KMaxPooling, self).__init__()
        self.k = k
        self.sorted = sorted

    def _pooled_length(self, seq_len):
        """Return the number of activations kept for a sequence of length `seq_len`."""
        if self.k is not None:
            return self.k
        if seq_len is None:
            raise ValueError("KMaxPooling with k=None needs a statically known sequence length")
        # Plain Python arithmetic on the static shape: no tensor has to be converted
        # back to an int, which is only possible in eager execution. Python's round()
        # rounds half to even, the same rule tf.round uses.
        return int(round(seq_len / 2))

    def compute_output_shape(self, input_shape):
        # With k=None the pooled length depends on the input; report it as unknown
        # only when the input length itself is unknown.
        if self.k is None and input_shape[1] is None:
            pooled = None
        else:
            pooled = self._pooled_length(input_shape[1])
        return (input_shape[0], pooled, input_shape[2])

    def call(self,
             inputs):
        k = self._pooled_length(inputs.shape[1])

        # Swap last two dimensions since top_k will be applied along the last dimension
        shifted_inputs = tf.transpose(inputs, [0, 2, 1])

        # Extract top_k, returns two tensors [values, indices]
        top_k = tf.nn.top_k(shifted_inputs, k=k, sorted=self.sorted)[0]

        # Swap the axes back so the result is (batch, k, channels) again
        return tf.transpose(top_k, [0, 2, 1])


class Pooling(layers.Layer):
    """Thin layer that delegates to one of two pooling operations.

    Args:
        pool_type: 'max' for MaxPooling1D (window 3, stride 2, 'same' padding) or
            'k_max' for a KMaxPooling layer with its default settings.
        name: optional layer name passed on to the base class.
    """

    def __init__(self,
                 pool_type='max',
                 name=None):
        super().__init__(name=name)
        assert pool_type in ('max', 'k_max')
        self.pool_type = pool_type

        # Build the wrapped pooling layer once; `call` only forwards to it.
        if pool_type == 'k_max':
            self.pool = KMaxPooling()
        elif pool_type == 'max':
            self.pool = layers.MaxPooling1D(pool_size=3, strides=2, padding='same')

    def call(self,
             x):
        pooled = self.pool(x)
        return pooled


class ZeroPadding(layers.Layer):
    """Pads the last axis of a rank-3 tensor with zeros.

    Args:
        values: two-element sequence (before, after) giving how many zeros are
            added in front of and behind the last axis.
        name: optional layer name passed on to the base class.
    """

    def __init__(self,
                 values,
                 name=None):
        super().__init__(name=name)
        self.values = values

    def call(self,
             x):
        pad_before, pad_after = self.values[0], self.values[1]
        # The first two axes are left untouched; only the last one grows.
        paddings = [[0, 0], [0, 0], [pad_before, pad_after]]
        return tf.pad(x, paddings, mode='CONSTANT', constant_values=0)


class Conv1D_BN(layers.Layer):
    """Conv1D followed by BatchNormalization.

    Args:
        filters: number of convolution filters.
        kernel_size: convolution window (default 3).
        strides: convolution stride (default 2).
        padding: padding mode handed to Conv1D.
        use_bias: whether the convolution has a bias term.
        name: optional layer name passed on to the base class.
    """

    def __init__(self,
                 filters,
                 kernel_size=3,
                 strides=2,
                 padding='same',
                 use_bias=True,
                 name=None):
        super().__init__(name=name)
        self.filters = filters
        self.use_bias = use_bias
        self.conv = layers.Conv1D(
            filters,
            kernel_size,
            strides=strides,
            padding=padding,
            use_bias=use_bias,
            kernel_initializer='he_normal',
        )
        self.bn = layers.BatchNormalization()

    def call(self,
             x):
        # Convolution first, normalisation second.
        return self.bn(self.conv(x))


class ConvBlock(layers.Layer):
    """Two Conv1D+BatchNorm layers with optional downsampling, shortcut and projection.

    Args:
        filters: number of filters of both convolutions.
        kernel_size: window size of both convolutions.
        use_bias: whether the convolutions have a bias term.
        shortcut: if truthy, the (optionally downsampled) block input is added to
            the convolution output before the final ReLU.
        pool_type: None (no downsampling), 'conv' (the first convolution uses
            stride 2) or 'max' / 'k_max' (a Pooling layer after the second BatchNorm).
        proj_type: None, 'conv' (1x1 Conv1D_BN producing 2 * filters channels) or
            'identity' (zero-pad the channel axis with `filters` extra channels).
            Only applied when `shortcut` is truthy.
        name: optional layer name passed on to the base class.
    """

    def __init__(self,
                 filters,
                 kernel_size=3,
                 use_bias=True,
                 shortcut=True,
                 pool_type=None,
                 proj_type=None,
                 name=None,
                 ):
        super(ConvBlock, self).__init__(name=name)
        self.filters = filters
        self.kernel_size = kernel_size
        self.use_bias = use_bias
        self.shortcut = shortcut
        self.pool_type = pool_type
        self.proj_type = proj_type

        # Always define the optional sub-layers. Previously `downsample` and `proj`
        # only existed on some constructor paths, so touching them on an instance
        # built with a falsy `shortcut` raised AttributeError.
        self.pool = None
        self.downsample = None
        self.proj = None

        # Deal with downsample and pooling
        assert pool_type in ['max', 'k_max', 'conv', None]
        if pool_type is None:
            strides = 1

        elif pool_type == 'conv':
            strides = 2  # Convolutional pooling with stride 2
            if shortcut:
                # The residual has to be shortened the same way as the main path.
                self.downsample = Conv1D_BN(filters, 3, strides=2, padding='same', use_bias=use_bias)

        else:
            strides = 1
            self.pool = Pooling(pool_type)
            if shortcut:
                # The residual has to be shortened the same way as the main path.
                self.downsample = Conv1D_BN(filters, 3, strides=2, padding='same', use_bias=use_bias)

        self.conv1 = layers.Conv1D(filters, kernel_size, strides=strides, padding='same', use_bias=use_bias,
                                   kernel_initializer='he_normal')
        self.bn1 = layers.BatchNormalization()

        self.conv2 = layers.Conv1D(filters, kernel_size, strides=1, padding='same', use_bias=use_bias,
                                   kernel_initializer='he_normal')
        self.bn2 = layers.BatchNormalization()

        assert proj_type in ['identity', 'conv', None]
        if shortcut:
            if proj_type == 'conv':
                # 1x1 conv for projection
                self.proj = Conv1D_BN(filters * 2, 1, strides=1, padding='same', use_bias=use_bias)

            elif proj_type == 'identity':
                # Identity using zero padding
                self.proj = ZeroPadding([int(filters // 2), filters - int(filters // 2)])

    def call(self,
             x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = tf.nn.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.pool is not None:
            out = self.pool(out)

        if self.shortcut:
            if self.downsample is not None:
                residual = self.downsample(residual)
            out += residual

        out = tf.nn.relu(out)

        # `proj` is only built when a shortcut and a projection type were requested.
        if self.proj is not None:
            out = self.proj(out)

        return out


class VDCNN(Model):
    """VDCNN with two regression heads (returned as the tuple `(outTime, outSize)`).

    Args:
        depth: depth of VDCNN, one of [9, 17, 29, 49] (a key of N_BLOCKS).
        vocab_size: size of the character vocabulary of the embedding layer.
        seqlen: Sequence length.
        embed_dim: dim for character embeddings.
        shortcut: Use skip connections.
        pool_type: Pooling operations to be used, one of ['max', 'k_max', 'conv'].
        proj_type: Operation to increase dim for dotted skip connection, one of ['identity', 'conv'].
        use_bias: Use bias for all layers or not.
        logits: stored on the instance as given; `call` does not read it and
            always returns the raw outputs of the two linear heads.
    """

    def __init__(self,
                 depth=9,
                 vocab_size=69,
                 seqlen=None,
                 embed_dim=16,
                 shortcut=True,
                 pool_type='max',
                 proj_type='conv',
                 use_bias=True,
                 logits=True):
        super(VDCNN, self).__init__()
        self.depth = depth
        self.vocab_size = vocab_size
        self.seqlen = seqlen
        self.embed_dim = embed_dim
        self.shortcut = shortcut
        self.pool_type = pool_type
        self.proj_type = proj_type
        self.use_bias = use_bias
        # Was hard-coded to True, silently discarding the constructor argument.
        self.logits = logits

        assert pool_type in ['max', 'k_max', 'conv']
        assert proj_type in ['conv', 'identity']
        self.n_blocks = N_BLOCKS[depth]

        self.embed_char = layers.Embedding(vocab_size, embed_dim, input_length=seqlen)
        self.conv = layers.Conv1D(64, 3, strides=1, padding='same', use_bias=use_bias,
                                  kernel_initializer='he_normal')

        # Each of the first three stages consists of n - 1 plain blocks followed by one
        # block that downsamples (pool_type) and widens the channels (proj_type).

        # Convolutional Block 64
        self.conv_block_64 = []
        for _ in range(self.n_blocks[0] - 1):
            self.conv_block_64.append(ConvBlock(64, 3, use_bias, shortcut))
        self.conv_block_64.append(ConvBlock(64, 3, use_bias, shortcut, pool_type=pool_type, proj_type=proj_type))

        # Convolutional Block 128
        self.conv_block_128 = []
        for _ in range(self.n_blocks[1] - 1):
            self.conv_block_128.append(ConvBlock(128, 3, use_bias, shortcut))
        self.conv_block_128.append(ConvBlock(128, 3, use_bias, shortcut, pool_type=pool_type, proj_type=proj_type))

        # Convolutional Block 256
        self.conv_block_256 = []
        for _ in range(self.n_blocks[2] - 1):
            self.conv_block_256.append(ConvBlock(256, 3, use_bias, shortcut))
        self.conv_block_256.append(ConvBlock(256, 3, use_bias, shortcut, pool_type=pool_type, proj_type=proj_type))

        # Convolutional Block 512 (the last stage neither downsamples nor projects)
        self.conv_block_512 = []
        for _ in range(self.n_blocks[3] - 1):
            self.conv_block_512.append(ConvBlock(512, 3, use_bias, shortcut))
        self.conv_block_512.append(ConvBlock(512, 3, use_bias, shortcut, pool_type=None, proj_type=None))

        self.k_maxpool = KMaxPooling(k=8)
        self.flatten = layers.Flatten()

        # Dense layers
        self.fc1 = layers.Dense(2048, activation='relu')
        self.fc2 = layers.Dense(2048, activation='relu')
        self.outTime = layers.Dense(1, name="outTime")
        self.outSize = layers.Dense(1, name="outSize")
        # self.out = layers.Dense(2)

    def call(self,
             x):
        x = self.embed_char(x)
        # print('embed:', x.shape)
        x = self.conv(x)
        # print('conv:', x.shape)

        for block in self.conv_block_64:
            x = block(x)
        # print('conv_block_64:', x.shape)

        for block in self.conv_block_128:
            x = block(x)
        # print('conv_block_128:', x.shape)

        for block in self.conv_block_256:
            x = block(x)
        # print('conv_block_256:', x.shape)

        for block in self.conv_block_512:
            x = block(x)
        # print('conv_block_512:', x.shape)

        x = self.k_maxpool(x)
        # print('k_maxpool_8:', x.shape)
        x = self.flatten(x)
        # print('flatten:', x.shape)

        x = self.fc1(x)
        x = self.fc2(x)
        outTime = self.outTime(x)
        outSize = self.outSize(x)
        # out = self.out(x)
        # print('out:', out.shape)

        return outTime, outSize

#### Single

In [None]:
# Maps a VDCNN depth to the number of ConvBlocks in each of the four stages, in the
# order (64, 128, 256, 512 filters); VDCNN.__init__ looks the tuple up by `depth`.
N_BLOCKS = {9: (1, 1, 1, 1),
            17: (2, 2, 2, 2),
            29: (5, 5, 2, 2),
            49: (8, 8, 5, 3)}


class KMaxPooling(layers.Layer):
    """
    K-max pooling layer that extracts the k-highest activations from a sequence (2nd dimension).
    TensorFlow backend.

    Args:
        k: number of activations to keep per channel. If None, half of the input
            sequence length (rounded half-to-even) is kept.
        sorted: forwarded to `tf.nn.top_k`; if True the kept values are returned in
            descending order.
    """

    def __init__(self,
                 k=None,
                 sorted=False):
        super(KMaxPooling, self).__init__()
        self.k = k
        self.sorted = sorted

    def _pooled_length(self, seq_len):
        """Return the number of activations kept for a sequence of length `seq_len`."""
        if self.k is not None:
            return self.k
        if seq_len is None:
            raise ValueError("KMaxPooling with k=None needs a statically known sequence length")
        # Plain Python arithmetic on the static shape: no tensor has to be converted
        # back to an int, which is only possible in eager execution. Python's round()
        # rounds half to even, the same rule tf.round uses.
        return int(round(seq_len / 2))

    def compute_output_shape(self, input_shape):
        # With k=None the pooled length depends on the input; report it as unknown
        # only when the input length itself is unknown.
        if self.k is None and input_shape[1] is None:
            pooled = None
        else:
            pooled = self._pooled_length(input_shape[1])
        return (input_shape[0], pooled, input_shape[2])

    def call(self,
             inputs):
        k = self._pooled_length(inputs.shape[1])

        # Swap last two dimensions since top_k will be applied along the last dimension
        shifted_inputs = tf.transpose(inputs, [0, 2, 1])

        # Extract top_k, returns two tensors [values, indices]
        top_k = tf.nn.top_k(shifted_inputs, k=k, sorted=self.sorted)[0]

        # Swap the axes back so the result is (batch, k, channels) again
        return tf.transpose(top_k, [0, 2, 1])


class Pooling(layers.Layer):
    """Thin layer that delegates to one of two pooling operations.

    Args:
        pool_type: 'max' for MaxPooling1D (window 3, stride 2, 'same' padding) or
            'k_max' for a KMaxPooling layer with its default settings.
        name: optional layer name passed on to the base class.
    """

    def __init__(self,
                 pool_type='max',
                 name=None):
        super().__init__(name=name)
        assert pool_type in ('max', 'k_max')
        self.pool_type = pool_type

        # Build the wrapped pooling layer once; `call` only forwards to it.
        if pool_type == 'k_max':
            self.pool = KMaxPooling()
        elif pool_type == 'max':
            self.pool = layers.MaxPooling1D(pool_size=3, strides=2, padding='same')

    def call(self,
             x):
        pooled = self.pool(x)
        return pooled


class ZeroPadding(layers.Layer):
    """Pads the last axis of a rank-3 tensor with zeros.

    Args:
        values: two-element sequence (before, after) giving how many zeros are
            added in front of and behind the last axis.
        name: optional layer name passed on to the base class.
    """

    def __init__(self,
                 values,
                 name=None):
        super().__init__(name=name)
        self.values = values

    def call(self,
             x):
        pad_before, pad_after = self.values[0], self.values[1]
        # The first two axes are left untouched; only the last one grows.
        paddings = [[0, 0], [0, 0], [pad_before, pad_after]]
        return tf.pad(x, paddings, mode='CONSTANT', constant_values=0)


class Conv1D_BN(layers.Layer):
    """Conv1D followed by BatchNormalization.

    Args:
        filters: number of convolution filters.
        kernel_size: convolution window (default 3).
        strides: convolution stride (default 2).
        padding: padding mode handed to Conv1D.
        use_bias: whether the convolution has a bias term.
        name: optional layer name passed on to the base class.
    """

    def __init__(self,
                 filters,
                 kernel_size=3,
                 strides=2,
                 padding='same',
                 use_bias=True,
                 name=None):
        super().__init__(name=name)
        self.filters = filters
        self.use_bias = use_bias
        self.conv = layers.Conv1D(
            filters,
            kernel_size,
            strides=strides,
            padding=padding,
            use_bias=use_bias,
            kernel_initializer='he_normal',
        )
        self.bn = layers.BatchNormalization()

    def call(self,
             x):
        # Convolution first, normalisation second.
        return self.bn(self.conv(x))


class ConvBlock(layers.Layer):
    """Two Conv1D+BatchNorm layers with optional downsampling, shortcut and projection.

    Args:
        filters: number of filters of both convolutions.
        kernel_size: window size of both convolutions.
        use_bias: whether the convolutions have a bias term.
        shortcut: if truthy, the (optionally downsampled) block input is added to
            the convolution output before the final ReLU.
        pool_type: None (no downsampling), 'conv' (the first convolution uses
            stride 2) or 'max' / 'k_max' (a Pooling layer after the second BatchNorm).
        proj_type: None, 'conv' (1x1 Conv1D_BN producing 2 * filters channels) or
            'identity' (zero-pad the channel axis with `filters` extra channels).
            Only applied when `shortcut` is truthy.
        name: optional layer name passed on to the base class.
    """

    def __init__(self,
                 filters,
                 kernel_size=3,
                 use_bias=True,
                 shortcut=True,
                 pool_type=None,
                 proj_type=None,
                 name=None,
                 ):
        super(ConvBlock, self).__init__(name=name)
        self.filters = filters
        self.kernel_size = kernel_size
        self.use_bias = use_bias
        self.shortcut = shortcut
        self.pool_type = pool_type
        self.proj_type = proj_type

        # Always define the optional sub-layers. Previously `downsample` and `proj`
        # only existed on some constructor paths, so touching them on an instance
        # built with a falsy `shortcut` raised AttributeError.
        self.pool = None
        self.downsample = None
        self.proj = None

        # Deal with downsample and pooling
        assert pool_type in ['max', 'k_max', 'conv', None]
        if pool_type is None:
            strides = 1

        elif pool_type == 'conv':
            strides = 2  # Convolutional pooling with stride 2
            if shortcut:
                # The residual has to be shortened the same way as the main path.
                self.downsample = Conv1D_BN(filters, 3, strides=2, padding='same', use_bias=use_bias)

        else:
            strides = 1
            self.pool = Pooling(pool_type)
            if shortcut:
                # The residual has to be shortened the same way as the main path.
                self.downsample = Conv1D_BN(filters, 3, strides=2, padding='same', use_bias=use_bias)

        self.conv1 = layers.Conv1D(filters, kernel_size, strides=strides, padding='same', use_bias=use_bias,
                                   kernel_initializer='he_normal')
        self.bn1 = layers.BatchNormalization()

        self.conv2 = layers.Conv1D(filters, kernel_size, strides=1, padding='same', use_bias=use_bias,
                                   kernel_initializer='he_normal')
        self.bn2 = layers.BatchNormalization()

        assert proj_type in ['identity', 'conv', None]
        if shortcut:
            if proj_type == 'conv':
                # 1x1 conv for projection
                self.proj = Conv1D_BN(filters * 2, 1, strides=1, padding='same', use_bias=use_bias)

            elif proj_type == 'identity':
                # Identity using zero padding
                self.proj = ZeroPadding([int(filters // 2), filters - int(filters // 2)])

    def call(self,
             x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = tf.nn.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.pool is not None:
            out = self.pool(out)

        if self.shortcut:
            if self.downsample is not None:
                residual = self.downsample(residual)
            out += residual

        out = tf.nn.relu(out)

        # `proj` is only built when a shortcut and a projection type were requested.
        if self.proj is not None:
            out = self.proj(out)

        return out


class VDCNN(Model):
    """VDCNN with a single regression head.

    Args:
        depth: depth of VDCNN, one of [9, 17, 29, 49] (a key of N_BLOCKS).
        vocab_size: size of the character vocabulary of the embedding layer.
        seqlen: Sequence length.
        embed_dim: dim for character embeddings.
        shortcut: Use skip connections.
        pool_type: Pooling operations to be used, one of ['max', 'k_max', 'conv'].
        proj_type: Operation to increase dim for dotted skip connection, one of ['identity', 'conv'].
        use_bias: Use bias for all layers or not.
        logits: stored on the instance as given; `call` does not read it and
            always returns the raw output of the linear head.
    """

    def __init__(self,
                 depth=9,
                 vocab_size=69,
                 seqlen=None,
                 embed_dim=16,
                 shortcut=True,
                 pool_type='max',
                 proj_type='conv',
                 use_bias=True,
                 logits=True):
        super(VDCNN, self).__init__()
        self.depth = depth
        self.vocab_size = vocab_size
        self.seqlen = seqlen
        self.embed_dim = embed_dim
        self.shortcut = shortcut
        self.pool_type = pool_type
        self.proj_type = proj_type
        self.use_bias = use_bias
        # Was hard-coded to True, silently discarding the constructor argument.
        self.logits = logits

        assert pool_type in ['max', 'k_max', 'conv']
        assert proj_type in ['conv', 'identity']
        self.n_blocks = N_BLOCKS[depth]

        self.embed_char = layers.Embedding(vocab_size, embed_dim, input_length=seqlen)
        self.conv = layers.Conv1D(64, 3, strides=1, padding='same', use_bias=use_bias,
                                  kernel_initializer='he_normal')

        # Each of the first three stages consists of n - 1 plain blocks followed by one
        # block that downsamples (pool_type) and widens the channels (proj_type).

        # Convolutional Block 64
        self.conv_block_64 = []
        for _ in range(self.n_blocks[0] - 1):
            self.conv_block_64.append(ConvBlock(64, 3, use_bias, shortcut))
        self.conv_block_64.append(ConvBlock(64, 3, use_bias, shortcut, pool_type=pool_type, proj_type=proj_type))

        # Convolutional Block 128
        self.conv_block_128 = []
        for _ in range(self.n_blocks[1] - 1):
            self.conv_block_128.append(ConvBlock(128, 3, use_bias, shortcut))
        self.conv_block_128.append(ConvBlock(128, 3, use_bias, shortcut, pool_type=pool_type, proj_type=proj_type))

        # Convolutional Block 256
        self.conv_block_256 = []
        for _ in range(self.n_blocks[2] - 1):
            self.conv_block_256.append(ConvBlock(256, 3, use_bias, shortcut))
        self.conv_block_256.append(ConvBlock(256, 3, use_bias, shortcut, pool_type=pool_type, proj_type=proj_type))

        # Convolutional Block 512 (the last stage neither downsamples nor projects)
        self.conv_block_512 = []
        for _ in range(self.n_blocks[3] - 1):
            self.conv_block_512.append(ConvBlock(512, 3, use_bias, shortcut))
        self.conv_block_512.append(ConvBlock(512, 3, use_bias, shortcut, pool_type=None, proj_type=None))

        self.k_maxpool = KMaxPooling(k=8)
        self.flatten = layers.Flatten()

        # Dense layers
        self.fc1 = layers.Dense(2048, activation='relu')
        self.fc2 = layers.Dense(2048, activation='relu')
        self.out = layers.Dense(1, name="out")
        # self.out = layers.Dense(2)

    def call(self,
             x):
        x = self.embed_char(x)
        # print('embed:', x.shape)
        x = self.conv(x)
        # print('conv:', x.shape)

        for block in self.conv_block_64:
            x = block(x)
        # print('conv_block_64:', x.shape)

        for block in self.conv_block_128:
            x = block(x)
        # print('conv_block_128:', x.shape)

        for block in self.conv_block_256:
            x = block(x)
        # print('conv_block_256:', x.shape)

        for block in self.conv_block_512:
            x = block(x)
        # print('conv_block_512:', x.shape)

        x = self.k_maxpool(x)
        # print('k_maxpool_8:', x.shape)
        x = self.flatten(x)
        # print('flatten:', x.shape)

        x = self.fc1(x)
        x = self.fc2(x)
        out = self.out(x)
        # outSize = self.outSize(x)
        # out = self.out(x)
        # print('out:', out.shape)

        return out

### Hyperparameter

In [None]:
# Configuration for the depth-9 VDCNN. All four hyperparameter cells assign the same
# global names, so whichever of them ran last determines what the "Compiling" cells build.
# TensorBoard logs go into a timestamped sub-directory.
log_dir = "runs/vdcnn/depth9/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Hyperparameters
DEPTH = 9  # must be a key of N_BLOCKS
EMBED_DIM = 256
SHORTCUT = True
POOL_TYPE = 'k_max'
PROJ_TYPE = 'conv'
USE_BIAS = False
OPTIMIZER = "adam"

In [None]:
# Configuration for the depth-17 VDCNN. All four hyperparameter cells assign the same
# global names, so whichever of them ran last determines what the "Compiling" cells build.
# TensorBoard logs go into a timestamped sub-directory.
log_dir = "runs/vdcnn/depth17/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Hyperparameters
DEPTH = 17  # must be a key of N_BLOCKS
EMBED_DIM = 256
# A real boolean, as in the depth-9 configuration. The string 'True' only worked
# because any non-empty string is truthy -- 'False' would have enabled shortcuts too.
SHORTCUT = True
POOL_TYPE = 'k_max'
PROJ_TYPE = 'identity'
USE_BIAS = True
OPTIMIZER = 'adam'

In [None]:
# Configuration for the depth-29 VDCNN. All four hyperparameter cells assign the same
# global names, so whichever of them ran last determines what the "Compiling" cells build.
# TensorBoard logs go into a timestamped sub-directory.
log_dir = "runs/vdcnn/depth29/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Hyperparameters
DEPTH = 29  # must be a key of N_BLOCKS
EMBED_DIM = 256
# A real boolean, as in the depth-9 configuration. The string 'True' only worked
# because any non-empty string is truthy -- 'False' would have enabled shortcuts too.
SHORTCUT = True
POOL_TYPE = 'k_max'
PROJ_TYPE = 'identity'
USE_BIAS = True
OPTIMIZER = 'adam'

In [None]:
# Configuration for the depth-49 VDCNN. All four hyperparameter cells assign the same
# global names, so whichever of them ran last determines what the "Compiling" cells build.
# TensorBoard logs go into a timestamped sub-directory.
log_dir = "runs/vdcnn/depth49/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Hyperparameters
DEPTH = 49  # must be a key of N_BLOCKS
EMBED_DIM = 256
# A real boolean, as in the depth-9 configuration. The string 'True' only worked
# because any non-empty string is truthy -- 'False' would have enabled shortcuts too.
SHORTCUT = True
POOL_TYPE = 'conv'
PROJ_TYPE = 'conv'
USE_BIAS = False
OPTIMIZER = 'adam'

### Compiling

In [None]:
# Full
# Builds and compiles the two-output VDCNN. This requires that the "Full" class-definition
# cell was the last one to define VDCNN and that one hyperparameter cell has been run
# (it provides log_dir, DEPTH, EMBED_DIM, SHORTCUT, POOL_TYPE, PROJ_TYPE, USE_BIAS, OPTIMIZER).
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
BATCH_SIZE = 256
EPOCHS = 50
vocab_size = len(tokenizer.word_index) + 1

# Model
model = VDCNN(depth=DEPTH,
              vocab_size=vocab_size,
              seqlen=max_len,
              embed_dim=EMBED_DIM,
              shortcut=SHORTCUT,
              pool_type=POOL_TYPE,
              proj_type=PROJ_TYPE,
              use_bias=USE_BIAS)
# NOTE(review): "output_1" / "output_2" are presumably the names Keras assigns to the two
# tensors returned by the subclassed model's call() (runtime first, result size second) --
# confirm for the installed Keras version.
model.compile(optimizer=OPTIMIZER, loss={"output_1": 'huber', "output_2": "huber"}, metrics={'output_1': ['mae','mse'], 'output_2':['mae','mse']})

In [None]:
# Single
# Builds and compiles the single-output VDCNN. This requires that the "Single"
# class-definition cell was the last one to define VDCNN and that one hyperparameter cell
# has been run (it provides log_dir, DEPTH, EMBED_DIM, SHORTCUT, POOL_TYPE, PROJ_TYPE,
# USE_BIAS, OPTIMIZER).
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
BATCH_SIZE = 256
EPOCHS = 200
vocab_size = len(tokenizer.word_index) + 1

# Model
model = VDCNN(depth=DEPTH,
              vocab_size=vocab_size,
              seqlen=max_len,
              embed_dim=EMBED_DIM,
              shortcut=SHORTCUT,
              pool_type=POOL_TYPE,
              proj_type=PROJ_TYPE,
              use_bias=USE_BIAS)
model.compile(optimizer=OPTIMIZER, loss='huber', metrics=['mae','mse'])

# Train

## Full

In [None]:
# Train a two-output model on both log-transformed targets (runtime first, result size second).
# With shuffle=False the rows are fed in file order every epoch; per the Keras documentation
# validation_split takes the held-out fraction from the end of the data.
model.fit(padded, [data_runtime, data_resultsize],
          epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=2,
      callbacks=[tensorboard_callback],
      shuffle=False,
      validation_split=0.2)

In [None]:
# Persist whatever is currently bound to `model` as a single HDF5 file.
model.save("models/CNN1.h5")

In [None]:
# Persist whatever is currently bound to `model`; the target name assumes the depth-49
# configuration was the one trained.
model.save("models/vdcnn49")

## Runtime

In [None]:
# Train a single-output model on the log-transformed runtime only.
# With shuffle=False the rows are fed in file order every epoch; per the Keras documentation
# validation_split takes the held-out fraction from the end of the data.
model.fit(padded, data_runtime, 
          epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=2,
      callbacks=[tensorboard_callback],
      shuffle=False,
      validation_split=0.2)

In [None]:
# Persist the runtime model; the target name assumes the depth-49 VDCNN was the one trained.
model.save("models/vdcnn49Rt")

In [None]:
# Persist the runtime model as HDF5; the target name assumes the three-branch CNN was the one trained.
model.save("models/ccnn3Rt.h5")

## Resultsize

In [None]:
# Train a single-output model on the log-transformed result size only.
# With shuffle=False the rows are fed in file order every epoch; per the Keras documentation
# validation_split takes the held-out fraction from the end of the data.
model.fit(padded, data_resultsize,
          epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=2,
      callbacks=[tensorboard_callback],
      shuffle=False,
      validation_split=0.2)

In [None]:
# Persist the result-size model; the target name assumes the depth-17 VDCNN was the one trained.
model.save("models/vdcnn17Card")

In [None]:
# Persist the result-size model as HDF5; the target name assumes the three-branch CNN was the one trained.
model.save("models/ccnn3Card.h5")

# Evaluate

## Load Model

In [None]:
# Replace `model` with a previously saved one.
# NOTE(review): none of the save cells in this notebook writes "models/CNN1Card.h5"
# (they write CNN1.h5, ccnn3Rt.h5 and ccnn3Card.h5) -- confirm the file exists.
model = load_model("models/CNN1Card.h5")

In [None]:
# Replace `model` with the runtime VDCNN written by the matching save cell.
model = load_model("models/vdcnn49Rt")

## Load Evaluation Data

In [None]:
# Route the evaluation results to logs/ccnn.log.
# The logging module is reloaded first, presumably so that basicConfig takes effect even
# if logging was already configured earlier in this kernel -- TODO confirm.
reload(logging)
logging.basicConfig(filename='logs/ccnn.log', level=logging.DEBUG, format="%(asctime)s    %(message)s",
                              datefmt="%H:%M")

In [None]:
# Route the evaluation results to logs/vdcnn49.log.
# The logging module is reloaded first, presumably so that basicConfig takes effect even
# if logging was already configured earlier in this kernel -- TODO confirm.
reload(logging)
logging.basicConfig(filename='logs/vdcnn49.log', level=logging.DEBUG, format="%(asctime)s    %(message)s",
                              datefmt="%H:%M")

In [21]:
# Evaluate model on test data
#
# The test set has to go through exactly the preprocessing the model was trained with.
# Fitting a fresh Tokenizer on the test statements can assign different indices to the
# same characters, and taking the log-transform offsets from the test set puts targets
# and back-transformed predictions on a different scale than the one the model learned.
# Both are therefore derived from the training set here, which also makes this cell
# independent of whether the training-data cell has been run in the current kernel.
train_data = pd.read_csv(f"{path}train.csv", on_bad_lines="skip", encoding="latin-1", lineterminator="\n",
                         dtype={"statement": str, "runtime": float, "resultsize": int, "yy": int,
                                "mm:": int, "dd": int}, memory_map=True)
data = pd.read_csv(f"{path}test.csv", on_bad_lines="skip", encoding="latin-1", lineterminator="\n",
                                dtype={"statement": str, "runtime": float, "resultsize": int, "yy": int,
                                       "mm:": int, "dd": int}, memory_map=True)
print(len(data))

# Tokenize input SQL statements with the training vocabulary
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(train_data["statement"])

# Pad input sequences
max_len = 512
sequences = tokenizer.texts_to_sequences(data["statement"])
padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

# Offsets of the log transform, taken from the training targets
min_runtime = min(train_data["runtime"])
min_resultsize = min(train_data["resultsize"])

# A test value below the training minimum would push the log argument below 1 (or even
# to <= 0, giving -inf/NaN); such values are clamped to the training minimum, i.e. log(1) = 0.
data_runtime = np.log(np.maximum(data["runtime"] + 1 - min_runtime, 1))
data_resultsize = np.log(np.maximum(data["resultsize"] + 1 - min_resultsize, 1))

3315


## Full

In [None]:
# Loss and metrics of a two-output model on the test set, returned as a dict
# (return_dict=True). Writing the result to the log is currently disabled.
result = model.evaluate(padded, [data_runtime, data_resultsize], return_dict=True) # [data_runtime, data_resultsize]
# logging.info(result)

In [None]:
# Two-output model: preds[0] holds the runtime predictions, preds[1] the result-size ones.
preds = model.predict(padded)

# convert to actual predictions (inverse of log(value + 1 - min))
pred_time = [np.exp(row[0]) - 1 + min_runtime for row in preds[0]]
pred_size = [np.exp(row[0]) - 1 + min_resultsize for row in preds[1]]

# Q-error = max(prediction / truth, truth / prediction); result sizes are floored at 1 on both sides.
qerror_time = [
    max(pred_time[i] / data["runtime"][i], data["runtime"][i] / pred_time[i])
    for i in range(len(data["runtime"]))
]
qerror_size = [
    max(max(pred_size[i], 1) / max(1, data["resultsize"][i]),
        max(1, data["resultsize"][i]) / max(1, pred_size[i]))
    for i in range(len(data["resultsize"]))
]

for (qerror, name) in [(qerror_time, "runtime"), (qerror_size, "resultsize")]:
    logging.info("")
    logging.info(f"Qerror for {name}")
    logging.info("Median: {}".format(np.median(qerror)))
    logging.info("Mean: {}".format(np.mean(qerror)))
    logging.info("Max: {}".format(np.max(qerror)))
    # One log line per percentile, in ascending order.
    for pct in (10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98):
        logging.info("{}th percentile: {}".format(pct, np.percentile(qerror, pct)))

## Evaluation Runtime

In [None]:
# Loss and metrics of a single-output runtime model on the test set, written to the log as a dict.
result = model.evaluate(padded, data_runtime, return_dict=True)
logging.info(result)

In [None]:
# Single-output model: every row of preds holds one log-space runtime prediction.
preds = model.predict(padded)

# convert to actual predictions (inverse of log(value + 1 - min))
pred_time = [np.exp(row[0]) - 1 + min_runtime for row in preds]
# Q-error = max(prediction / truth, truth / prediction)
qerror = [
    max(pred_time[i] / data["runtime"][i], data["runtime"][i] / pred_time[i])
    for i in range(len(data["runtime"]))
]

logging.info("")
logging.info("Qerror for runtime")
logging.info("Median: {}".format(np.median(qerror)))
logging.info("Mean: {}".format(np.mean(qerror)))
logging.info("Max: {}".format(np.max(qerror)))
# One log line per percentile, in ascending order.
for pct in (10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98):
    logging.info("{}th percentile: {}".format(pct, np.percentile(qerror, pct)))

## Evaluation Resultsize

In [None]:
# Loss and metrics of a single-output result-size model on the test set, written to the log as a dict.
result = model.evaluate(padded, data_resultsize, return_dict=True)
logging.info(result)

In [None]:
# Single-output model: every row of preds holds one log-space result-size prediction.
preds = model.predict(padded)

# convert to actual predictions (inverse of log(value + 1 - min))
pred_size = [np.exp(row[0]) - 1 + min_resultsize for row in preds]
# Q-error = max(prediction / truth, truth / prediction); both sides are floored at 1.
qerror = [
    max(max(pred_size[i], 1) / max(1, data["resultsize"][i]),
        max(1, data["resultsize"][i]) / max(1, pred_size[i]))
    for i in range(len(data["resultsize"]))
]

logging.info("")
logging.info("Qerror for resultsize")
logging.info("Median: {}".format(np.median(qerror)))
logging.info("Mean: {}".format(np.mean(qerror)))
logging.info("Max: {}".format(np.max(qerror)))
# One log line per percentile, in ascending order.
for pct in (10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98):
    logging.info("{}th percentile: {}".format(pct, np.percentile(qerror, pct)))