## Process data

In [None]:
buggy_data = ['((x + y) >= (z - 1))',
              '(a && b)',
              '(c > 0)',
              'd',
              '(e > f)']
fixed_data = ['((x + y) > (z - 1))',
              '(a && !(b))',
              '(c > 1)',
              '!(d)',
              '(f > e)']

buggy_codes = [list(x.replace(' ', '')) for x in buggy_data]
fixed_codes = [['<soc>']+list(x.replace(' ', ''))+['<eoc>'] for x in fixed_data]

print("Buggy codes:-")
for x in buggy_codes:
    print(x)
print("====================")
print("Fixed codes:-")
for x in fixed_codes:
    print(x)
    
vocab = set([x for y in buggy_codes for x in y]+[x for y in fixed_codes for x in y])
token_int_map = dict([(token, i+1) for i, token in enumerate(vocab)])
vocab.add('<pad/unknown>')
token_int_map['<pad/unknown>'] = 0
int_token_map = dict((i, token) for token, i in token_int_map.items())

print(int_token_map)

vocab_size = len(vocab)
max_buggy_len = max([len(txt) for txt in buggy_codes])
max_fixed_len = max([len(txt) for txt in fixed_codes])
num_dps = len(fixed_codes)

print('Number of data points:', num_dps)
print('Vocabulary size:', vocab_size)
print('Max length in buggy codes:', max_buggy_len)
print('Max length in fixed codes:', max_fixed_len)


import numpy as np


buggy_inputs =  np.zeros((num_dps, max_buggy_len), dtype='int32')
fixed_inputs =  np.zeros((num_dps, max_fixed_len), dtype='int32')
fixed_outputs = np.zeros((num_dps, max_fixed_len, vocab_size), dtype='float32')

for i, (buggy, fixed) in enumerate(zip(buggy_codes, fixed_codes)):
    for t, token in enumerate(buggy):
        buggy_inputs[i, t] = token_int_map[token]
    for t, token in enumerate(fixed):
        int_value = token_int_map[token]
        fixed_inputs[i, t] = int_value
        if t > 0:
            fixed_outputs[i, t-1, int_value] = 1.
    fixed_outputs[i, t, 0] = 1.

## LSTM Encoder Decoder

In [1]:
from keras.layers import Input, Embedding, LSTM, Dense, dot, Activation, concatenate
from keras.models import Model


def build_lstm_encoder_decoder(dimension, v_size, buggy_len, fixed_len):
    # Encoder
    buggy_input_layer = Input(shape=(buggy_len,))
    enc_embed_lay = Embedding(v_size, dimension,  mask_zero=True)(buggy_input_layer)
    encoder_outputs, state_h, state_c = LSTM(dimension, return_sequences=True, return_state=True)(enc_embed_lay)
    # Decoder
    fixed_input_layer = Input(shape=(fixed_len,))
    dec_embed_lay = Embedding(v_size, dimension, mask_zero=True)(fixed_input_layer)
    decoder_outputs = LSTM(dimension, return_sequences=True)(dec_embed_lay, initial_state=[state_h, state_c])
    # Attention
    attention = dot([decoder_outputs, encoder_outputs], axes=[2, 2])
    attention = Activation('softmax', name='attention')(attention)
    context = dot([attention, encoder_outputs], axes=[2, 1])
    decoder_combined_context = concatenate([context, decoder_outputs])
    attention_context_output = Dense(dimension, activation="tanh")(decoder_combined_context)
    # Model output
    model_output = Dense(v_size, activation="softmax")(attention_context_output)
    # Build & compile model
    enc_dec = Model([buggy_input_layer, fixed_input_layer], model_output)
    enc_dec.compile(optimizer='rmsprop', loss='categorical_crossentropy')
    
    return enc_dec


%matplotlib inline
from keras.utils.vis_utils import plot_model
from IPython.display import Image


latent_dim = 512

encoder_decoder = build_lstm_encoder_decoder(latent_dim, vocab_size, max_buggy_len, max_fixed_len)
plot_model(encoder_decoder, to_file='lstm_encoder_decoder.png', show_shapes=True, show_layer_names=True)
encoder_decoder.summary()

epochs = 20

encoder_decoder.fit([buggy_inputs, fixed_inputs], fixed_outputs, epochs=epochs)


def generate_fixed_ints(enc_dec, bugs, fixed_len, token_map, int_map):
    gntd_ints = np.zeros(shape=(len(bugs), fixed_len))
    gntd_ints[:, 0] = token_map["<soc>"]
    for buggy, generated in zip(bugs, gntd_ints):
        buggy_input = buggy[np.newaxis]
        gntd_in_out = generated[np.newaxis]
        for i in range(1, fixed_len):
            prediction = enc_dec.predict([buggy_input, gntd_in_out]).argmax(axis=2)
            if int_map[prediction[:, i][0]] == "<eoc>":
                break
            generated[i] = prediction[:, i]
    
    return gntd_ints


def decode_ints(int_matrix, int_map):
    gntd_codes = []
    for ints in int_matrix:
        code = [int_map[x] for x in ints if x != 0]
        gntd_codes.append(code)
        
    return gntd_codes


print('=============')
print('=============')
print('=============')
generated_ints = generate_fixed_ints(encoder_decoder, buggy_inputs, max_fixed_len, token_int_map, int_token_map)
generated_codes = decode_ints(generated_ints, int_token_map)
for buggy, fixed, gnrtd in zip(buggy_codes, fixed_codes, generated_codes):
    print('=============')
    print('Buggy code:', ' '.join(buggy[1:-1]))
    print('Fixed code:', ' '.join(fixed[1:-1]))
    print('Genration: ', ' '.join(gnrtd[1:]))

Buggy codes:-
['(', '(', 'x', '+', 'y', ')', '>', '=', '(', 'z', '-', '1', ')', ')']
['(', 'a', '&', '&', 'b', ')']
['(', 'c', '>', '0', ')']
['d']
['(', 'e', '>', 'f', ')']
Fixed codes:-
['<soc>', '(', '(', 'x', '+', 'y', ')', '>', '(', 'z', '-', '1', ')', ')', '<eoc>']
['<soc>', '(', 'a', '&', '&', '!', '(', 'b', ')', ')', '<eoc>']
['<soc>', '(', 'c', '>', '1', ')', '<eoc>']
['<soc>', '!', '(', 'd', ')', '<eoc>']
['<soc>', '(', 'f', '>', 'e', ')', '<eoc>']
{1: '<soc>', 2: '=', 3: '!', 4: 'd', 5: '0', 6: 'b', 7: '-', 8: ')', 9: '>', 10: '+', 11: '<eoc>', 12: 'a', 13: '(', 14: 'x', 15: '1', 16: 'c', 17: 'e', 18: 'z', 19: 'y', 20: 'f', 21: '&', 0: '<pad/unknown>'}
Number of data points: 5
Vocabulary size: 22
Max length in buggy codes: 14
Max length in fixed codes: 15


Using TensorFlow backend.
W0913 23:20:41.585980 140029570557696 deprecation_wrapper.py:119] From /home/aziz/anaconda3/envs/tf/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0913 23:20:41.596524 140029570557696 deprecation_wrapper.py:119] From /home/aziz/anaconda3/envs/tf/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0913 23:20:41.659328 140029570557696 deprecation_wrapper.py:119] From /home/aziz/anaconda3/envs/tf/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0913 23:20:41.972396 140029570557696 deprecation.py:323] From /home/aziz/anaconda3/envs/tf/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:2974: add_dispatch_support.<locals>.wrapper (fro

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 14)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 15)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 14, 512)      11264       input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 15, 512)      11264       input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LS

W0913 23:20:43.750517 140029570557696 deprecation_wrapper.py:119] From /home/aziz/anaconda3/envs/tf/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Buggy code: ( x + y ) > = ( z - 1 )
Fixed code: ( ( x + y ) > ( z - 1 ) )
Genration:  ( x + y y ) )
Buggy code: a & & b
Fixed code: ( a & & ! ( b ) )
Genration:  ( a & & ! ( b ) )
Buggy code: c > 0
Fixed code: ( c > 1 )
Genration:  ( c > > )
Buggy code: 
Fixed code: ! ( d )
Genration:  ! ( d )
Buggy code: e > f
Fixed code: ( f > e )
Genration:  ( f > > )


## GANs

In [1]:
from keras.layers import Input, Concatenate, Embedding, LSTM, Dense, dot, Activation, concatenate, Lambda
from keras.models import Model
from keras.backend import argmax, cast


def build_discriminator(dimension, v_size, buggy_len, fixed_len):
    buggy_input_layer = Input(shape=(buggy_len,))
    fixed_input_layer = Input(shape=(fixed_len,))
    concatted = Concatenate()([buggy_input_layer, fixed_input_layer])
    embed_lay = Embedding(v_size, dimension, mask_zero=True)(concatted)
    x = LSTM(dimension)(embed_lay)
    out = Dense(1, activation='sigmoid')(x)
    disc = Model([buggy_input_layer, fixed_input_layer], out)
    disc.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'], loss_weights=[0.5])
    
    return disc


def build_generator(dimension, v_size, buggy_len, fixed_len):
    # Encoder
    buggy_input_layer = Input(shape=(buggy_len,))
    enc_embed_lay = Embedding(v_size, dimension, mask_zero=True)(buggy_input_layer)
    encoder_outputs, state_h, state_c = LSTM(dimension, return_sequences=True, return_state=True)(enc_embed_lay)
    # Decoder
    fixed_input_layer = Input(shape=(fixed_len,))
    dec_embed_lay = Embedding(v_size, dimension, mask_zero=True)(fixed_input_layer)
    decoder_outputs = LSTM(dimension, return_sequences=True)(dec_embed_lay, initial_state=[state_h, state_c])
    # Attention
    attention = dot([decoder_outputs, encoder_outputs], axes=[2, 2])
    attention = Activation('softmax', name='attention')(attention)
    context = dot([attention, encoder_outputs], axes=[2, 1])
    decoder_combined_context = concatenate([context, decoder_outputs])
    attention_context_output = Dense(dimension, activation="tanh")(decoder_combined_context)
    # Model output
    model_output = Dense(v_size, activation="softmax")(attention_context_output)
    # Build model
    gen = Model([buggy_input_layer, fixed_input_layer], model_output)
    
    return gen


def build_gan(gen, disc, buggy_len, fixed_len):
    disc.trainable = False
    buggy_input_layer = Input(shape=(buggy_len,))
    fixed_input_layer = Input(shape=(fixed_len,))
    gen_out = gen([buggy_input_layer, fixed_input_layer])
    argmax_layer = Lambda(lambda x: cast(argmax(x, axis=2), dtype='float32'))
    disc_out = disc([buggy_input_layer, argmax_layer(gen_out)])
    gan = Model([buggy_input_layer, fixed_input_layer], [disc_out, gen_out])
    # compile model
    gan.compile(loss=['binary_crossentropy', 'categorical_crossentropy'], optimizer='rmsprop', loss_weights=[1, 100])
    
    return gan


%matplotlib inline
from keras.utils.vis_utils import plot_model
from IPython.display import Image


latent_dim = 512

discriminator = build_discriminator(latent_dim, vocab_size, max_buggy_len, max_fixed_len)
plot_model(discriminator, to_file='discriminator_model_plot.png', show_shapes=True, show_layer_names=True)
# Image('discriminator_model_plot.png')

generator = build_generator(latent_dim, vocab_size, max_buggy_len, max_fixed_len)
plot_model(generator, to_file='generator_model_plot.png', show_shapes=True, show_layer_names=True)
# Image('generator_model_plot.png')

gan = build_gan(generator, discriminator, max_buggy_len, max_fixed_len)
plot_model(gan, to_file='gan_model_plot.png', show_shapes=True, show_layer_names=True)
# gan.summary()
# Image('gan_model_plot.png')


def generate_fixed_ints(gen, bugs, fixed_len, token_map, int_map):
    gntd_ints = np.zeros(shape=(len(bugs), fixed_len))
    gntd_ints[:, 0] = token_map["<soc>"]
    for buggy, generated in zip(bugs, gntd_ints):
        buggy_input = buggy[np.newaxis]
        gntd_in_out = generated[np.newaxis]
        for i in range(1, fixed_len):
            prediction = gen.predict([buggy_input, gntd_in_out]).argmax(axis=2)
            if int_map[prediction[:, i][0]] == "<eoc>":
                break
            generated[i] = prediction[:, i]
    
    return gntd_ints


epochs = 20

for e in range(epochs):
    discriminator.fit([buggy_inputs, fixed_inputs], np.ones(num_dps))
    generated_ints = generate_fixed_ints(generator, buggy_inputs, max_fixed_len, token_int_map, int_token_map)
    discriminator.fit([buggy_inputs, generated_ints], np.zeros(num_dps))
    gan.fit([buggy_inputs, fixed_inputs], [np.ones(num_dps), fixed_outputs])


def decode_ints(int_matrix, int_map):
    gntd_codes = []
    for ints in int_matrix:
        code = [int_map[x] for x in ints if x != 0]
        gntd_codes.append(code)
        
    return gntd_codes


print('=============')
print('=============')
print('=============')
generated_ints = generate_fixed_ints(generator, buggy_inputs, max_fixed_len, token_int_map, int_token_map)
generated_codes = decode_ints(generated_ints, int_token_map)
for buggy, fixed, gnrtd in zip(buggy_codes, fixed_codes, generated_codes):
    print('=============')
    print('Buggy code:', ' '.join(buggy[1:-1]))
    print('Fixed code:', ' '.join(fixed[1:-1]))
    print('Genration: ', ' '.join(gnrtd[1:]))

Buggy codes:-
['(', '(', 'x', '+', 'y', ')', '>', '=', '(', 'z', '-', '1', ')', ')']
['(', 'a', '&', '&', 'b', ')']
['(', 'c', '>', '0', ')']
['d']
['(', 'e', '>', 'f', ')']
Fixed codes:-
['<soc>', '(', '(', 'x', '+', 'y', ')', '>', '(', 'z', '-', '1', ')', ')', '<eoc>']
['<soc>', '(', 'a', '&', '&', '!', '(', 'b', ')', ')', '<eoc>']
['<soc>', '(', 'c', '>', '1', ')', '<eoc>']
['<soc>', '!', '(', 'd', ')', '<eoc>']
['<soc>', '(', 'f', '>', 'e', ')', '<eoc>']
{1: 'f', 2: '+', 3: '>', 4: 'e', 5: 'z', 6: 'y', 7: '1', 8: 'x', 9: '=', 10: '<soc>', 11: 'b', 12: '(', 13: '&', 14: 'a', 15: '-', 16: 'd', 17: '0', 18: '<eoc>', 19: ')', 20: 'c', 21: '!', 0: '<pad/unknown>'}
Number of data points: 5
Vocabulary size: 22
Max length in buggy codes: 14
Max length in fixed codes: 15


Using TensorFlow backend.
W0913 23:20:54.030289 140003070756608 deprecation_wrapper.py:119] From /home/aziz/anaconda3/envs/tf/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0913 23:20:54.040325 140003070756608 deprecation_wrapper.py:119] From /home/aziz/anaconda3/envs/tf/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0913 23:20:54.044531 140003070756608 deprecation_wrapper.py:119] From /home/aziz/anaconda3/envs/tf/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0913 23:20:54.357945 140003070756608 deprecation.py:323] From /home/aziz/anaconda3/envs/tf/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:2974: add_dispatch_support.<locals>.wrapper (fro

Epoch 1/1
Epoch 1/1


  'Discrepancy between trainable weights and collected trainable'


Epoch 1/1
Epoch 1/1


  'Discrepancy between trainable weights and collected trainable'


Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Buggy code: ( x + y ) > = ( z - 1 )
Fixed code: ( ( x + y ) > ( z - 1 ) )
Genration:  ( ( ( x + y y ( y ( ) ) ( z
Buggy code: a & & b
Fixed code: ( a & & ! ( b ) )
Genration:  ( a & & ! ( b ) )
Buggy code: c > 0
Fixed code: ( c > 1 )
Genration:  ( c > ) )
Buggy code: 
Fixed code: ! ( d )
Genration:  ! ( d )
Buggy code: e > f
Fixed code: ( f > e )
Genration:  ( f > ) )
