In [8]:
import tensorflow as tf
import numpy as np
import os
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pin the process to one physical GPU. The ordering variable makes the index
# in CUDA_VISIBLE_DEVICES refer to PCI bus order.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
gpu = tf.config.experimental.list_physical_devices(device_type='GPU')
# Enable memory growth on the first visible GPU. The guard keeps the notebook
# importable on a machine where no GPU is visible: `gpu` is then an empty list
# and the former unconditional `gpu[0]` raised IndexError.
if gpu:
    tf.config.experimental.set_memory_growth(gpu[0], True)

# NOTE: result_path has no trailing separator, whereas data_path does. Code
# that concatenates a file name directly onto result_path therefore addresses
# '<...>/result<file name>' rather than a file inside the 'result' directory.
result_path = '/home1/likejun/BadNameDetectionICSE2020/result'
data_path = '/home1/likejun/BadNameDetectionICSE2020/data/'
# One line per sample in each file; load_data() reads both.
method_name_path = data_path + 'ParsedMethodNameTokens_1.txt'
context_path = data_path + 'ParsedMethodContextTokens_1.txt'
# Start / end markers wrapped around every target sequence. The embedded
# spaces make plain string concatenation yield separate words.
mark_start = 'ssss '
mark_end = ' eeee'


def load_data(src=True, start="", end=""):
    """Read one of the two corpus files into a list with one string per line.

    Args:
        src: when true, read the file at ``context_path``; otherwise read the
            file at ``method_name_path``.
        start: text prepended to every stripped line.
        end: text appended to every stripped line.

    Returns:
        A new list holding ``start + line.strip() + end`` for each line of the
        selected file, in file order.
    """
    chosen_path = context_path if src else method_name_path
    with open(chosen_path) as handle:
        return [start + row.strip() + end for row in handle]


# Source side: method-context token lines, unmodified.
data_src = load_data(src=True)
# Target side: method-name token lines, each wrapped in the start/end markers.
data_dest = load_data(src=False, start=mark_start, end=mark_end)

# Print the number of lines read from each file.
print(len(data_src))
print(len(data_dest))

# Print the source and target entries stored at the same index.
idx = 10000
print(data_src[idx])
print(data_dest[idx])

# Vocabulary cap handed to both tokenizers; also used as the input dimension
# of both embedding layers and the width of the decoder's output layer.
num_words = 30000


class TokenizerWrap(Tokenizer):
    """Keras ``Tokenizer`` that also keeps the tokenised, padded corpus it was fitted on.

    Attributes set by the constructor:
        index_to_word: inverse of ``word_index`` (integer token -> word).
        tokens: integer sequences for ``texts`` (reversed when ``reverse``).
        num_tokens: length of every sequence in ``tokens``.
        max_tokens: ``int(mean + 2 * std)`` of ``num_tokens``; the length that
            all padded sequences are cut or filled to.
        tokens_padded: ``tokens`` after ``pad_sequences`` with ``max_tokens``.
    """

    def __init__(self, texts, padding, reverse=False, num_words=None):
        """Fit the vocabulary on ``texts`` and pre-compute padded sequences.

        Args:
            texts: list of strings to fit on and convert.
            padding: ``'pre'`` or ``'post'``, forwarded to ``pad_sequences``.
            reverse: when true, every sequence is reversed and over-long
                sequences are truncated at the front instead of the back.
            num_words: vocabulary cap forwarded to the base ``Tokenizer``.
        """
        super().__init__(num_words=num_words,
                         filters='"#$.?@\\^_`~\t\n',
                         split=' ')

        # Build the vocabulary, then its integer -> word inverse.
        self.fit_on_texts(texts)
        self.index_to_word = {index: word
                              for word, index in self.word_index.items()}

        sequences = self.texts_to_sequences(texts)
        if reverse:
            sequences = [seq[::-1] for seq in sequences]
        self.tokens = sequences

        # A reversed sequence is cut at the front, a forward one at the back.
        truncating = 'pre' if reverse else 'post'

        # Length statistics decide the common padded length.
        self.num_tokens = [len(seq) for seq in self.tokens]
        self.max_tokens = int(np.mean(self.num_tokens)
                              + 2 * np.std(self.num_tokens))

        self.tokens_padded = pad_sequences(self.tokens,
                                           maxlen=self.max_tokens,
                                           padding=padding,
                                           truncating=truncating)

    def token_to_word(self, token):
        """Return the word for one integer token; token 0 maps to a single space."""
        if token == 0:
            return " "
        return self.index_to_word[token]

    def tokens_to_string(self, tokens):
        """Join the words of all non-zero tokens with single spaces."""
        return " ".join(self.index_to_word[token]
                        for token in tokens
                        if token != 0)

    def text_to_tokens(self, text, reverse=False, padding=False):
        """Convert one string into an array holding a single token sequence.

        Args:
            text: the string to convert.
            reverse: when true, flip the sequence and truncate at the front
                if padding is applied.
            padding: when true, pad/truncate to ``max_tokens``; padding is
                always added at the front here.

        Returns:
            The array built from ``texts_to_sequences([text])``, optionally
            flipped along axis 1 and optionally padded.
        """
        tokens = np.array(self.texts_to_sequences([text]))

        if reverse:
            tokens = np.flip(tokens, axis=1)

        if padding:
            tokens = pad_sequences(tokens,
                                   maxlen=self.max_tokens,
                                   padding='pre',
                                   truncating='pre' if reverse else 'post')

        return tokens


# Source tokenizer: sequences are reversed and padded at the front.
tokenizer_src = TokenizerWrap(texts=data_src,
                              padding='pre',
                              reverse=True,
                              num_words=num_words)

# Target tokenizer: sequences keep their order and are padded at the back.
tokenizer_dest = TokenizerWrap(texts=data_dest,
                               padding='post',
                               reverse=False,
                               num_words=num_words)

tokens_src = tokenizer_src.tokens_padded
tokens_dest = tokenizer_dest.tokens_padded
print(tokens_src.shape)
print(tokens_dest.shape)

# Integer tokens of the start / end markers. The markers carry a space for
# string concatenation, so it is stripped before the vocabulary lookup.
token_start = tokenizer_dest.word_index[mark_start.strip()]
print(token_start)
token_end = tokenizer_dest.word_index[mark_end.strip()]
print(token_end)

# Sanity check: padded tokens, their decoded text and the raw line for the
# first sample on each side.
idx = 0
print(tokens_src[idx])
print(tokenizer_src.tokens_to_string(tokens_src[idx]))
print(data_src[idx])

print(tokens_dest[idx])
print(tokenizer_dest.tokens_to_string(tokens_dest[idx]))
print(data_dest[idx])

# The decoder input drops the last column and the decoder target drops the
# first, so the target at each position is the token that follows the input
# token at that position.
encoder_input_data = tokens_src
decoder_input_data = tokens_dest[:, :-1]
print(decoder_input_data.shape)

decoder_output_data = tokens_dest[:, 1:]
print(decoder_output_data.shape)

# NOTE(review): the four bare expressions below are evaluated and discarded.
# They are not the final expression of the cell, so nothing is displayed for
# them. The assignment to idx, however, stays in effect for later code.
idx = 22
decoder_input_data[idx]
decoder_output_data[idx]
tokenizer_dest.tokens_to_string(decoder_input_data[idx])
tokenizer_dest.tokens_to_string(decoder_output_data[idx])

# Encoder input: one sequence of integer tokens per sample; the sequence
# length is left unspecified.
encoder_input = Input(shape=(None,), name='encoder_input')

# Length of the vectors produced by the embedding layers.
embedding_size = 128

# Maps integer tokens (vocabulary size num_words) to embedding vectors.
encoder_embedding = Embedding(input_dim=num_words,
                              output_dim=embedding_size,
                              name='encoder_embedding')

# Number of units in every GRU layer; also the size of the decoder's
# initial-state input.
state_size = 512

# Three stacked GRU layers. The first two emit their full output sequence;
# the last emits only its final output (return_sequences=False).
encoder_gru1 = GRU(state_size, name='encoder_gru1',
                   return_sequences=True)
encoder_gru2 = GRU(state_size, name='encoder_gru2',
                   return_sequences=True)
encoder_gru3 = GRU(state_size, name='encoder_gru3',
                   return_sequences=False)


def connect_encoder():
    """Wire the encoder layers together and return the resulting output tensor.

    The module-level ``encoder_input`` is passed through ``encoder_embedding``
    and then through ``encoder_gru1``, ``encoder_gru2`` and ``encoder_gru3``,
    in that order. The value returned by the last GRU layer is the result.
    """
    hidden = encoder_embedding(encoder_input)

    # Feed the running tensor through the GRU stack, first layer to last.
    for gru_layer in (encoder_gru1, encoder_gru2, encoder_gru3):
        hidden = gru_layer(hidden)

    return hidden


# Build the encoder graph once; its output seeds the decoder during training.
encoder_output = connect_encoder()

# Separate input that lets the decoder be driven with an externally supplied
# state vector of length state_size.
decoder_initial_state = Input(shape=(state_size,),
                              name='decoder_initial_state')

# Decoder input: a sequence of integer tokens of unspecified length.
decoder_input = Input(shape=(None,), name='decoder_input')

# The decoder has its own embedding layer with the same dimensions as the
# encoder's.
decoder_embedding = Embedding(input_dim=num_words,
                              output_dim=embedding_size,
                              name='decoder_embedding')

# All three decoder GRU layers emit their full output sequence.
decoder_gru1 = GRU(state_size, name='decoder_gru1',
                   return_sequences=True)
decoder_gru2 = GRU(state_size, name='decoder_gru2',
                   return_sequences=True)
decoder_gru3 = GRU(state_size, name='decoder_gru3',
                   return_sequences=True)

# Linear projection to num_words values per position. No softmax is applied
# here, so the outputs are unnormalised scores.
decoder_dense = Dense(num_words,
                      activation='linear',
                      name='decoder_output')


def connect_decoder(initial_state):
    """Wire the decoder layers together, seeding every GRU with ``initial_state``.

    Args:
        initial_state: tensor passed as ``initial_state`` to each of
            ``decoder_gru1``, ``decoder_gru2`` and ``decoder_gru3``.

    Returns:
        The result of ``decoder_dense`` applied to the output of the GRU
        stack, which is fed from ``decoder_embedding(decoder_input)``.
    """
    sequence = decoder_embedding(decoder_input)

    # The same initial state is handed to all three GRU layers.
    for gru_layer in (decoder_gru1, decoder_gru2, decoder_gru3):
        sequence = gru_layer(sequence, initial_state=initial_state)

    # Project every position onto the vocabulary.
    return decoder_dense(sequence)


# Training graph: the decoder is seeded with the encoder's output.
decoder_output = connect_decoder(initial_state=encoder_output)

# End-to-end model: source tokens and decoder input tokens in, decoder
# output out.
model_train = Model(inputs=[encoder_input, decoder_input],
                    outputs=[decoder_output])

# Stand-alone encoder: source tokens in, encoder output out.
model_encoder = Model(inputs=[encoder_input],
                      outputs=[encoder_output])

# Wire the same decoder layer objects a second time, now seeded from the
# explicit state input. This rebinds the name decoder_output; model_train was
# already built from the previous tensor.
decoder_output = connect_decoder(initial_state=decoder_initial_state)

# Stand-alone decoder: decoder input tokens plus a state vector in, decoder
# output out.
model_decoder = Model(inputs=[decoder_input, decoder_initial_state],
                      outputs=[decoder_output])


# model_train.compile(optimizer=optimizer,
#                     loss='sparse_categorical_crossentropy')

def sparse_cross_entropy(y_true, y_pred):
    """Mean sparse softmax cross-entropy over every position of every sequence.

    Args:
        y_true: integer class labels, forwarded as ``labels``.
        y_pred: unnormalised scores (logits), forwarded as ``logits``; it
            carries one extra trailing axis over the classes.

    Returns:
        A scalar: ``tf.reduce_mean`` of the per-position losses. Reducing
        here, rather than leaving it to Keras, makes the averaging explicit
        over all axes instead of only the batch axis.
    """
    per_position_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=y_pred,
        labels=y_true)

    return tf.reduce_mean(per_position_loss)


# `learning_rate` is the documented argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
optimizer = RMSprop(learning_rate=1e-3)

# The custom loss is used because the decoder's final layer is linear, so the
# model emits logits rather than probabilities.
model_train.compile(optimizer=optimizer,
                    loss=sparse_cross_entropy)

# Write the weights to disk whenever the validation loss improves.
path_checkpoint = '14m_fse19_return_paras_checkpoint.keras'
callback_checkpoint = ModelCheckpoint(filepath=path_checkpoint,
                                      monitor='val_loss',
                                      verbose=1,
                                      save_weights_only=True,
                                      save_best_only=True)

# Stop once the validation loss has not improved for 3 epochs.
callback_early_stopping = EarlyStopping(monitor='val_loss',
                                        patience=3, verbose=1)

# Log for TensorBoard, with histograms and graph export disabled.
callback_tensorboard = TensorBoard(log_dir='./14m_fse19_return_paras_logs/',
                                   histogram_freq=0,
                                   write_graph=False)

# NOTE(review): no call that consumes this list is visible in this part of
# the notebook; presumably it is meant for model_train.fit(...).
callbacks = [callback_early_stopping,
             callback_checkpoint,
             callback_tensorboard]

2964074
2964074
call resolver void debugger breakpoint hit reply step big integer breakpoint address get program counter reply get thread id reply get register values count hit reply get thread id breakpoint address target get debugger single step debug exception e e print stack trace
ssss breakpoint hit eeee
(2964074, 172)
(2964074, 6)
1
2
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    2   15   30    2   44   30
    2   44   30    2  244    1  128    1  603   44   30    2    2   15
    5    1  603   

In [9]:
# Alternative weight files, kept for switching between trained variants.
# model_train.load_weights(result_path + "14m_fse19_v11.hdf5") #TrainingDataAllTokens + NegativeItems
# model_train.load_weights(result_path + "14m_fse19_v10.hdf5") #TrainingDataLT94Tokens
# NOTE(review): result_path has no trailing separator, so this concatenation
# addresses '<...>/result14m_fse19_v9.hdf5' (next to the 'result' directory),
# not a file inside it. The recorded summary output suggests the load
# succeeded, so the file presumably exists at that location; confirm before
# switching to os.path.join.
model_train.load_weights(result_path + "14m_fse19_v9.hdf5") #TrainingDataAllTokens
model_train.summary()


Model: "functional_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, None, 128)    3840000     encoder_input[0][0]              
__________________________________________________________________________________________________
encoder_gru1 (GRU)              (None, None, 512)    986112      encoder_embedding[0][0]          
__________________________________________________________________________________________________
decoder_input (InputLayer)      [(None, None)]       0                                            
_______________________________________________________________________________________

In [10]:
def trans(input_text):
    """Greedily decode an output word sequence for one source text.

    The text is tokenised with ``tokenizer_src`` (reversed and padded) and
    encoded with ``model_encoder``. ``model_decoder`` is then called
    repeatedly, each time taking the arg-max word at the current position and
    feeding it back as the next decoder input. Decoding stops when the end
    token is produced or after ``tokenizer_dest.max_tokens`` steps.

    Args:
        input_text: source string to translate.

    Returns:
        The generated words concatenated into one string, each word preceded
        by a single space. If the end marker was generated, its word is the
        last one in the string; callers strip it themselves.
    """
    input_tokens = tokenizer_src.text_to_tokens(text=input_text,
                                                reverse=True,
                                                padding=True)

    # The encoder output is the initial state of every decoder GRU layer.
    initial_state = model_encoder.predict(input_tokens)

    # Max number of tokens / words in the output sequence.
    max_tokens = tokenizer_dest.max_tokens

    # Decoder input buffer, filled one position per step. The builtin `int`
    # is used because `np.int` was only an alias for it and has been removed
    # from NumPy (deprecated in 1.20, removed in 1.24).
    decoder_input_data = np.zeros(shape=(1, max_tokens), dtype=int)

    # The first input token is the start marker.
    token_int = token_start

    # Words generated so far, in order.
    output_words = []

    # Number of tokens processed so far; also the position being predicted.
    count_tokens = 0

    while token_int != token_end and count_tokens < max_tokens:
        decoder_input_data[0, count_tokens] = token_int

        # Keys must match the names of the decoder model's Input layers.
        x_data = \
            {
                'decoder_initial_state': initial_state,
                'decoder_input': decoder_input_data
            }

        decoder_output = model_decoder.predict(x_data)

        # Scores over the vocabulary at the position just filled in; pick
        # the highest-scoring token.
        token_int = np.argmax(decoder_output[0, count_tokens, :])

        output_words.append(tokenizer_dest.token_to_word(token_int))

        count_tokens += 1

    # Every word is preceded by one space, including the first.
    return "".join(" " + word for word in output_words)


# Smoke test on one training sample.
# NOTE(review): idx is whatever an earlier cell last assigned (22 in the
# code visible above); it is not set in this cell.
print(trans(input_text=data_src[idx]))

 add eeee


In [12]:
import time

# Evaluation sets; exactly one test_path assignment is active at a time.
# test_path = '/home1/likejun/BadNameDetectionICSE2020/test_noreal_more_46_0228/consistent/'
# test_path = '/home1/likejun/BadNameDetectionICSE2020/test_noreal_more_46_0228/inconsistent/'
# test_path = '/home1/likejun/BadNameDetectionICSE2020/test_real_more_46_0228/inconsistent/'
test_path = '/home1/likejun/BadNameDetectionICSE2020/test_real_more_46_0228/consistent/'

all_srcs_test = []

# Read one method-context line per test sample.
path = test_path + 'parsedMethodContextTokens.txt'
with open(path) as file:
    texts = [line.strip() for line in file]
    all_srcs_test.extend(texts)
print(len(all_srcs_test))
# name_list serves only as a counter: one element is appended per processed
# sample and only its length is read.
name_list = []
start = time.time()
# Generate a name for every sample and write one per line, with the end
# marker removed, into a result file next to the test data.
with open(test_path + 'result_TrainingDataAllTokens.txt', 'w') as f:
    for body in all_srcs_test:
        name = trans(input_text=body)
        f.write(name.replace(" eeee", "") + '\n')
        name_list.append(1)
        # After every 1000 samples, print the seconds spent on that batch
        # and restart the timer.
        if len(name_list) % 1000 == 0:
            end = time.time()
            print(end - start)
            start = end

174944
154.95821046829224
152.38123965263367
148.2330687046051
150.30725622177124
149.3212730884552
143.06449341773987
148.83353757858276
147.36180591583252
147.3600218296051
149.93777132034302
145.3573019504547
156.3977484703064
156.72824382781982
149.26941895484924
156.6216962337494
157.04451942443848
155.69642806053162
153.31396222114563
155.64004921913147
150.0942883491516
144.59273838996887
145.64209055900574
138.30864000320435
144.34853506088257
150.1441740989685
148.0886402130127
149.35336637496948
155.14512991905212
151.85818767547607
149.4592661857605
145.37200617790222
141.30109310150146
132.75192070007324
141.29074358940125
143.74160361289978
144.10642266273499
132.3285253047943
144.394366979599
154.7309215068817
146.8138563632965
141.64062595367432
145.3820059299469
150.12429070472717
146.4537718296051
157.56516981124878
156.88175988197327
153.9599528312683
152.94550204277039
151.6874921321869
143.14954280853271
158.46457529067993
150.01848006248474
151.64932250976562
151.3