## Code Completion System

This is a JavaScript Code Prediction System

In [1]:
import json
import random
import tensorflow as tf
import numpy as np
import tflearn
import os

  from ._conv import register_converters as _register_converters


Instructions for updating:
Use the retry module or similar alternatives.


In [2]:
train_dir = 'dataset/programs_800/'
query_dir = 'dataset/programs_200/'
model_file = 'trained_model_parameter'

#### Load tokens from files

In [3]:
def load_tokens(token_dir, is_simplify=True):
    '''
    load token sequence data from input path: token_dir.
    is_simplify: whether or not simplify the value of some variable type(see function for detail)
    return a list whose elements are lists of a token sequence
    '''
    token_files = [] #stored the file's path which ends with 'tokens.json' 
    for f in os.listdir(token_dir):
        file_path = os.path.join(token_dir, f)
        if os.path.isfile(file_path) and f.endswith('_tokens.json'):
            token_files.append(file_path)
            
   #load to a list, element is a token sequence of source code         
    token_lists = [json.load(open(f, encoding='utf-8')) for f in token_files]
    def simplify_token(token):
        '''
        Because there are too many values for type: "Identifier", "String", "Numeric",
        NN may be diffcult to train because of these different value. 
        So this function can transform these types of variables to a common value
        '''
        if token['type'] == 'Identifier':
            token['value'] = 'id'
        elif token['type'] == 'Numeric':
            token['value'] = '1'
        elif token['type'] == 'String':
            token['value'] = 'string'
        else:
            pass
    if is_simplify:
        for token_sequence in token_lists:
            for token in token_sequence:
                simplify_token(token)
    else:
        pass        
    
    return token_lists

#### Machine Learning Model

In [4]:
import time

class Code_Completion_Model:
    '''
    Machine Learning model class, including data processing, encoding, model_building, 
    training, query_testing, model_save, model_load
    '''
    def __init__(self, token_lists):
        '''
        Initialize ML model with training data
        token_lists: [[{type:.., value:..},{..},{..}], [..], [..]]
        '''
        time_begin = time.time()
        self.token_lists = token_lists
        self.tokens_set = set()
        for token_sequence in token_lists:
            for token in token_sequence:
                self.tokens_set.add(self.token_to_string(token))
        self.tokens_list = list(self.tokens_set)
        self.tokens_list.sort()
        self.tokens_size = len(self.tokens_set) #213
        self.index_to_string = {i:s for i, s in enumerate(self.tokens_list)}
        self.string_to_index = {s:i for i, s in enumerate(self.tokens_list)}
        time_end =time.time()
        print('model initialization time cost: ', time_end - time_begin)
    
        
    #data processing functions
    def token_to_string(self, token):
        return token['type'] + '~$$~' + token['value']
    def string_to_token(self, string):
        tokens = string.split('~$$~')
        return {'type':tokens[0], 'value':tokens[1]}
    
    #encoding token sequence as one_hot_encoding
    def one_hot_encoding(self,string):
        vector = [0] * self.tokens_size
        vector[self.string_to_index[string]] = 1
        return vector
    
    #generate X_train data and y_label for ML model
    def data_processing(self):
        '''
        first, transform a token in dict form to a type-value string
        x_data is a token, y_label is the previous token of x_data
        '''
        x_data = []
        y_data = []
        print('data processing is begining...')
        for token_sequence in self.token_lists:#token_sequence of each source code
            for index, token in enumerate(token_sequence):#each token(type_value) in source code
                if index > 0:
                    token_string = self.token_to_string(token)
                    prev_token = self.token_to_string(token_sequence[index - 1])
                    x_data.append(self.one_hot_encoding(prev_token))
                    y_data.append(self.one_hot_encoding(token_string))
        print('data processing is finished..')
        return x_data, y_data
    
    #neural network functions
    def create_NN(self):
        tf.reset_default_graph()
        self.nn = tflearn.input_data(shape=[None, self.tokens_size])
        self.nn = tflearn.fully_connected(self.nn, 128, activation='')
        self.nn = tflearn.fully_connected(self.nn, 128)
        self.nn = tflearn.fully_connected(self.nn, self.tokens_size, activation='softmax')
        self.nn = tflearn.regression(self.nn)
        self.model = tflearn.DNN(self.nn)
    
    #load trained model into object
    def load_model(self, model_file):
        self.create_NN()
        self.model.load(model_file)
    
    #training ML model
    def train(self):
        time_begin = time.time()
        x_data, y_data = self.data_processing()
        time_end = time.time()
        print('data processing time cost: ', time_end - time_begin)
        self.create_NN()
        time_begin = time.time()
        self.model.fit(x_data, y_data, n_epoch=1, batch_size=500, show_metric = True)
        time_end = time.time()
        print('training time cost: ', time_end - time_begin)
        return time_end - time_begin
        
    #save trained model to model path
    def save_model(self, model_file):
        self.model.save(model_file)
        
    #query test
    def query_test(self, prefix, suffix):
        '''
        Input: all tokens before the hole token(prefix) and all tokens after the hole token,
        ML model will predict the most probable token in the hole
        In this function, use only one token before hole token to predict
        return: the most probable token 
        '''
        prev_token_string = self.token_to_string(prefix[-1])
        x = self.one_hot_encoding(prev_token_string)
        y = self.model.predict([x])
        predicted_seq = y[0]
        if type(predicted_seq) is np.ndarray:
            predicted_seq = predicted_seq.tolist()
        best_number = predicted_seq.index(max(predicted_seq))
        best_string = self.index_to_string[best_number]
        best_token = self.string_to_token(best_string)
        return [best_token]
        

#### Training model

In [None]:
start_time = time.time()
dataset = load_tokens(train_dir)
code_completion = Code_Completion_Model(dataset)
use_stored_model = False
if use_stored_model:
    code_completion.load_model(model_file)
else:
    train_time = code_completion.train()
    code_completion.save_model(model_file)
    
end_time = time.time()
print('total time cost: %.2f s, model training cost: %.2f s'%(end_time-start_time, train_time))

data processing is finished..


#### Query test

In [6]:
def create_hole(tokens, max_hole_size = 2):
    '''
    input: a tokens sequence of source code and max_hole_size
    return: hole token to be predicted (expection)
            token sequence before the hole(prefix)
            token sequence after the hole(suffix)
    '''
    hole_size = min(random.randint(1, max_hole_size), len(tokens) - 1)
    hole_start_index = random.randint(1, len(tokens) - hole_size)
    hole_end_index = hole_start_index + hole_size
    prefix = tokens[0 : hole_start_index]
    expection = tokens[hole_start_index : hole_end_index]
    suffix = tokens[hole_end_index : 0]
    return prefix, expection, suffix

def token_equals(token1, token2):
    '''
    Determining whether input two tokens are equal or not
    '''
    if len(token1) != len(token2):
        return False
    for index, t1 in enumerate(token1):
        t2 = token2[index]
        if t1['type'] != t2['type'] or t1['value'] != t2['value']:
            return False
    return True

In [7]:
query_test_data = load_tokens(query_dir)
correct = 0
correct_token_list = []
incorrect_token_list = []
for tokens in query_test_data:
    prefix, expection, suffix = create_hole(tokens)
    prediction = code_completion.query_test(prefix, suffix)
    if token_equals(prediction, expection):
        correct += 1
        correct_token_list.append({'expection':expection, 'prediction':prediction})
    else:
        incorrect_token_list.append({'expection':expection, 'prediction':prediction})
accuracy = correct / len(query_test_data)
print('query test accuracy: ', accuracy)

query test accuracy:  0.16


In [None]:
print('correct_token_list: \n', correct_token_list[:5])
print('incorrect_token_list: \n', incorrect_token_list[:5])

# Test Module
optimization idea:
- re-implement dnn model with tensorflow(not tflearn)
- using embedding method rather thant one_hot_encoding
- using a deeper and wider network
- using LSTM
- training model not with only one previous token, severl tokens? and following tokens?
- try CNN
- see each source code file as a training batch, do not combine them as a huge training data(for RNN)

In [45]:
import json
import random
import tensorflow as tf
import numpy as np
import tflearn
import os
import pickle
import time


train_dir = 'dataset/programs_800/'
query_dir = 'dataset/programs_200/'
model_file = 'trained_model_parameter'

epoch_num = 1
batch_size = 64
learning_rate = 0.01


def load_tokens(token_dir, is_simplify=True):
    '''
    load token sequence data from input path: token_dir.
    is_simplify: whether or not simplify the value of some variable type(see function for detail)
    return a list whose elements are lists of a token sequence
    '''
    token_files = []  # stored the file's path which ends with 'tokens.json'
    for f in os.listdir(token_dir):
        file_path = os.path.join(token_dir, f)
        if os.path.isfile(file_path) and f.endswith('_tokens.json'):
            token_files.append(file_path)

    # load to a list, element is a token sequence of source code
    token_lists = [json.load(open(f, encoding='utf-8')) for f in token_files]

    def simplify_token(token):
        '''
        Because there are too many values for type: "Identifier", "String", "Numeric",
        NN may be diffcult to train because of these different value.
        So this function can transform these types of variables to a common value
        '''
        if token['type'] == 'Identifier':
            token['value'] = 'id'
        elif token['type'] == 'Numeric':
            token['value'] = '1'
        elif token['type'] == 'String':
            token['value'] = 'string'
        else:
            pass

    if is_simplify:
        for token_sequence in token_lists:
            for token in token_sequence:
                simplify_token(token)
    else:
        pass

    return token_lists





class Code_Completion_Model:
    '''
    Machine Learning model class, including data processing, encoding, model_building,
    training, query_testing, model_save, model_load
    '''

    def __init__(self, token_lists):
        '''
        Initialize ML model with training data
        token_lists: [[{type:.., value:..},{..},{..}], [..], [..]]
        '''
        time_begin = time.time()
        self.token_lists = token_lists
        self.tokens_set = set()
        for token_sequence in token_lists:
            for token in token_sequence:
                self.tokens_set.add(self.token_to_string(token))
        self.tokens_list = list(self.tokens_set)
        self.tokens_list.sort()
        self.tokens_size = len(self.tokens_set)  # 213
        self.index_to_string = {i: s for i, s in enumerate(self.tokens_list)}
        self.string_to_index = {s: i for i, s in enumerate(self.tokens_list)}
        time_end = time.time()
        print('model initialization time cost: ', time_end - time_begin)

    # data processing functions
    def token_to_string(self, token):
        return token['type'] + '~$$~' + token['value']

    def string_to_token(self, string):
        tokens = string.split('~$$~')
        return {'type': tokens[0], 'value': tokens[1]}

    # encoding token sequence as one_hot_encoding
    def one_hot_encoding(self, string):
        vector = [0] * self.tokens_size
        vector[self.string_to_index[string]] = 1
        return vector

    # generate X_train data and y_label for ML model
    def data_processing(self):
        '''
        first, transform a token in dict form to a type-value string
        x_data is a token, y_label is the previous token of x_data
        '''
        x_data = []
        y_data = []
        print('data processing is begining...')
        for token_sequence in self.token_lists:  # token_sequence of each source code
            for index, token in enumerate(token_sequence):  # each token(type_value) in source code
                if index > 0:
                    token_string = self.token_to_string(token)
                    prev_token = self.token_to_string(token_sequence[index - 1])
                    x_data.append(self.one_hot_encoding(prev_token))
                    y_data.append(self.one_hot_encoding(token_string))
        print('data processing is finished..')
        pickle.dump((x_data, y_data), open('processed_data/saved_data_for_basic.p', 'wb'))
        return x_data, y_data

    # neural network functions
    def create_NN(self):
        tf.reset_default_graph()
#         self.input_x = tf.layers.Input(shape=[self.tokens_size])
#         self.output_y = tf.layers.Input(shape=[self.tokens_size])
        self.input_x = tf.placeholder(dtype=tf.float32, shape=[None, self.tokens_size])
        self.output_y = tf.placeholder(dtype=tf.float32, shape=[None, self.tokens_size])
        self.nn = tf.layers.dense(inputs=self.input_x, units=128, activation=tf.nn.relu)
        self.output = tf.layers.dense(inputs=self.nn, units=self.tokens_size, activation=None)
        self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.output, labels=self.output_y)
        self.loss = tf.reduce_sum(self.loss)
        self.optimizer = tf.train.AdamOptimizer(0.01).minimize(self.loss)
        self.equal = tf.equal(tf.argmax(self.output_y,1), tf.argmax(self.output, 1))
        self.accuarcy = tf.reduce_mean(tf.cast(self.equal, tf.float32))


    # training ML model
    def train(self, use_saved_data=False):
        time_begin = time.time()
        if use_saved_data:
            x_data, y_data = pickle.load(open('processed_data/saved_data_for_basic.p', 'rb'))
        else:
            x_data, y_data = self.data_processing()
            
        time_end = time.time()
        print('data processing time cost: ', time_end - time_begin)
        self.create_NN()
        time_begin = time.time()
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for epoch in range(epoch_num):
                for i in range(0, len(x_data), batch_size):
                    batch_x = x_data[i:i+batch_size]
                    batch_y = y_data[i:i+batch_size]
                    feed = {self.input_x:batch_x, self.output_y:batch_y}
                    sess.run(self.optimizer, feed_dict=feed)
                    if (i//batch_size) % 500 == 0:
                        show_acc = sess.run(self.accuarcy, feed_dict=feed)
                        print('epoch: %d, training_step: %d, accuracy:%.3f'%(epoch, i, show_acc))

        time_end = time.time()
        print('training time cost: ', time_end - time_begin)
        return time_end - time_begin



    # query test
    def query_test(self, prefix, suffix):
        '''
        Input: all tokens before the hole token(prefix) and all tokens after the hole token,
        ML model will predict the most probable token in the hole
        In this function, use only one token before hole token to predict
        return: the most probable token
        '''
        prev_token_string = self.token_to_string(prefix[-1])
        x = self.one_hot_encoding(prev_token_string)
        with tf.Session() as sess:
            feed = {self.input_x:x}
            predict_list = sess.run(self.output, feed_dict=feed)
            prediction = tf.argmax(predict_list, 1)
            best_string = self.index_to_string[prediction]
            best_token = self.string_to_token(best_string)
        return [best_token]

In [47]:
dataset = load_tokens(train_dir)

In [48]:
start_time = time.time()

code_completion = Code_Completion_Model(dataset)
use_stored_model = False

train_time = code_completion.train()

model initialization time cost:  0.8267886638641357
data processing is begining...
data processing is finished..


OSError: [Errno 22] Invalid argument

In [None]:
import tensorflow as tf
import numpy as np
import tflearn
import random


import data_utils


train_dir = 'dataset/programs_800/'
query_dir = 'dataset/programs_200/'
model_dir = 'saved_model/model_parameter'


class Code_Completion_Model:
    '''
    Machine Learning model class, including data processing, encoding, model_building,
    training, query_testing, model_save, model_load
    '''

    def __init__(self):
        self.string_to_index, self.index_to_string, token_set = \
            data_utils.load_data_with_pickle('processed_data/train_parameter.p')
        self.num_token = len(token_set)

        
    def init_with_orig_data(self, token_lists):
        '''
        Initialize ML model with training data
        token_lists: [[{type:.., value:..},{..},{..}], [..], [..]]
        '''
        self.dataset = token_lists
        self.tokens_set = data_utils.get_token_set(self.dataset)
        self.num_tokens = len(self.tokens_set)  # 74 经过简化后只有74种token
        print(self.num_tokens)
        # 构建映射字典
        self.index_to_string = {i: s for i, s in enumerate(self.tokens_set)}
        self.string_to_index = {s: i for i, s in enumerate(self.tokens_set)}

    # generate X_train data and y_label for ML model
    def data_processing(self):
        '''
        first, transform a token in dict form to a type-value string
        x_data is a token, y_label is the previous token of x_data
        '''
        x_data = []
        y_data = []
        for index, token in enumerate(self.dataset):
            if index > 0:
                token_string = data_utils.token_to_string(token)
                prev_token = data_utils.token_to_string(self.dataset[index - 1])
                x_data.append(self.one_hot_encoding(prev_token))
                y_data.append(self.one_hot_encoding(token_string))
        return x_data, y_data

    def vector_data_process(self,dataset):
        '''
        读取已经被处理成one_hot_vector的token data，该函数会根据该dataset
        构造x_data and y_data
        :param dataset:
        :return:
        '''
        x_data = []
        y_data = []
        for index, token in enumerate(dataset):
            if index > 0:
                x_data.append(dataset[index])
                y_data.append(token)
        return x_data, y_data



    # neural network functions
    def create_NN(self):
        tf.reset_default_graph()
        self.nn = tflearn.input_data(shape=[None, self.num_token])
        self.nn = tflearn.fully_connected(self.nn, 128)
        self.nn = tflearn.fully_connected(self.nn, self.num_token, activation='softmax')
        self.nn = tflearn.regression(self.nn)
        self.model = tflearn.DNN(self.nn)

    # load trained model into object
    def load_model(self, model_file):
        self.create_NN()
        self.model.load(model_file)

    # training ML model
    def train(self, train_data, with_original_data=False):
        print('model training...')
        if with_original_data:
            self.init_with_orig_data(train_data)
            x_data, y_data = self.data_processing()
            self.create_NN()
            self.model.fit(x_data, y_data, n_epoch=1, batch_size=500, show_metric=True)
        else:
            x_data, y_data = self.vector_data_process(train_data)
            self.create_NN()
            self.model.fit(
                x_data, y_data, n_epoch=1, validation_set=0.2, batch_size=500, show_metric=True)

    # save trained model to model path
    def save_model(self, model_file):
        self.model.save(model_file)

    # query test
    def query_test(self, prefix, suffix):
        '''
        Input: all tokens before the hole token(prefix) and all tokens after the hole token,
        ML model will predict the most probable token in the hole
        In this function, use only one token before hole token to predict
        return: the most probable token
        '''
        prev_token_string = data_utils.token_to_string(prefix[-1])
        x = data_utils.one_hot_encoding(prev_token_string, self.string_to_index)
        y = self.model.predict([x])
        predicted_seq = y[0]
        if type(predicted_seq) is np.ndarray:
            predicted_seq = predicted_seq.tolist()
        best_number = predicted_seq.index(max(predicted_seq))
        print('prediction:', best_number)
        best_string = self.index_to_string[best_number]
        best_token = data_utils.string_to_token(best_string)
        return [best_token]

    

    
    

    def test_model(self, query_test_data):
        correct = 0
        correct_token_list = []
        incorrect_token_list = []
        for tokens in query_test_data:
            prefix, expection, suffix = data_utils.create_hole(tokens)
            prediction = self.query_test(prefix, suffix)

            if data_utils.token_equals(prediction, expection):
                correct += 1
                correct_token_list.append({'expection': expection, 'prediction': prediction})
            else:
                incorrect_token_list.append({'expection': expection, 'prediction': prediction})
        accuracy = correct / len(query_test_data)
        return accuracy


In [None]:
processed_data = data_utils.load_data_with_pickle('processed_data/vec_train_data.p')

In [None]:
#data load and model create
cc_model = Code_Completion_Model()
#training model
use_stored_model = False
if use_stored_model:
    cc_model.load_model(model_dir)
else:
    cc_model.train(processed_data, with_original_data=False)

In [None]:
query_test_data = data_utils.load_data_with_file(query_dir)

In [None]:
def query_test_haha(query_test_data):
    '''
    Input: all tokens before the hole token(prefix) and all tokens after the hole token,
    ML model will predict the most probable token in the hole
    In this function, use only one token before hole token to predict
    return: the most probable token
    '''
    correct = 0
    correct_token_list = []
    incorrect_token_list = []
    for tokens in query_test_data:
        prefix, expection, suffix = data_utils.create_hole(tokens)
        prediction = cc_model.query_test(prefix, suffix)
        
        strring = data_utils.token_to_string(expection[0])
        #print(strring)
        index_num = cc_model.string_to_index[strring]
        print('expection:', index_num)
        print('\n')
        if data_utils.token_equals(prediction, expection):
            correct += 1
            correct_token_list.append({'expection': expection, 'prediction': prediction})
        else:
            incorrect_token_list.append({'expection': expection, 'prediction': prediction})
    accuracy = correct / len(query_test_data)
    print(accuracy)
    return accuracy

In [2]:
import json
import random
import tensorflow as tf
import numpy as np
import tflearn
import os
import pickle
import time
import data_utils

  from ._conv import register_converters as _register_converters


Instructions for updating:
Use the retry module or similar alternatives.


In [102]:

'''
使用TensorFlow自带的layers构建基本的神经网络对token进行预测，
可以声明使用多少个context tokens 进行预测

多个previous token输入神经网络的方法有两种想法：
1. 将每个token的representation vector相连，合成一个大的vector输入到神经网络，
    所以说神经网络的输入层大小应为：每个token vector length * number of previous token
2. 应为目前表示每个token 使用的方法为one hot encoding，也就是说对每个token都是有且仅有一位为1，其余位为0，
    所以可以考虑直接将所有的previous token相加，这样做的好处是NN输入层大小永远等于vector length。缺点是没有理论依据，不知道效果是否会更好


1. concatenate the representations of previous tokens to a huge vector representation
2. add the representations of previous tokens together


'''






x_train_data_path = 'processed_data/x_train_data.p'
y_train_data_path = 'processed_data/y_train_data.p'
train_data_parameter = 'processed_data/x_y_parameter.p'
query_dir = 'dataset/programs_200/'

epoch_num = 1
batch_size = 64
learning_rate = 0.002
previous_token_num = 2


class Code_Completion_Model:

    def __init__(self, x_data, y_data, token_set, string2int, int2string):
        batch_num = len(x_data) // batch_size
        self.x_data = x_data[:batch_num * batch_size]
        self.y_data = y_data[:batch_num * batch_size]
        self.index_to_string = int2string
        self.string_to_index = string2int
        self.tokens_set = token_set
        self.tokens_size = len(token_set)

    # neural network functions
    def create_NN(self):
        tf.reset_default_graph()
        self.input_x = tf.placeholder(dtype=tf.float32, shape=[None, self.tokens_size], name='input_x')
        self.output_y = tf.placeholder(dtype=tf.float32, shape=[None, self.tokens_size], name='output_y')
        self.nn = tf.layers.dense(inputs=self.input_x, units=128, activation=tf.nn.relu, name='hidden_1')
        self.output = tf.layers.dense(inputs=self.nn, units=self.tokens_size, activation=None, name='prediction')
        self.prediction_index = tf.argmax(self.output, 1)
        self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.output, labels=self.output_y, name='loss')
        self.loss = tf.reduce_sum(self.loss)
        self.optimizer = tf.train.AdamOptimizer(0.01).minimize(self.loss)
        self.equal = tf.equal(tf.argmax(self.output_y, 1), tf.argmax(self.output, 1))
        self.accuarcy = tf.reduce_mean(tf.cast(self.equal, tf.float32), name='accuracy')

    def get_batch(self, context_size = previous_token_num):
        
        x_data = np.array(self.x_data)
        for i in range(0, len(self.x_data), batch_size):
            batch_x = np.zeros((batch_size, self.tokens_size))
            for j in range(context_size):
                if i >= j:
                    temp = x_data[i-j:i-j+batch_size].reshape(-1, self.tokens_size)
                    if temp.shape == (0, 86): break;
                    batch_x += temp
            batch_y = self.y_data[i:i + batch_size]
            yield batch_x, batch_y

    def train(self):
        self.create_NN()
        self.sess = tf.Session()
        time_begin = time.time()
        self.sess.run(tf.global_variables_initializer())
        batch_generator = self.get_batch()
        for epoch in range(epoch_num):
            for i in range(0, len(self.x_data), batch_size):
                batch_x, batch_y = next(batch_generator)
                feed = {self.input_x: batch_x, self.output_y: batch_y}
                self.sess.run(self.optimizer, feed_dict=feed)
                if (i // batch_size) % 2000 == 0:
                    show_loss, show_acc = self.sess.run([self.loss, self.accuarcy], feed_dict=feed)
                    print('epoch: %d, training_step: %d, loss: %.2f, accuracy:%.3f' % (epoch, i, show_loss, show_acc))
        time_end = time.time()
        print('training time cost: %.3f s'%(time_end - time_begin))


    # query test
    def query_test(self, prefix, suffix):
        '''
        Input: all tokens before the hole token(prefix) and all tokens after the hole token,
        ML model will predict the most probable token in the hole. In this function, use only one token before hole token to predict
        '''
        previous_token_list = prefix[-previous_token_num:]
        context_representation = np.zeros(self.tokens_size)

        for token in previous_token_list:
            prev_token_string = data_utils.token_to_string(token)
            pre_token_x = data_utils.one_hot_encoding(prev_token_string, self.string_to_index)
            context_representation += np.array(pre_token_x)

        feed = {self.input_x: [context_representation]}
        prediction = self.sess.run(self.prediction_index, feed)[0]
        best_string = self.index_to_string[prediction]
        best_token = data_utils.string_to_token(best_string)
        return [best_token]

    #test model
    def test_model(self, query_test_data):
        correct = 0.0
        correct_token_list = []
        incorrect_token_list = []
        for token_sequence in query_test_data:
            prefix, expection, suffix = data_utils.create_hole(token_sequence)
            prediction = self.query_test(prefix, suffix)[0]
            if data_utils.token_equals([prediction], expection):
                correct += 1
                correct_token_list.append({'expection': expection, 'prediction': prediction})
            else:
                incorrect_token_list.append({'expection': expection, 'prediction': prediction})
        accuracy = correct / len(query_test_data)
        return accuracy


In [5]:
x_train_data_path = 'processed_data/x_train_data.p'
y_train_data_path = 'processed_data/y_train_data.p'
train_data_parameter = 'processed_data/x_y_parameter.p'
x_data = data_utils.load_data_with_pickle(x_train_data_path)
y_data = data_utils.load_data_with_pickle(y_train_data_path)
token_set, string2int, int2string = data_utils.load_data_with_pickle(train_data_parameter)


In [103]:
#model train
model = Code_Completion_Model(x_data, y_data, token_set, string2int, int2string)
model.train()

epoch: 0, training_step: 0, loss: 276.29, accuracy:0.172
epoch: 0, training_step: 128000, loss: 89.34, accuracy:0.562
epoch: 0, training_step: 256000, loss: 82.85, accuracy:0.531
epoch: 0, training_step: 384000, loss: 91.51, accuracy:0.547
epoch: 0, training_step: 512000, loss: 110.68, accuracy:0.391
epoch: 0, training_step: 640000, loss: 97.37, accuracy:0.469
epoch: 0, training_step: 768000, loss: 35.40, accuracy:0.719
epoch: 0, training_step: 896000, loss: 127.04, accuracy:0.500
epoch: 0, training_step: 1024000, loss: 57.99, accuracy:0.625
epoch: 0, training_step: 1152000, loss: 66.93, accuracy:0.672
epoch: 0, training_step: 1280000, loss: 128.33, accuracy:0.344
epoch: 0, training_step: 1408000, loss: 98.13, accuracy:0.484
epoch: 0, training_step: 1536000, loss: 130.28, accuracy:0.391
epoch: 0, training_step: 1664000, loss: 108.26, accuracy:0.391
training time cost:  51.472346782684326


In [None]:
# test model
query_test_data = data_utils.load_data_with_file(query_dir)
accuracy = model.test_model(query_test_data)
print('query test accuracy: ', accuracy)

In [88]:
data = np.arange(12).reshape(6,2)
aa = np.array([-2,-2])

In [89]:
data += aa

In [95]:
if aa.shape == (2,):
    print("a")

a


In [90]:
data

array([[-2, -1],
       [ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9]])

In [61]:
data

array([[ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11]])

In [69]:
for i in range(3):
    bb = 2
    batch = np.zeros((2, 2))
    for j in range(2):
        if (i>=j):
            temp = data[i-j:i+bb-j]
            batch += temp
    print(batch)


[[0. 1.]
 [2. 3.]]
[[2. 4.]
 [6. 8.]]
[[ 6.  8.]
 [10. 12.]]
