## Code Completion System

This is a JavaScript Code Prediction System

In [6]:
import tensorflow as tf
import numpy as np
import pickle
import time
import data_utils

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle



'''
使用TensorFlow自带的layers构建基本的神经网络对token进行预测，
可以声明使用多少个context tokens 进行预测

多个previous token输入神经网络的方法有两种想法：
1. 将每个token的representation vector相连，合成一个大的vector输入到神经网络，
    所以说神经网络的输入层大小应为：每个token vector length * number of previous token
2. 应为目前表示每个token 使用的方法为one hot encoding，也就是说对每个token都是有且仅有一位为1，其余位为0，
    所以可以考虑直接将所有的previous token相加，这样做的好处是NN输入层大小永远等于vector length。缺点是没有理论依据，不知道效果是否会更好


1. concatenate the representations of previous tokens to a huge vector representation
2. add the representations of previous tokens together


'''

x_train_data_path = 'processed_data/x_train_data.p'
y_train_data_path = 'processed_data/y_train_data.p'
train_data_parameter = 'processed_data/x_y_parameter.p'
query_dir = 'dataset/programs_200/'

tensorboard_data_path = './logs/MultiContext/5_previous'

epoch_num = 5
batch_size = 64
learning_rate = 0.002
context_size = 5
hidden_size = 64

In [7]:


class Code_Completion_Model:

    def __init__(self, x_data, y_data, token_set, string2int, int2string):
        batch_num = len(x_data) // batch_size
        x_data, y_data = np.array(x_data[:batch_num * batch_size]), np.array(y_data[:batch_num * batch_size])
        self.reshape_data(x_data, y_data)
        self.x_data, self.valid_x, self.y_data, self.valid_y = \
            train_test_split(x_data, y_data, train_size=0.9)
        self.data_size = len(self.x_data)
        self.index_to_string = int2string
        self.string_to_index = string2int
        self.tokens_set = token_set
        self.tokens_size = len(token_set)

    def reshape_data(self, x_data, y_data):
        x = []
        y = []
        for index, token in enumerate(x_data):
            if index >= context_size - 1:
                tempTokens = np.sum(x_data[index - context_size + 1:index + 1, :], axis=0)
                x.append(tempTokens)
                y.append(y_data[index])
        return x, y;

    # neural network functions
    def create_NN(self):
        tf.reset_default_graph()
        self.input_x = tf.placeholder(dtype=tf.float32, shape=[None, self.tokens_size], name='input_x')
        self.output_y = tf.placeholder(dtype=tf.float32, shape=[None, self.tokens_size], name='output_y')
        weights = {'h1': tf.Variable(tf.truncated_normal(shape=[self.tokens_size, hidden_size])),
                   'h2': tf.Variable(tf.truncated_normal(shape=[hidden_size, hidden_size])),
                   'h3': tf.Variable(tf.truncated_normal(shape=[hidden_size, hidden_size])),
                   'output': tf.Variable(tf.truncated_normal(shape=[hidden_size, self.tokens_size]))}
        biases = {'h1': tf.Variable(tf.constant(0.1, shape=[hidden_size], dtype=tf.float32)),
                  'h2': tf.Variable(tf.constant(0.1, shape=[hidden_size], dtype=tf.float32)),
                  'h3': tf.Variable(tf.constant(0.1, shape=[hidden_size], dtype=tf.float32)),
                  'output': tf.Variable(tf.constant(0.1, shape=[self.tokens_size], dtype=tf.float32))}

        h1_layer = tf.matmul(self.input_x, weights['h1']) + biases['h1']
        h1_layer = tf.nn.relu(h1_layer)
        h2_layer = tf.matmul(h1_layer, weights['h2']) + biases['h2']
        h2_layer = tf.nn.relu(h2_layer)
        h3_layer = tf.matmul(h2_layer, weights['h3']) + biases['h3']
        h3_layer = tf.nn.relu(h3_layer)
        output_layer = tf.matmul(h3_layer, weights['output']) + biases['output']
        self.prediction = tf.argmax(output_layer, 1)
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=output_layer, labels=self.output_y)
        self.loss = tf.reduce_mean(loss)
        self.optimizer_op = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)
        equal = tf.equal(tf.argmax(output_layer, 1), tf.argmax(self.output_y, 1))
        accuracy = tf.cast(equal, tf.float32)
        self.accuracy = tf.reduce_mean(accuracy)
        
#         self.valid_loss = tf.reduce_mean(loss)
#         self.valid_accuracy = tf.reduce_mean(accuracy)

        tf.summary.histogram('weight1', weights['h1'])
        tf.summary.histogram('weight2', weights['h2'])
        tf.summary.histogram('output_weight', weights['output'])
        tf.summary.histogram('bias1', biases['h1'])
        tf.summary.histogram('bias2', biases['h2'])
        tf.summary.histogram('output_bias', biases['output'])
        tf.summary.scalar('train_loss', self.loss)
        tf.summary.scalar('train_accuracy', self.accuracy)
         
        self.merged = tf.summary.merge_all()

    def get_batch(self):
        for i in range(0, len(self.x_data), batch_size):
            batch_x = self.x_data[i:i + batch_size];
            batch_y = self.y_data[i:i + batch_size];
            yield batch_x, batch_y

    def train(self):
        self.create_NN()
        self.sess = tf.Session()
        valid_accu_list = np.zeros(10, dtype=np.float32)
        train_accu_list = np.zeros(10, dtye=np.float32)
        valid_list_index = 0
        train_list_index = 0
        writer = tf.summary.FileWriter(tensorboard_data_path, self.sess.graph)
        time_begin = time.time()
        self.sess.run(tf.global_variables_initializer())
        for epoch in range(epoch_num):
          #  self.x_data, self.y_data = shuffle(self.x_data, self.y_data)
            batch_generator = self.get_batch()
            for i in range(0, len(self.x_data), batch_size):
                batch_x, batch_y = next(batch_generator)
                feed = {self.input_x: batch_x, self.output_y: batch_y}
                _, summary_str = self.sess.run([self.optimizer_op, self.merged], feed_dict=feed)
                writer.add_summary(summary_str, epoch*self.data_size + i)
                writer.flush()
                if (i // batch_size) % 2000 == 0:
                    print('epoch: %d, step: %d'%(epoch, i))
                    train_loss, train_accu = self.sess.run([self.loss, self.accuracy], feed_dict=feed)
                    train_accu_list[train_list_index % 10] = train_accu
                    print('train loss: %.2f, train accuracy:%.3f' % (train_loss, train_accu))
                    print('average train accuracy: %.4f'%(np.mean(train_accu_list)))
                    valid_feed = {self.input_x:self.valid_x, self.output_y:self.valid_y}
                    valid_loss, valid_acc = self.sess.run([self.loss, self.accuracy], feed_dict=valid_feed)
                    valid_accu_list[valid_list_index % 10] = valid_acc
                    print('valid loss: %.2f, valid accuracy:%.3f' % (valid_loss, valid_acc))
                    print('average valid accuracy: %.4f'%(np.mean(valid_accu_list)))
        time_end = time.time()
        print('training time cost: %.3f s' % (time_end - time_begin))

    # query test
    def query_test(self, prefix, suffix):
        '''
        Input: all tokens before the hole token(prefix) and all tokens after the hole token,
        ML model will predict the most probable token in the hole. In this function, use only one token before hole token to predict
        '''
        previous_token_list = prefix[-context_size:]
        context_representation = np.zeros(self.tokens_size)

        for token in previous_token_list:
            prev_token_string = data_utils.token_to_string(token)
            pre_token_x = data_utils.one_hot_encoding(prev_token_string, self.string_to_index)
            context_representation += np.array(pre_token_x)

        feed = {self.input_x: [context_representation]}
        prediction = self.sess.run(self.prediction, feed)[0]
        best_string = self.index_to_string[prediction]
        best_token = data_utils.string_to_token(best_string)
        return [best_token]

    # test model
    def test_model(self, query_test_data):
        correct = 0.0
        correct_token_list = []
        incorrect_token_list = []
        for token_sequence in query_test_data:
            prefix, expection, suffix = data_utils.create_hole(token_sequence)
            prediction = self.query_test(prefix, suffix)[0]
            if data_utils.token_equals([prediction], expection):
                correct += 1
                correct_token_list.append({'expection': expection, 'prediction': prediction})
            else:
                incorrect_token_list.append({'expection': expection, 'prediction': prediction})
        accuracy = correct / len(query_test_data)
        return accuracy

In [8]:
x_train_data_path = 'processed_data/x_train_data.p'
y_train_data_path = 'processed_data/y_train_data.p'
train_data_parameter = 'processed_data/x_y_parameter.p'
x_data = data_utils.load_data_with_pickle(x_train_data_path)
y_data = data_utils.load_data_with_pickle(y_train_data_path)
token_set, string2int, int2string = data_utils.load_data_with_pickle(train_data_parameter)

In [9]:
#model train
model = Code_Completion_Model(x_data, y_data, token_set, string2int, int2string)
model.train()



epoch: 0, step: 0, train loss: 364.78, train accuracy:0.000
average train accuracy: 0.00
epoch: 0, step: 0, valid loss: 365.05, valid accuracy:0.006
average valid accuracy: 0.00
epoch: 0, step: 128000, train loss: 2.15, train accuracy:0.406
average train accuracy: 0.04
epoch: 0, step: 128000, valid loss: 2.18, valid accuracy:0.430
average valid accuracy: 0.04
epoch: 0, step: 256000, train loss: 3.02, train accuracy:0.391
average train accuracy: 0.04
epoch: 0, step: 256000, valid loss: 1.99, valid accuracy:0.386
average valid accuracy: 0.04
epoch: 0, step: 384000, train loss: 2.07, train accuracy:0.375
average train accuracy: 0.04
epoch: 0, step: 384000, valid loss: 1.92, valid accuracy:0.428
average valid accuracy: 0.04
epoch: 0, step: 512000, train loss: 1.72, train accuracy:0.391
average train accuracy: 0.04
epoch: 0, step: 512000, valid loss: 1.89, valid accuracy:0.418
average valid accuracy: 0.04
epoch: 0, step: 640000, train loss: 1.63, train accuracy:0.422
average train accuracy:

epoch: 3, step: 768000, train loss: 1.65, train accuracy:0.484
average train accuracy: 0.05
epoch: 3, step: 768000, valid loss: 1.67, valid accuracy:0.455
average valid accuracy: 0.05
epoch: 3, step: 896000, train loss: 1.62, train accuracy:0.422
average train accuracy: 0.04
epoch: 3, step: 896000, valid loss: 1.67, valid accuracy:0.454
average valid accuracy: 0.05
epoch: 3, step: 1024000, train loss: 1.60, train accuracy:0.469
average train accuracy: 0.05
epoch: 3, step: 1024000, valid loss: 1.67, valid accuracy:0.455
average valid accuracy: 0.05
epoch: 3, step: 1152000, train loss: 1.93, train accuracy:0.422
average train accuracy: 0.04
epoch: 3, step: 1152000, valid loss: 1.67, valid accuracy:0.454
average valid accuracy: 0.05
epoch: 3, step: 1280000, train loss: 1.78, train accuracy:0.391
average train accuracy: 0.04
epoch: 3, step: 1280000, valid loss: 1.67, valid accuracy:0.455
average valid accuracy: 0.05
epoch: 3, step: 1408000, train loss: 1.54, train accuracy:0.422
average tr

In [10]:
# test model
query_test_data = data_utils.load_data_with_file(query_dir)
accuracy = model.test_model(query_test_data)
print('query test accuracy: ', accuracy)

query test accuracy:  0.225


In [169]:
import random
from collections import Counter
import data_utils
string_processed_data_path = 'processed_data/str_train_data.p'

class Markov_Model(object):

    def __init__(self, max_length=1, is_most=False):
        self.markov_table = {}
        self.max_length = 1
        self.is_most = False

    def create_model(self, token_list, max_depth=1, is_most=False):
        '''
        create a markov model with the depth from 1 to max_depth
        {
            depth1:{
                key1:[value1, value2 ..]
            }
        }
        '''
        self.is_most = is_most
        self.max_length = max_depth
        for depth in range(1, max_depth+1):
            temp_table = {}
            for index in range(depth, len(token_list)):
                words = tuple(token_list[index-depth:index])
                if words in temp_table.keys():
                    temp_table[words].append(token_list[index])
                else:
                    temp_table.setdefault(words, []).append(token_list[index])
            if is_most:
                for key,value in temp_table.items():
                    temp = Counter(value).most_common(1)[0][0]
                    temp_table[key] = temp
                self.markov_table[depth] = temp_table
            else:
                self.markov_table[depth] = temp_table
        return self.markov_table

    def test_model(self, test_token_lists, depth=1):
        correct = 0
        correct_token_list = []
        incorrect_token_list = []

        for tokens in test_token_lists:
            prefix, expection, suffix = data_utils.create_hole(tokens)
            prediction = self.query_test(prefix, depth=depth)
            if prediction['type']==expection[0]['type'] and prediction['value'] == expection[0]['value']:
                correct += 1
                correct_token_list.append({'expection': expection, 'prediction': prediction})
            else:
                incorrect_token_list.append({'expection': expection, 'prediction': prediction})
        accuracy = correct / len(test_token_lists)
        return accuracy


    def query_test(self, pre_tokens, depth=1):
        while(depth>self.max_length):
            depth -= 1
        used_tokens = pre_tokens[-depth:]
        proceed_tokens = []
        for token in used_tokens:
            proceed_tokens.append(data_utils.token_to_string(token))
        proceed_tokens = tuple(proceed_tokens)
        while proceed_tokens not in self.markov_table[depth].keys() and depth > 1:
            depth -= 1
            proceed_tokens = tuple(proceed_tokens[-depth:])

        if self.is_most:
            candidate = self.markov_table[depth][proceed_tokens]
        else:
            candidate_list = self.markov_table[depth][proceed_tokens]
            random_index = random.randint(0, len(candidate_list)-1)
            candidate = candidate_list[random_index]
        prediction = data_utils.string_to_token(candidate)
        return prediction

In [2]:
string_token_list = data_utils.load_data_with_pickle(string_processed_data_path)

In [173]:
markov_model = Markov_Model()
markov_table = markov_model.create_model(string_token_list, max_depth=6, is_most=True)

In [213]:
#print(markov_table[1].keys())
#print(markov_table[1][('Keyword~$$~var',)])
#print(markov_table[2].keys())
test_token_sequences = data_utils.load_data_with_file()
accuracy = 0.0
test_epoch = 10
for i in range(test_epoch):
    accuracy += markov_model.test_model(test_token_sequences, depth=6)
accuracy /= test_epoch
print(accuracy)

0.636


In [154]:
from collections import Counter
sentence = 'asgasdfasdfadsfadsfads'
con = Counter(sentence).most_common(1)[0][0]
print(type(con))
print(con)

<class 'str'>
a


In [158]:
mapp = {'1': 'abaaaaab', '2': 'dddddwwdwwdw'}
for key,value in mapp.items():
    temp = Counter(value).most_common(1)[0][0]
    mapp[key] = temp
print(mapp)

abaaaaab
a
dddddwwdwwdw
d
{'1': 'a', '2': 'd'}
