## Code Completion System

This is a JavaScript Code Prediction System

In [None]:
import json
import random
import tensorflow as tf
import numpy as np
import tflearn
import os

In [None]:
# Directory whose token files are loaded as training data.
train_dir = 'dataset/programs_800/'
# Directory whose token files are loaded for the query test.
query_dir = 'dataset/programs_200/'
# Path handed to save_model()/load_model() for the trained parameters.
model_file = 'trained_model_parameter'

#### Load tokens from files

In [None]:
def load_tokens(token_dir, is_simplify=True):
    '''
    Load every token sequence stored directly under token_dir.

    Only regular files whose name ends with '_tokens.json' are read. Each file
    is parsed as JSON; when is_simplify is True its content must be a list of
    token dicts carrying 'type' and 'value' keys.

    token_dir: directory to scan (sub-directories are not visited).
    is_simplify: when True, overwrite the value of every "Identifier",
        "Numeric" and "String" token with a fixed placeholder ('id', '1',
        'string'). These types have too many distinct values, which makes the
        network hard to train.
    return: a list with one token sequence per file, ordered by file name.
    '''
    # Placeholder value for each token type with an open-ended value range.
    placeholders = {'Identifier': 'id', 'Numeric': '1', 'String': 'string'}

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # result reproducible from run to run.
    token_files = []
    for f in sorted(os.listdir(token_dir)):
        file_path = os.path.join(token_dir, f)
        if os.path.isfile(file_path) and f.endswith('_tokens.json'):
            token_files.append(file_path)

    # One token sequence per source file. The context manager closes every
    # file handle as soon as it has been parsed.
    token_lists = []
    for file_path in token_files:
        with open(file_path, encoding='utf-8') as token_file:
            token_lists.append(json.load(token_file))

    if is_simplify:
        for token_sequence in token_lists:
            for token in token_sequence:
                if token['type'] in placeholders:
                    token['value'] = placeholders[token['type']]

    return token_lists

#### Machine Learning Model

In [None]:
import time

class Code_Completion_Model:
    '''
    Next-token prediction model for tokenized source code, including data
    processing, encoding, model_building, training, query_testing, model_save
    and model_load. The network is built with tflearn.
    '''
    def __init__(self, token_lists):
        '''
        Initialize ML model with training data and build the vocabulary.
        token_lists: [[{type:.., value:..},{..},{..}], [..], [..]]
        '''
        self.token_lists = token_lists
        self.tokens_set = set()
        for token_sequence in token_lists:
            for token in token_sequence:
                self.tokens_set.add(self.token_to_string(token))
        # Sorted so that the same vocabulary always maps to the same indices.
        self.tokens_list = sorted(self.tokens_set)
        self.tokens_size = len(self.tokens_list)
        self.index_to_string = dict(enumerate(self.tokens_list))
        self.string_to_index = {s: i for i, s in enumerate(self.tokens_list)}

    #data processing functions
    def token_to_string(self, token):
        '''Encode a token dict as the string "<type>~$$~<value>".'''
        return token['type'] + '~$$~' + token['value']

    def string_to_token(self, string):
        '''
        Inverse of token_to_string.
        Only the first separator is split on, so a value that itself contains
        '~$$~' is restored intact instead of being cut short.
        '''
        parts = string.split('~$$~', 1)
        return {'type': parts[0], 'value': parts[1]}

    #encoding token sequence as one_hot_encoding
    def one_hot_encoding(self, string):
        '''
        Return a list of tokens_size zeros with a single 1 at the vocabulary
        index of string. Raises KeyError for a string outside the vocabulary.
        '''
        vector = [0] * self.tokens_size
        vector[self.string_to_index[string]] = 1
        return vector

    #generate X_train data and y_label for ML model
    def data_processing(self):
        '''
        Build the training pairs from self.token_lists.
        Every token is first transformed from dict form to a type-value string.
        For each token except the first of a sequence, the x sample is the
        one-hot vector of the previous token and the y label is the one-hot
        vector of the token itself.
        return: (x_data, y_data), two lists of equal length
        '''
        x_data = []
        y_data = []
        for token_sequence in self.token_lists:#token_sequence of each source code
            for index, token in enumerate(token_sequence):#each token(type_value) in source code
                if index > 0:
                    token_string = self.token_to_string(token)
                    prev_token = self.token_to_string(token_sequence[index - 1])
                    x_data.append(self.one_hot_encoding(prev_token))
                    y_data.append(self.one_hot_encoding(token_string))
        print('data processing is finished..')
        return x_data, y_data

    #neural network functions
    def create_NN(self):
        '''
        Build the network in a fresh default graph and store it in self.model:
        a tokens_size-wide input, two fully connected layers of 128 units and
        a softmax output over the vocabulary.
        '''
        tf.reset_default_graph()
        self.nn = tflearn.input_data(shape=[None, self.tokens_size])
        self.nn = tflearn.fully_connected(self.nn, 128)
        self.nn = tflearn.fully_connected(self.nn, 128)
        self.nn = tflearn.fully_connected(self.nn, self.tokens_size, activation='softmax')
        self.nn = tflearn.regression(self.nn)
        self.model = tflearn.DNN(self.nn)

    #load trained model into object
    def load_model(self, model_file):
        '''Rebuild the network and load trained parameters from model_file.'''
        self.create_NN()
        self.model.load(model_file)

    #training ML model
    def train(self):
        '''
        Prepare the training pairs, build the network and fit it for one epoch.
        return: elapsed wall-clock time in seconds
        '''
        start_time = time.time()
        x_data, y_data = self.data_processing()
        self.create_NN()
        self.model.fit(x_data, y_data, n_epoch=1, batch_size=500, show_metric = True)
        end_time = time.time()
        return end_time - start_time

    #save trained model to model path
    def save_model(self, model_file):
        '''Save the trained parameters to model_file.'''
        self.model.save(model_file)

    #query test
    def query_test(self, prefix, suffix):
        '''
        Input: all tokens before the hole token(prefix) and all tokens after the hole token,
        ML model will predict the most probable token in the hole
        In this function, use only one token before hole token to predict;
        suffix is accepted but not used.
        return: a one-element list holding the most probable token
        Raises IndexError when prefix is empty.
        '''
        prev_token_string = self.token_to_string(prefix[-1])
        x = self.one_hot_encoding(prev_token_string)
        y = self.model.predict([x])
        # argmax accepts both a list and an ndarray and, like
        # list.index(max(...)), picks the first maximum.
        best_number = int(np.argmax(y[0]))
        best_string = self.index_to_string[best_number]
        best_token = self.string_to_token(best_string)
        return [best_token]
        

#### Training model

In [None]:
# Train a new model (or load a stored one) and report the time spent.
start_time = time.time()
dataset = load_tokens(train_dir)
code_completion = Code_Completion_Model(dataset)
use_stored_model = False
# Stays 0.0 when a stored model is loaded, so the report below never refers
# to an undefined name.
train_time = 0.0
if use_stored_model:
    code_completion.load_model(model_file)
else:
    train_time = code_completion.train()
    code_completion.save_model(model_file)

end_time = time.time()
print('total time cost: %.2f s, model training cost: %.2f s'%(end_time-start_time, train_time))

#### Query test

In [None]:
def create_hole(tokens, max_hole_size = 2):
    '''
    Cut a random hole of consecutive tokens out of a token sequence.

    input: a tokens sequence of source code and max_hole_size
    return: a (prefix, expection, suffix) tuple, in that order:
            prefix    - token sequence before the hole
            expection - hole tokens to be predicted
            suffix    - token sequence after the hole
    The hole never starts at index 0, so prefix holds at least one token
    whenever tokens is not empty. The hole size is drawn from
    1..max_hole_size and capped at len(tokens) - 1.
    '''
    hole_size = min(random.randint(1, max_hole_size), len(tokens) - 1)
    hole_start_index = random.randint(1, len(tokens) - hole_size)
    hole_end_index = hole_start_index + hole_size
    prefix = tokens[:hole_start_index]
    expection = tokens[hole_start_index:hole_end_index]
    # Open-ended slice: everything after the hole. A stop index of 0 would
    # always produce an empty suffix.
    suffix = tokens[hole_end_index:]
    return prefix, expection, suffix

def token_equals(token1, token2):
    '''
    Compare two token sequences element by element.
    Return True only when both sequences have the same length and every pair
    of tokens agrees on both 'type' and 'value'.
    '''
    if len(token1) != len(token2):
        return False
    return all(
        left['type'] == right['type'] and left['value'] == right['value']
        for left, right in zip(token1, token2)
    )

In [None]:
# Evaluate the trained model: cut one random hole out of every query program
# and compare the prediction with the removed tokens.
query_test_data = load_tokens(query_dir)
correct = 0
correct_token_list = []
incorrect_token_list = []
for tokens in query_test_data:
    prefix, expection, suffix = create_hole(tokens)
    prediction = code_completion.query_test(prefix, suffix)
    record = {'expection':expection, 'prediction':prediction}
    if token_equals(prediction, expection):
        correct += 1
        correct_token_list.append(record)
    else:
        incorrect_token_list.append(record)
# An empty query set would otherwise raise ZeroDivisionError.
accuracy = correct / len(query_test_data) if query_test_data else 0.0
print('query test accuracy: ', accuracy)

In [None]:
# Show the first five recorded samples of each outcome.
for label, samples in (('correct_token_list: \n', correct_token_list),
                       ('incorrect_token_list: \n', incorrect_token_list)):
    print(label, samples[:5])

# Test Module
Optimization ideas:
- re-implement the DNN model with TensorFlow directly (not tflearn)
- use an embedding layer rather than one-hot encoding
- use a deeper and wider network
- use an LSTM
- train the model on more than one previous token (several previous tokens? the following tokens as well?)
- try a CNN
- treat each source code file as its own training batch instead of combining them into one huge training set (for RNN)

In [None]:
import tensorflow as tf
import numpy as np
import tflearn
import random


import load_data


# Dataset directories.
train_dir = 'dataset/programs_800/'
query_dir = 'dataset/programs_200/'
# Path prefix from which the per-window-size checkpoint names are derived.
model_dir = 'saved_model/sliding_window'


# Window sizes (number of preceding tokens); one network is trained per entry.
slide_windows = [1,2]
# Base width of the hidden layers; multiplied by the window size in create_NN.
hidden_units = 128
# Number of passes over the training data.
epoch_num = 3
# Learning rate handed to the Adam optimizer.
learning_rate = 0.001
# Number of samples per training batch.
batch_size = 256


class Code_Completion_Model:
    '''
    Sliding-window next-token prediction model built directly on TensorFlow.

    One fully connected network is trained per window size listed in the
    module-level slide_windows. Each network receives the one-hot vectors of
    the window_size preceding tokens (most recent first, concatenated) and
    predicts the next token.
    '''
    def __init__(self, token_lists):
        '''
        Initialize ML model with training data and build the vocabulary.
        token_lists: [[{type:.., value:..},{..},{..}], [..], [..]]
        '''
        self.token_lists = token_lists
        self.tokens_set = set()
        for token_sequence in token_lists:
            for token in token_sequence:
                self.tokens_set.add(self.token_to_string(token))
        # Sorted so that the same vocabulary always maps to the same indices.
        self.tokens_list = sorted(self.tokens_set)
        self.tokens_num = len(self.tokens_list)
        self.index_to_string = dict(enumerate(self.tokens_list))
        self.string_to_index = {s: i for i, s in enumerate(self.tokens_list)}
        # window_size -> checkpoint prefix used by query_test; filled in by
        # train() and load_model().
        self.checkpoints = {}

    # data processing functions
    def token_to_string(self, token):
        '''Encode a token dict as the string "<type>~$$~<value>".'''
        return token['type'] + '~$$~' + token['value']

    def string_to_token(self, string):
        '''
        Inverse of token_to_string.
        Only the first separator is split on, so a value that itself contains
        '~$$~' is restored intact instead of being cut short.
        '''
        parts = string.split('~$$~', 1)
        return {'type': parts[0], 'value': parts[1]}

    # encoding token sequence as one_hot_encoding
    def one_hot_encoding(self, string):
        '''
        Return a list of tokens_num zeros with a single 1 at the vocabulary
        index of string. Raises KeyError for a string outside the vocabulary.
        '''
        vector = [0] * self.tokens_num
        vector[self.string_to_index[string]] = 1
        return vector

    # generate X_train data and y_label for ML model
    def data_processing(self):
        '''
        One-hot encode every token of every sequence into one flat list.
        NOTE: sequence boundaries are dropped here, so the windows built by
        split_with_windows can span two consecutive source files.
        '''
        token_list = []
        for token_sequence in self.token_lists:  # token_sequence of each source code
            for token in token_sequence:  # each token(type_value) in source code
                token_list.append(self.one_hot_encoding(self.token_to_string(token)))
        return token_list

    def split_with_windows(self, token_list, window_size):
        '''
        Build the train_x / train_y lists for one window size.
        Each train_x element is the concatenation of the window_size vectors
        preceding a position (nearest one first); the matching train_y element
        is the vector at that position.
        '''
        train_x = []
        train_y = []
        for index in range(window_size, len(token_list)):
            window = []
            for offset in range(1, window_size + 1):
                window.extend(token_list[index - offset])
            train_x.append(window)
            train_y.append(token_list[index])
        return train_x, train_y

    def _default_checkpoint(self, window_size):
        '''Checkpoint prefix that train() writes for the given window size.'''
        return model_dir + '_' + str(window_size) + '.ckpt'

    # neural network functions
    def create_NN(self, window_size):
        '''
        Build the network for one window size in its own tf.Graph.
        The tensors and ops needed later are reachable by name: 'input_x:0',
        'output_y:0', 'logits:0', 'loss:0', 'accuracy:0' and the training op
        'optimizer'.
        return: the tf.Graph holding the network
        '''
        tf.reset_default_graph()
        graph = tf.Graph()
        with graph.as_default():
            input_x = tf.placeholder(tf.float32, [None, window_size * self.tokens_num],name='input_x')
            output_y = tf.placeholder(tf.float32, [None, self.tokens_num],name='output_y')

            fc1 = tf.layers.dense(
                inputs=input_x, units=window_size*hidden_units,
                activation=tf.nn.relu)
            fc2 = tf.layers.dense(
                inputs=fc1, units=window_size*hidden_units,
                activation=tf.nn.relu)
            fc3 = tf.layers.dense(
                inputs=fc2, units=window_size * hidden_units,
                activation=tf.nn.relu)
            output_layer = tf.layers.dense(
                inputs=fc3, units=self.tokens_num,activation=None,name='output_layer')
            # name= on a dense layer names its variable scope, not its output
            # tensor; this identity op gives the logits a stable name for
            # get_tensor_by_name().
            tf.identity(output_layer, name='logits')

            loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=output_layer, labels=output_y)
            loss = tf.reduce_mean(loss, name='loss')
            tf.train.AdamOptimizer(learning_rate, name='optimizer').minimize(loss)
            accuracy = tf.equal(tf.argmax(output_layer, 1), tf.argmax(output_y, 1))
            tf.reduce_mean(tf.cast(accuracy, tf.float32), name='accuracy')

        return graph

    # load trained model into object
    def load_model(self, model_file, window_size=1):
        '''
        Register a trained checkpoint for later queries.
        model_file: checkpoint prefix as passed to tf.train.Saver.save
            (without the '.meta' suffix)
        window_size: window size the checkpoint was trained for
        The weights are restored by query_test when a prediction is requested.
        '''
        self.checkpoints[window_size] = model_file

    # training ML model
    def train(self):
        '''
        Train one network per entry of slide_windows and save each of them to
        its own checkpoint (see _default_checkpoint).
        '''
        import os

        print('model training...')

        def get_batch(x_data, y_data):
            # Yield (step, batch_x, batch_y) over the data in order.
            for i in range(0, len(x_data), batch_size):
                batch_x = x_data[i:i+batch_size]
                batch_y = y_data[i:i+batch_size]
                yield i//batch_size, batch_x, batch_y

        token_list = self.data_processing()
        for window_size in slide_windows:
            print('with window_size: %d'%window_size)
            graph = self.create_NN(window_size)
            x_data, y_data = self.split_with_windows(token_list, window_size)

            # The saver refuses to write into a missing directory.
            checkpoint = self._default_checkpoint(window_size)
            checkpoint_dir = os.path.dirname(checkpoint)
            if checkpoint_dir:
                os.makedirs(checkpoint_dir, exist_ok=True)

            with tf.Session(graph=graph) as sess:
                saver = tf.train.Saver()
                input_x = graph.get_tensor_by_name('input_x:0')
                output_y = graph.get_tensor_by_name('output_y:0')
                loss = graph.get_tensor_by_name('loss:0')
                optimizer = graph.get_operation_by_name('optimizer')
                accuracy = graph.get_tensor_by_name('accuracy:0')
                sess.run(tf.global_variables_initializer())

                for epoch in range(epoch_num):
                    for i, batch_x, batch_y in get_batch(x_data, y_data):
                        feed = {input_x:batch_x, output_y:batch_y}
                        sess.run(optimizer, feed)
                        if(i%300 == 0):
                            s_loss, s_accu = sess.run([loss, accuracy], feed)
                            print('epoch: %d, step: %d, loss: %.2f, accuracy:%.2f'%
                                  (epoch, i, s_loss, s_accu))
                # Save under the plain prefix; the saver appends '.meta',
                # '.index' and '.data-*' itself.
                saver.save(sess, checkpoint)
            self.checkpoints[window_size] = checkpoint

    # query test
    def query_test(self, prefix, suffix,window_size):
        '''
        Input: all tokens before the hole token(prefix) and all tokens after the hole token,
        ML model will predict the most probable token in the hole
        In this function, use the last window_size tokens of prefix to predict;
        suffix is accepted but not used.
        return: a one-element list holding the most probable token
        Raises IndexError when prefix holds fewer than window_size tokens and
        KeyError when a prefix token is not in the vocabulary.
        '''
        if len(prefix) < window_size:
            raise IndexError('prefix holds %d tokens, window_size %d needs more'
                             % (len(prefix), window_size))
        test_tokens = []
        for i in range(window_size):
            prev_token_string = self.token_to_string(prefix[-i-1])
            test_tokens.extend(self.one_hot_encoding(prev_token_string))

        checkpoint = self.checkpoints.get(
            window_size, self._default_checkpoint(window_size))
        # Import into a private graph so repeated queries do not pile copies
        # of the network onto the default graph.
        graph = tf.Graph()
        with graph.as_default():
            new_saver = tf.train.import_meta_graph(checkpoint + '.meta')
        with tf.Session(graph=graph) as sess:
            new_saver.restore(sess, checkpoint)
            input_x = graph.get_tensor_by_name('input_x:0')
            logits = graph.get_tensor_by_name('logits:0')
            # input_x is two-dimensional: wrap the single sample in a batch.
            scores = sess.run(logits, {input_x:[test_tokens]})
        best_number = int(np.argmax(scores[0]))
        best_token = self.string_to_token(self.index_to_string[best_number])
        return [best_token]

In [None]:
# NOTE(review): the training data is read from query_dir, not train_dir —
# presumably to keep experiments quick; confirm this is intended.
train_data = load_data.load_tokens(query_dir, is_simplify=True)

In [None]:
# Build the model object (and its vocabulary) from the loaded token sequences.
cc_model = Code_Completion_Model(train_data)

In [None]:
# Run training for every window size in slide_windows.
cc_model.train()

In [None]:
def create_embedding_NN(self):
    '''
    Build an embedding layer followed by two fully connected ReLU layers.

    self: object providing tokens_size, the vocabulary size.
        NOTE(review): the sliding-window model class calls this attribute
        tokens_num — confirm which object this is meant to be used with.
    return: (input_x, z_fc2), the int32 placeholder of token indices with
        shape [None, 1] and the output of the second hidden layer with
        shape [None, 1024]
    '''
    # tf.placeholder takes the dtype first, then the shape.
    input_x = tf.placeholder(tf.int32, [None, 1])
    embedding_dim_size = 64
    embedding_matrix = tf.Variable(tf.random_uniform([
        self.tokens_size, embedding_dim_size], -1, 1), name='embedding_matrix')
    input_embedding_layer = tf.nn.embedding_lookup(
        embedding_matrix, input_x, name='embedding_layer')
    # The lookup adds an embedding axis behind the [None, 1] index shape;
    # flatten to [batch, embedding_dim_size] for the matrix products below.
    embedded_input = tf.reshape(input_embedding_layer, [-1, embedding_dim_size])

    def get_weight(shape):
        initial = tf.random_uniform(shape)
        return tf.Variable(initial, dtype=tf.float32)
    def get_bias(shape):
        initial = tf.constant(shape=shape, value=1.0)
        return tf.Variable(initial, dtype=tf.float32)

    # Each layer computes activations x weights: [batch, in] x [in, out].
    w_fc1 = get_weight([embedding_dim_size, 1024])
    b_fc1 = get_bias([1024])
    z_fc1 = tf.matmul(embedded_input, w_fc1) + b_fc1
    z_fc1 = tf.nn.relu(z_fc1)

    w_fc2 = get_weight([1024, 1024])
    b_fc2 = get_bias([1024])
    z_fc2 = tf.matmul(z_fc1, w_fc2) + b_fc2
    z_fc2 = tf.nn.relu(z_fc2)

    return input_x, z_fc2

In [None]:
def create_RNN(self):
    '''Unfinished stub: only declares a float32 placeholder of shape [None].'''
    x = tf.placeholder(dtype=tf.float32, shape=[None])

In [None]:
# Scratch check: enumerate() numbers the members of a set as it iterates.
seee = {1, 2, 3, 4}
for i, s in enumerate(seee):
    print(f'{i} {s}')

In [1]:
import numpy as np
import pandas as pd

import data_utils

In [2]:
# Load the tokens through the project-local data_utils helper.
# NOTE(review): the meaning of the positional False is not visible here —
# confirm against data_utils.load_tokens.
dataset = data_utils.load_tokens(False)
# Distinct tokens of the dataset, as computed by data_utils.get_token_set.
token_set = data_utils.get_token_set(dataset)

In [19]:
# String form of every entry of dataset, in the original order.
token_list = [data_utils.token_to_string(token) for token in dataset]

In [21]:
token_list[:10]

['Identifier~$$~id',
 'Punctuator~$$~.',
 'Identifier~$$~id',
 'Punctuator~$$~(',
 'String~$$~string',
 'Punctuator~$$~,',
 'Punctuator~$$~{',
 'Identifier~$$~id',
 'Punctuator~$$~:',
 'String~$$~string']

In [3]:
len(token_set)

74

In [4]:
# Freeze one ordering of the token categories for use as table labels
# (presumably token_set is a set, whose iteration order is arbitrary).
token_cate_list = list(token_set)

In [11]:
# Square table of zeros with one row and one column per token category.
token_table = pd.DataFrame(
    np.zeros((len(token_set), len(token_set))),
    index=token_cate_list,
    columns=token_cate_list,
)

In [22]:
# Count token bigrams: the row is the previous token, the column the current
# one.
for index, token in enumerate(token_list):
    if index > 0:
        cur_token = token
        prev_token = token_list[index-1]
        # Single-step .at indexing updates the table itself. The chained
        # token_table[col][row] form writes to a temporary under pandas
        # Copy-on-Write and leaves the table unchanged.
        token_table.at[prev_token, cur_token] += 1

In [23]:
token_table

Unnamed: 0,Keyword~$$~throw,Punctuator~$$~<=,RegularExpression~$$~RegularExpression,Punctuator~$$~[,Punctuator~$$~>>>,Keyword~$$~continue,Boolean~$$~false,Punctuator~$$~(,Punctuator~$$~&&,Keyword~$$~return,...,Punctuator~$$~!,Punctuator~$$~>,Punctuator~$$~!=,Keyword~$$~default,Keyword~$$~typeof,Punctuator~$$~+=,Identifier~$$~id,Punctuator~$$~++,Punctuator~$$~--,Keyword~$$~for
Keyword~$$~throw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
Punctuator~$$~<=,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,36.0,0.0,0.0,0.0
RegularExpression~$$~RegularExpression,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Punctuator~$$~[,0.0,0.0,0.0,19.0,0.0,0.0,1.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1245.0,0.0,0.0,0.0
Punctuator~$$~>>>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Keyword~$$~continue,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Boolean~$$~false,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Punctuator~$$~(,0.0,0.0,85.0,121.0,0.0,0.0,32.0,89.0,0.0,0.0,...,397.0,0.0,0.0,0.0,158.0,0.0,8614.0,6.0,2.0,0.0
Punctuator~$$~&&,0.0,0.0,4.0,10.0,0.0,0.0,0.0,51.0,0.0,0.0,...,72.0,0.0,0.0,0.0,15.0,0.0,328.0,0.0,0.0,0.0
Keyword~$$~return,0.0,0.0,0.0,36.0,0.0,0.0,56.0,64.0,0.0,0.0,...,17.0,0.0,0.0,0.0,7.0,0.0,1048.0,0.0,0.0,0.0


In [25]:
# Write the count table to test_token_table.csv (relative path).
token_table.to_csv('test_token_table.csv')

In [17]:
# Scratch 4x4 frame of zeros for trying out cell assignment.
pd_test = pd.DataFrame(np.zeros((4, 4)))
pd_test

Unnamed: 0,0,1,2,3
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0


In [18]:
# Set row 3 of column 1 in one .loc step. The chained pd_test[1][3] = ... form
# assigns to a temporary under pandas Copy-on-Write and leaves the frame
# unchanged.
pd_test.loc[3, 1] = 22
pd_test

Unnamed: 0,0,1,2,3
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,22.0,0.0,0.0
