## Code Completion System

This is a JavaScript Code Prediction System

In [1]:
import tensorflow as tf
import numpy as np
import pickle
import data_utils
import time


  from ._conv import register_converters as _register_converters


In [2]:
processed_data_path = 'processed_data/rnn_train_data.p'
data_parameter_path = 'processed_data/rnn_train_parameter.p'
tensorboard_log_path = 'logs/MultiRNN'

train_dir = 'dataset/programs_800'
test_dir = 'dataset/programs_200'
checkpoint_dir = 'checkpoints/'

num_epoches = 1
show_every_n = 50
save_every_n = 200

In [3]:
import json
import os

def load_tokens(train_flag=True, is_simplify=True):
    if train_flag:
        token_dir = train_dir
    else:
        token_dir = test_dir
    token_list = []
    for f in os.listdir(token_dir):
        file_path = os.path.join(token_dir, f)
        if os.path.isfile(file_path) and f.endswith('_tokens.json'):
            #print(file_path)
            token_seq = json.load(open(file_path, encoding='utf-8'))
            token_list.extend(token_seq)
    string_token_list = []
    for token in token_list:
        if is_simplify:
            data_utils.simplify_token(token)
        string_token = data_utils.token_to_string(token)
        string_token_list.append(string_token)
    token_set = list(set(string_token_list))
    #print(string_token_list[:10])
    string2int = {c:i for i,c in enumerate(token_set)}
    int2string = {i:c for i,c in enumerate(token_set)}
    int_token_list = [string2int[c] for c in string_token_list]
    #print(int_token_list[:10])
    pickle.dump((int_token_list), open(processed_data_path, 'wb'))
    pickle.dump((string2int, int2string, token_set), open(data_parameter_path, 'wb'))

In [9]:
'''
Using MultiRNN to pridect token. with LSTM cell
'''
class LSTM_Model(object):
    def __init__(self,
                 token_set, time_steps=100,
                 batch_size=64,
                 num_layers=2,
                 n_units=128,
                 learning_rate=0.003,
                 grad_clip=5,
                 keep_prob=0.5,
                 num_epoches = 5,
                 is_training=True):
        
        if is_training:
            self.time_steps = time_steps
            self.batch_size = batch_size
        else:
            self.time_steps = 1
            self.batch_size = 1
        
        self.token_set =  token_set
        self.num_classes = len(self.token_set)
        self.num_layers = num_layers
        self.n_units = n_units
        self.learning_rate = learning_rate
        self.grad_clip = grad_clip
        self.keep_prob = keep_prob
        self.num_epoches = num_epoches

        self.bulid_model()


    def get_batch(self, data_seq, n_seq, n_steps):
        '''
        :param n_seq: 一个batch中序列的个数
        :param n_steps: 单个序列中包含字符的个数
        '''
        data_seq = np.array(data_seq)
        batch_size = n_steps * n_seq
        n_batches = len(data_seq) // batch_size
        data_seq = data_seq[:batch_size * n_batches] #仅保留完整的batch，舍去末尾
        data_seq = data_seq.reshape((n_seq, -1))
        for n in range(0, data_seq.shape[1], n_steps):
            x = data_seq[:, n:n+n_steps]
            y = np.zeros_like(x)
            y[:, :-1], y[:, -1] = x[:, 1:], x[:,0]
            yield x, y

    def build_input(self):
        input_x = tf.placeholder(tf.int32, [self.batch_size, self.time_steps], name='input_x')
        target_y = tf.placeholder(tf.int32, [self.batch_size, self.time_steps], name='target_y')
        keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        return input_x, target_y, keep_prob

    def bulid_lstm(self, keep_prob):
        cell_list = []
        for i in range(self.num_layers):
            cell = tf.contrib.rnn.BasicLSTMCell(self.n_units, state_is_tuple=True)
            cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
            cell_list.append(cell)
        cells = tf.contrib.rnn.MultiRNNCell(cell_list, state_is_tuple=True)
        init_state = cells.zero_state(self.batch_size, dtype=tf.float32)

        return cells, init_state

    def bulid_output(self, lstm_output):
        # 将lstm_output的形状由[batch_size, time_steps, n_units] 转换为 [batch_size*time_steps, n_units]
        seq_output = tf.concat(lstm_output, axis=1)
        seq_output = tf.reshape(seq_output, [-1, self.n_units])

        with tf.variable_scope('softmax'):
            softmax_w = tf.Variable(tf.truncated_normal([self.n_units, self.num_classes], stddev=0.1))
            softmax_b = tf.Variable(tf.zeros(self.num_classes))

        logits = tf.matmul(seq_output, softmax_w) + softmax_b
        softmax_output = tf.nn.softmax(logits=logits, name='softmax_output')
        return softmax_output, logits

    def bulid_loss(self, logits, targets):
        one_hot_y = tf.one_hot(targets, self.num_classes)
        one_hot_y = tf.reshape(one_hot_y, logits.get_shape())
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=one_hot_y)
        loss = tf.reduce_mean(loss)
        return loss

    def bulid_optimizer(self,loss):
        # tvars = tf.trainable_variables()
        # grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), self.grad_clip)
        # optimizer = tf.train.AdamOptimizer(self.learning_rate)
        # optimizer = optimizer.apply_gradients(zip(grads, tvars))
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        gradient_pairs = optimizer.compute_gradients(loss)
        clip_gradient_pairs = []
        for grad, var in gradient_pairs:
            grad = tf.clip_by_value(grad, -2, 2)
            clip_gradient_pairs.append((grad, var))
        optimizer = optimizer.apply_gradients(clip_gradient_pairs)
        return optimizer
    
    def build_accuracy(self, logits, targets):
#         print(logits.get_shape())
#         print(targets.get_shape())
        sess = tf.Session()
        self.show_logits = tf.argmax(logits, axis=1)
        show_targets = tf.one_hot(targets, self.num_classes)
        show_targets = tf.reshape(show_targets, logits.get_shape())
        self.show_targets = tf.argmax(show_targets, axis=1)
        self.aaa = tf.equal(self.show_logits, self.show_targets)
        accu = tf.cast(self.aaa, tf.float32)
        accu = tf.reduce_mean(accu)
        return accu
        

    def bulid_model(self):
        tf.reset_default_graph()
        self.input_x, self.target_y, self.keep_prob = self.build_input()
        self.cell, self.init_state = self.bulid_lstm(self.keep_prob)
        one_hot_x = tf.one_hot(self.input_x, self.num_classes)
        #print(one_hot_x.get_shape()) # (64, 100, 86)
        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
            self.cell, one_hot_x, initial_state=self.init_state)
        #print(1, lstm_outputs.get_shape()) # (64, 100, 128)
        self.softmax_output, logits = self.bulid_output(lstm_outputs)
        #print(self.softmax_output.get_shape()) # (6400, 86)
        #print(logits.get_shape()) #(6400, 86)
        self.loss = self.bulid_loss(logits,self.target_y)
        self.accuracy = self.build_accuracy(self.softmax_output, self.target_y)
        self.optimizer = self.bulid_optimizer(self.loss)


    def train(self, data, string2int, int2string):
        print('training begin...')
        self.string2int = string2int
        self.int2string = int2string
        saver = tf.train.Saver(max_to_keep=100)
        keep_prob = 0.5
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            global_step = 0
            for epoch in range(self.num_epoches):
                new_state = sess.run(self.init_state)
                batch_generator = self.get_batch(data, self.batch_size, self.time_steps)
                batch_step = 0
                start_time = time.time()
                for x, y in batch_generator:
                    global_step += 1
                    batch_step += 1
                    feed = {self.input_x:x,
                            self.target_y:y,
                            self.keep_prob:keep_prob,
                            self.init_state:new_state}
                    show_accu, show_loss, new_state, _ = sess.run(
                        [self.accuracy, self.loss, self.final_state, self.optimizer], feed_dict=feed)
                    end_time = time.time()
                    if global_step%show_every_n == 0:
                        a, b,c = sess.run([self.show_logits, self.show_targets,self.aaa], feed)
                        print(a[:10])
                        print(b[:10])
                    if global_step % show_every_n == 0:
                        print('epoch: {}/{}..'.format(epoch+1, self.num_epoches),
                              'global_step: {}..'.format(global_step),
                              'train_loss: {:.2f}..'.format(show_loss),
                              'train_accuracy: {:.2f}..'.format(show_accu),
                              'time cost in per_batch: {:.2f}..'.format(end_time-start_time))

                    if global_step % save_every_n == 0:
                        saver.save(sess, 'checkpoints/epoch{}_batch_step{}'.format(epoch, batch_step))
            saver.save(sess, 'checkpoints/last_check')

In [10]:
train_data = data_utils.load_data_with_pickle(processed_data_path)
string2int, int2string, token_set = data_utils.load_data_with_pickle(data_parameter_path)

In [None]:
print(int2string[22])

In [None]:
model = LSTM_Model(token_set)
model.train(train_data, string2int, int2string)

In [11]:
class TestModel(object):
    def __init__(self, token_set,string2int, int2string):
        self.model = LSTM_Model(token_set, is_training=False)
        self.string2int = string2int
        self.int2string = int2string
        self.last_chackpoints = tf.train.latest_checkpoint(checkpoint_dir=checkpoint_dir)

    # query test
    def query_test(self, prefix, suffix):
        '''
        Input: all tokens before the hole token(prefix) and all tokens after the hole token,
        ML model will predict the most probable token in the hole
        '''
        saver = tf.train.Saver()
        with tf.Session() as sess:
            saver.restore(sess, self.last_chackpoints)
            new_state = sess.run(self.model.init_state)
            prediction = None
            for i,token in enumerate(prefix):
                x = np.zeros((1, 1), dtype=np.int32)
                x[0,0] = token
                feed = {self.model.input_x:x,
                        self.model.keep_prob:1.,
                        self.model.init_state:new_state}
                prediction, new_state = sess.run(
                    [self.model.softmax_output, self.model.final_state], feed_dict=feed)
        prediction = self.int2string[np.argmax(prediction)]
        return prediction

    def test(self, query_test_data):
        correct = 0.0
        correct_token_list = []
        incorrect_token_list = []
        for token_sequence in query_test_data:
            prefix, expection, suffix = data_utils.create_hole(token_sequence)
            prefix = self.token_to_int(prefix)
            prediction = self.query_test(prefix, suffix)
            prediction = data_utils.string_to_token(prediction)
            if data_utils.token_equals([prediction], expection):
                correct += 1
                correct_token_list.append({'expection': expection, 'prediction': prediction})
            else:
                incorrect_token_list.append({'expection': expection, 'prediction': prediction})
        accuracy = correct / len(query_test_data)
        return accuracy
    
    def token_to_int(self, token_seq):
        int_token_seq = []
        for token in token_seq:
            int_token = self.string2int[data_utils.token_to_string(token)]
            int_token_seq.append(int_token)
        return int_token_seq

In [5]:
test_data = data_utils.load_data_with_file(test_dir)

In [12]:
test_model = TestModel(token_set, string2int, int2string)
accuracy = test_model.test(test_data)
print(accuracy)

Instructions for updating:
Use the retry module or similar alternatives.
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:R

INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring parameters from checkpoints/last_check
INFO:tensorflow:Restoring

In [None]:
import tensorflow as tf
import numpy as np
import pickle
import time
import data_utils

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle



'''
使用TensorFlow自带的layers构建基本的神经网络对token进行预测，
可以声明使用多少个context tokens 进行预测

多个previous token输入神经网络的方法有两种想法：
1. 将每个token的representation vector相连，合成一个大的vector输入到神经网络，
    所以说神经网络的输入层大小应为：每个token vector length * number of previous token
2. 应为目前表示每个token 使用的方法为one hot encoding，也就是说对每个token都是有且仅有一位为1，其余位为0，
    所以可以考虑直接将所有的previous token相加，这样做的好处是NN输入层大小永远等于vector length。缺点是没有理论依据，不知道效果是否会更好


1. concatenate the representations of previous tokens to a huge vector representation
2. add the representations of previous tokens together


'''

x_train_data_path = 'processed_data/x_train_data.p'
y_train_data_path = 'processed_data/y_train_data.p'
train_data_parameter = 'processed_data/x_y_parameter.p'
query_dir = 'dataset/programs_200/'

tensorboard_data_path = './logs/MultiContext/5_previous'

epoch_num = 2
batch_size = 64
learning_rate = 0.002
context_size = 5
hidden_size = 64

In [None]:
class Code_Completion_Model:

    def __init__(self, x_data, y_data, token_set, string2int, int2string, add_or_concat='add'):
        batch_num = len(x_data) // batch_size
        x_data, y_data = np.array(x_data[:batch_num * batch_size]), np.array(y_data[:batch_num * batch_size])
        if add_or_concat == 'add':
            temp_x, temp_y = self.reshape_with_add(x_data, y_data)
        if add_or_concat == 'concat':
            temp_x, temp_y = self.reshape_with_concat(x_data, y_data)
        self.x_data, self.valid_x, self.y_data, self.valid_y = \
            train_test_split(temp_x, temp_y, train_size=0.9)
        self.data_size = len(self.x_data)
        self.index_to_string = int2string
        self.string_to_index = string2int
        self.tokens_set = token_set
        self.tokens_size = len(token_set)

    def reshape_with_concat(self, x_data, y_data):
        reshape_data = []
        reshape_label = []
        for i in range(len(x_data)):
            if i >= context_size-1:
                temp_list = []
                for x in range(context_size):
                    temp_list.extend(x_data[i-x])
                reshape_data.append(temp_list)
                reshape_label.append(y_data[i])
        return reshape_data, reshape_label
        
    def reshape_with_add(self, x_data, y_data):
        x = []
        y = []
        for index, token in enumerate(x_data):
            if index >= context_size - 1:
                tempTokens = np.sum(x_data[index - context_size + 1:index + 1, :], axis=0)
                x.append(tempTokens)
                y.append(y_data[index])
        return x, y;

    # neural network functions
    def create_NN(self):
        tf.reset_default_graph()
        self.input_x = tf.placeholder(dtype=tf.float32, shape=[None, self.tokens_size], name='input_x')
        self.output_y = tf.placeholder(dtype=tf.float32, shape=[None, self.tokens_size], name='output_y')
        weights = {'h1': tf.Variable(tf.truncated_normal(shape=[self.tokens_size, hidden_size])),
                   'h2': tf.Variable(tf.truncated_normal(shape=[hidden_size, hidden_size])),
                   'h3': tf.Variable(tf.truncated_normal(shape=[hidden_size, hidden_size])),
                   'output': tf.Variable(tf.truncated_normal(shape=[hidden_size, self.tokens_size]))}
        biases = {'h1': tf.Variable(tf.constant(0.1, shape=[hidden_size], dtype=tf.float32)),
                  'h2': tf.Variable(tf.constant(0.1, shape=[hidden_size], dtype=tf.float32)),
                  'h3': tf.Variable(tf.constant(0.1, shape=[hidden_size], dtype=tf.float32)),
                  'output': tf.Variable(tf.constant(0.1, shape=[self.tokens_size], dtype=tf.float32))}

        h1_layer = tf.matmul(self.input_x, weights['h1']) + biases['h1']
        h1_layer = tf.nn.relu(h1_layer)
        h2_layer = tf.matmul(h1_layer, weights['h2']) + biases['h2']
        h2_layer = tf.nn.relu(h2_layer)
        h3_layer = tf.matmul(h2_layer, weights['h3']) + biases['h3']
        h3_layer = tf.nn.relu(h3_layer)
        output_layer = tf.matmul(h3_layer, weights['output']) + biases['output']
        self.prediction = tf.argmax(output_layer, 1)
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=output_layer, labels=self.output_y)
        self.loss = tf.reduce_mean(loss)
        self.optimizer_op = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)
        equal = tf.equal(tf.argmax(output_layer, 1), tf.argmax(self.output_y, 1))
        accuracy = tf.cast(equal, tf.float32)
        self.accuracy = tf.reduce_mean(accuracy)
        
#         self.valid_loss = tf.reduce_mean(loss)
#         self.valid_accuracy = tf.reduce_mean(accuracy)

        tf.summary.histogram('weight1', weights['h1'])
        tf.summary.histogram('weight2', weights['h2'])
        tf.summary.histogram('output_weight', weights['output'])
        tf.summary.histogram('bias1', biases['h1'])
        tf.summary.histogram('bias2', biases['h2'])
        tf.summary.histogram('output_bias', biases['output'])
        tf.summary.scalar('train_loss', self.loss)
        tf.summary.scalar('train_accuracy', self.accuracy)
         
        self.merged = tf.summary.merge_all()

    def get_batch(self):
        for i in range(0, len(self.x_data), batch_size):
            batch_x = self.x_data[i:i + batch_size];
            batch_y = self.y_data[i:i + batch_size];
            yield batch_x, batch_y

    def train(self):
        self.create_NN()
        self.sess = tf.Session()
        valid_accu_list = np.zeros(10, dtype=np.float32)
        train_accu_list = np.zeros(10, dtype=np.float32)
        valid_list_index = 0
        train_list_index = 0
        writer = tf.summary.FileWriter(tensorboard_data_path, self.sess.graph)
        time_begin = time.time()
        self.sess.run(tf.global_variables_initializer())
        for epoch in range(epoch_num):
          #  self.x_data, self.y_data = shuffle(self.x_data, self.y_data)
            batch_generator = self.get_batch()
            for i in range(0, len(self.x_data), batch_size):
                batch_x, batch_y = next(batch_generator)
                feed = {self.input_x: batch_x, self.output_y: batch_y}
                _, summary_str = self.sess.run([self.optimizer_op, self.merged], feed_dict=feed)
                writer.add_summary(summary_str, epoch*self.data_size + i)
                writer.flush()
                if (i // batch_size) % 2000 == 0:
                    print('epoch: %d, step: %d'%(epoch, i))
                    train_loss, train_accu = self.sess.run([self.loss, self.accuracy], feed_dict=feed)
                    train_accu_list[train_list_index % 10] = train_accu
                    print('train loss: %.2f, train accuracy:%.3f' % (train_loss, train_accu))
                    print('average train accuracy: %.4f'%(np.mean(train_accu_list)))
                    valid_feed = {self.input_x:self.valid_x, self.output_y:self.valid_y}
                    valid_loss, valid_acc = self.sess.run([self.loss, self.accuracy], feed_dict=valid_feed)
                    valid_accu_list[valid_list_index % 10] = valid_acc
                    print('valid loss: %.2f, valid accuracy:%.3f' % (valid_loss, valid_acc))
                    print('average valid accuracy: %.4f'%(np.mean(valid_accu_list)))
        time_end = time.time()
        print('training time cost: %.3f s' % (time_end - time_begin))

    # query test
    def query_test(self, prefix, suffix):
        '''
        Input: all tokens before the hole token(prefix) and all tokens after the hole token,
        ML model will predict the most probable token in the hole. In this function, use only one token before hole token to predict
        '''
        previous_token_list = prefix[-context_size:]
        context_representation = np.zeros(self.tokens_size)

        for token in previous_token_list:
            prev_token_string = data_utils.token_to_string(token)
            pre_token_x = data_utils.one_hot_encoding(prev_token_string, self.string_to_index)
            context_representation += np.array(pre_token_x)

        feed = {self.input_x: [context_representation]}
        prediction = self.sess.run(self.prediction, feed)[0]
        best_string = self.index_to_string[prediction]
        best_token = data_utils.string_to_token(best_string)
        return [best_token]

    # test model
    def test_model(self, query_test_data):
        correct = 0.0
        correct_token_list = []
        incorrect_token_list = []
        for token_sequence in query_test_data:
            prefix, expection, suffix = data_utils.create_hole(token_sequence)
            prediction = self.query_test(prefix, suffix)[0]
            if data_utils.token_equals([prediction], expection):
                correct += 1
                correct_token_list.append({'expection': expection, 'prediction': prediction})
            else:
                incorrect_token_list.append({'expection': expection, 'prediction': prediction})
        accuracy = correct / len(query_test_data)
        return accuracy

In [None]:
x_train_data_path = 'processed_data/x_train_data.p'
y_train_data_path = 'processed_data/y_train_data.p'
train_data_parameter = 'processed_data/x_y_parameter.p'
x_data = data_utils.load_data_with_pickle(x_train_data_path)
y_data = data_utils.load_data_with_pickle(y_train_data_path)
token_set, string2int, int2string = data_utils.load_data_with_pickle(train_data_parameter)

In [None]:
#model train
model = Code_Completion_Model(x_data, y_data, token_set, string2int, int2string)
model.train()

In [None]:
# test model
query_test_data = data_utils.load_data_with_file(query_dir)
accuracy = model.test_model(query_test_data)
print('query test accuracy: ', accuracy)

In [None]:
import random
from collections import Counter
import data_utils
string_processed_data_path = 'processed_data/str_train_data.p'

class Markov_Model(object):

    def __init__(self, max_length=1, is_most=False):
        self.markov_table = {}
        self.max_length = 1
        self.is_most = False

    def create_model(self, token_list, max_depth=1, is_most=False):
        '''
        create a markov model with the depth from 1 to max_depth
        {
            depth1:{
                key1:[value1, value2 ..]
            }
        }
        '''
        self.is_most = is_most
        self.max_length = max_depth
        for depth in range(1, max_depth+1):
            temp_table = {}
            for index in range(depth, len(token_list)):
                words = tuple(token_list[index-depth:index])
                if words in temp_table.keys():
                    temp_table[words].append(token_list[index])
                else:
                    temp_table.setdefault(words, []).append(token_list[index])
            if is_most:
                for key,value in temp_table.items():
                    temp = Counter(value).most_common(1)[0][0]
                    temp_table[key] = temp
                self.markov_table[depth] = temp_table
            else:
                self.markov_table[depth] = temp_table
        return self.markov_table

    def test_model(self, test_token_lists, depth=1):
        correct = 0
        correct_token_list = []
        incorrect_token_list = []

        for tokens in test_token_lists:
            prefix, expection, suffix = data_utils.create_hole(tokens)
            prediction = self.query_test(prefix, depth=depth)
            if prediction['type']==expection[0]['type'] and prediction['value'] == expection[0]['value']:
                correct += 1
                correct_token_list.append({'expection': expection, 'prediction': prediction})
            else:
                incorrect_token_list.append({'expection': expection, 'prediction': prediction})
        accuracy = correct / len(test_token_lists)
        return accuracy


    def query_test(self, pre_tokens, depth=1):
        while(depth>self.max_length):
            depth -= 1
        used_tokens = pre_tokens[-depth:]
        proceed_tokens = []
        for token in used_tokens:
            proceed_tokens.append(data_utils.token_to_string(token))
        proceed_tokens = tuple(proceed_tokens)
        while proceed_tokens not in self.markov_table[depth].keys() and depth > 1:
            depth -= 1
            proceed_tokens = tuple(proceed_tokens[-depth:])

        if self.is_most:
            candidate = self.markov_table[depth][proceed_tokens]
        else:
            candidate_list = self.markov_table[depth][proceed_tokens]
            random_index = random.randint(0, len(candidate_list)-1)
            candidate = candidate_list[random_index]
        prediction = data_utils.string_to_token(candidate)
        return prediction

In [None]:
string_token_list = data_utils.load_data_with_pickle(string_processed_data_path)

In [None]:
markov_model = Markov_Model()
markov_table = markov_model.create_model(string_token_list, max_depth=6, is_most=True)

In [None]:
#print(markov_table[1].keys())
#print(markov_table[1][('Keyword~$$~var',)])
#print(markov_table[2].keys())
test_token_sequences = data_utils.load_data_with_file()
accuracy = 0.0
test_epoch = 10
for i in range(test_epoch):
    accuracy += markov_model.test_model(test_token_sequences, depth=6)
accuracy /= test_epoch
print(accuracy)

In [None]:
import numpy as np
data = np.arange(40).reshape((8,5))
label = np.arange(8)
print(data)
print(label)
size = 3
reshape_data = []
reshape_label = []
for i in range(len(data)):
    if i >= size-1:
        temp_list = []
        for x in range(size):
            temp_list.extend(data[i-x])
        reshape_data.append(temp_list)
        reshape_label.append(label[i])
        
print(reshape_data)
print(reshape_label)