In [133]:
import numpy as np
np.random.seed(1337)  # for reproducibility
import tensorflow as tf
from tensorflow.python.ops import array_ops
from keras import backend as K
from keras import regularizers
from keras.models import Model
from keras.layers import Dense, Input, Dropout
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import json

In [2]:
with open('7-gram.json', 'r') as f:
    grams = json.load(fp=f)
with open('functions.json', 'r') as f:
    functions = json.load(fp=f)
    
print('grams: ', len(grams))
print('functions: ', len(functions))

grams:  315027
functions:  33867


In [3]:
# this are the functions to change intger to binary number and the reverse way
def int_to_bits(a):
    return format(a if a >= 0 else (1<<32) + a, '032b')

def bits_to_int(b):
    return np.int32(np.uint32(int(b, 2))).item()


# this function changes the binary string into a numpy array
def bits_to_array(c):
    return np.frombuffer(b"%b" % c.encode('utf-8'), 'u1') - ord('0')

In [4]:
# this part assign different binary string to each gram and store them in a dictionary
encode_gram = {}
for gram in enumerate(sorted(grams)):
    encode_gram[gram[1]] = int_to_bits(gram[0] + 1)
with open('encode_gram.json', 'w') as f:
    json.dump(encode_gram, f)

In [5]:
# this function generate the 7-gram features of the java function
def generate_features(lines):
    function = ''
    for line in lines:
        function += line
    tokens = function.split()
    length = len(tokens)
    result = []
    if length < 7:
        string = ''
        for i in range(length):
            if i != length - 1:
                string = string + tokens[i] + " "
            else:
                string = string + tokens[i]
    else:
        for i in range(length -7 + 1):
            string = ''
            for token in tokens[i:i+7]:
                string = string + token + " "
            string = string[:-1]
            result.append(string)
    return result

In [6]:
# this step get all the data into a big numpy array
data = []

for file in functions:
    temp = np.array([])
    for function in functions[file]:
        for gram in generate_features(functions[file][function]):
            temp = np.concatenate([temp, bits_to_array(encode_gram[gram])])
        
    temp = temp.reshape(-1, 32)
    data.append(temp)

print('done')

data = np.array(data)
print(data.shape)


done
(33867,)


In [7]:
# this method fill up the missing value with zero and make sure all the arrays are in the same shape
max_length = data[0].shape
length = []
for i in data:
    length.append(i.shape[0])
    if i.shape[0] > max_length[0]:
        max_length = i.shape
print('median: ', np.median(length))
print('average: ', np.average(length))

median:  522.0
average:  813.587208787315


In [8]:
# slice the data into (3200, 32)
time_step = 3200
for i in range(len(data)):
    diff = time_step - data[i].shape[0]
    if diff > 0:
        data[i] = np.pad(data[i], ((0,diff), (0,0)), 'constant')
print('done')

done


In [9]:
dimension = len(data)
X = np.zeros((dimension, time_step, 32))

In [10]:
# slice 0
for d in range(5000):
    if d % 50 == 0:
        print(d)
    for g in range(time_step):
        for b in range(len(data[d][g])):
            X[d,g,b] = data[d][g][b]
print('done')

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
4150
4200
4250
4300
4350
4400
4450
4500
4550
4600
4650
4700
4750
4800
4850
4900
4950
done


In [11]:
# slice 1
for d in range(5000,10000):
    if d % 50 == 0:
        print(d)
    for g in range(time_step):
        for b in range(len(data[d][g])):
            X[d,g,b] = data[d][g][b]
print('done')

5000
5050
5100
5150
5200
5250
5300
5350
5400
5450
5500
5550
5600
5650
5700
5750
5800
5850
5900
5950
6000
6050
6100
6150
6200
6250
6300
6350
6400
6450
6500
6550
6600
6650
6700
6750
6800
6850
6900
6950
7000
7050
7100
7150
7200
7250
7300
7350
7400
7450
7500
7550
7600
7650
7700
7750
7800
7850
7900
7950
8000
8050
8100
8150
8200
8250
8300
8350
8400
8450
8500
8550
8600
8650
8700
8750
8800
8850
8900
8950
9000
9050
9100
9150
9200
9250
9300
9350
9400
9450
9500
9550
9600
9650
9700
9750
9800
9850
9900
9950
done


In [12]:
# slice 2
for d in range(10000,15000):
    if d % 50 == 0:
        print(d)
    for g in range(time_step):
        for b in range(len(data[d][g])):
            X[d,g,b] = data[d][g][b]
print('done')

10000
10050
10100
10150
10200
10250
10300
10350
10400
10450
10500
10550
10600
10650
10700
10750
10800
10850
10900
10950
11000
11050
11100
11150
11200
11250
11300
11350
11400
11450
11500
11550
11600
11650
11700
11750
11800
11850
11900
11950
12000
12050
12100
12150
12200
12250
12300
12350
12400
12450
12500
12550
12600
12650
12700
12750
12800
12850
12900
12950
13000
13050
13100
13150
13200
13250
13300
13350
13400
13450
13500
13550
13600
13650
13700
13750
13800
13850
13900
13950
14000
14050
14100
14150
14200
14250
14300
14350
14400
14450
14500
14550
14600
14650
14700
14750
14800
14850
14900
14950
done


In [13]:
# slice 3
for d in range(15000,20000):
    if d % 50 == 0:
        print(d)
    for g in range(time_step):
        for b in range(len(data[d][g])):
            X[d,g,b] = data[d][g][b]
print('done')

15000
15050
15100
15150
15200
15250
15300
15350
15400
15450
15500
15550
15600
15650
15700
15750
15800
15850
15900
15950
16000
16050
16100
16150
16200
16250
16300
16350
16400
16450
16500
16550
16600
16650
16700
16750
16800
16850
16900
16950
17000
17050
17100
17150
17200
17250
17300
17350
17400
17450
17500
17550
17600
17650
17700
17750
17800
17850
17900
17950
18000
18050
18100
18150
18200
18250
18300
18350
18400
18450
18500
18550
18600
18650
18700
18750
18800
18850
18900
18950
19000
19050
19100
19150
19200
19250
19300
19350
19400
19450
19500
19550
19600
19650
19700
19750
19800
19850
19900
19950
done


In [14]:
# slice 4
for d in range(20000,25000):
    if d % 50 == 0:
        print(d)
    for g in range(time_step):
        for b in range(len(data[d][g])):
            X[d,g,b] = data[d][g][b]
print('done')

20000
20050
20100
20150
20200
20250
20300
20350
20400
20450
20500
20550
20600
20650
20700
20750
20800
20850
20900
20950
21000
21050
21100
21150
21200
21250
21300
21350
21400
21450
21500
21550
21600
21650
21700
21750
21800
21850
21900
21950
22000
22050
22100
22150
22200
22250
22300
22350
22400
22450
22500
22550
22600
22650
22700
22750
22800
22850
22900
22950
23000
23050
23100
23150
23200
23250
23300
23350
23400
23450
23500
23550
23600
23650
23700
23750
23800
23850
23900
23950
24000
24050
24100
24150
24200
24250
24300
24350
24400
24450
24500
24550
24600
24650
24700
24750
24800
24850
24900
24950
done


In [15]:
# slice 5
for d in range(25000, 30000):
    if d % 50 == 0:
        print(d)
    for g in range(time_step):
        for b in range(len(data[d][g])):
            X[d,g,b] = data[d][g][b]
print('done')

25000
25050
25100
25150
25200
25250
25300
25350
25400
25450
25500
25550
25600
25650
25700
25750
25800
25850
25900
25950
26000
26050
26100
26150
26200
26250
26300
26350
26400
26450
26500
26550
26600
26650
26700
26750
26800
26850
26900
26950
27000
27050
27100
27150
27200
27250
27300
27350
27400
27450
27500
27550
27600
27650
27700
27750
27800
27850
27900
27950
28000
28050
28100
28150
28200
28250
28300
28350
28400
28450
28500
28550
28600
28650
28700
28750
28800
28850
28900
28950
29000
29050
29100
29150
29200
29250
29300
29350
29400
29450
29500
29550
29600
29650
29700
29750
29800
29850
29900
29950
done


In [16]:
# slice 6
for d in range(30000,dimension):
    if d % 50 == 0:
        print(d)
    for g in range(time_step):
        for b in range(len(data[d][g])):
            X[d,g,b] = data[d][g][b]
print('done')
print(X.shape)

30000
30050
30100
30150
30200
30250
30300
30350
30400
30450
30500
30550
30600
30650
30700
30750
30800
30850
30900
30950
31000
31050
31100
31150
31200
31250
31300
31350
31400
31450
31500
31550
31600
31650
31700
31750
31800
31850
31900
31950
32000
32050
32100
32150
32200
32250
32300
32350
32400
32450
32500
32550
32600
32650
32700
32750
32800
32850
32900
32950
33000
33050
33100
33150
33200
33250
33300
33350
33400
33450
33500
33550
33600
33650
33700
33750
33800
33850
done
(33867, 3200, 32)


In [97]:
data = X
data = data.reshape(data.shape[0],-1)
print(data.shape)

# in order to plot in a 2D figure
encoding_dim = 24

# input dim
dim = 32*time_step
inputs = Input(shape=(dim,))

(33867, 102400)


In [156]:
# loss func
def se_loss(y_true, y_pred):
    K_sum = K.sum(K.square(y_pred - y_true), axis=-1)
#     array = np.where(K_sum > 0, 1, 0)
#     sum_arr = tf.convert_to_tensor(array, np.float64)
    return K.tanh(K_sum)

# a = X[:1]
# b = X[1:2]
# print(se_loss(a,b))
# out = se_loss(a,b).eval()
# print(out)

Tensor("Const_1490:0", shape=(1, 3200), dtype=float64)
[[1. 1. 1. ... 0. 0. 0.]]


In [157]:
# encoder layers
encoded = Dense(100, activation='relu')(inputs)
encoded = Dense(64, activation='relu')(encoded)
encoded = Dense(64, activation='relu')(encoded)
encoder_out = Dense(encoding_dim, activation='tanh')(encoded)

# decoder layers
decoded = Dense(64, activation='relu')(encoder_out)
decoded = Dense(64, activation='relu')(decoded)
decoded = Dense(100, activation='relu')(decoded)
decoded = Dense(dim, activation='tanh')(decoded)

sequence_autoencoder = Model(inputs, decoded)
encoder = Model(inputs, encoder_out)

# compile autoencoder
adam = Adam(0.001)
sequence_autoencoder.compile(optimizer=adam, loss=se_loss, metrics=['accuracy'])

encoder.summary()
print("Training ... ...")
sequence_autoencoder.fit(data, data,
                epochs=50,
                batch_size=512,
                shuffle=True)

InvalidArgumentError: You must feed a value for placeholder tensor 'input_8' with dtype float
	 [[Node: input_8 = Placeholder[dtype=DT_FLOAT, shape=[], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

Caused by op 'input_8', defined at:
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/traitlets/config/application.py", line 596, in launch_instance
    app.start()
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2723, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2825, in run_ast_nodes
    if self.run_code(code, result):
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2885, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-97-b49a89c1113a>", line 10, in <module>
    inputs = Input(shape=(dim,))
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/keras/engine/topology.py", line 1414, in Input
    input_tensor=tensor)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/keras/legacy/interfaces.py", line 88, in wrapper
    return func(*args, **kwargs)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/keras/engine/topology.py", line 1325, in __init__
    name=self.name)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py", line 391, in placeholder
    x = tf.placeholder(dtype, shape=shape, name=name)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/tensorflow/python/ops/array_ops.py", line 1507, in placeholder
    name=name)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1997, in _placeholder
    name=name)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
    op_def=op_def)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/gpfsnyu/packages/lpython/3.5/intelpython3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'input_8' with dtype float
	 [[Node: input_8 = Placeholder[dtype=DT_FLOAT, shape=[], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]


In [107]:
encoded_test1 = encoder.predict(X[184:185].reshape(X[:1].shape[0],-1))
encoded_test2 = encoder.predict(X[8:9].reshape(X[:1].shape[0],-1))


print(X[1500:1501] == X[3:4])

print(np.where(encoded_test1 > 0, 1, 0) == np.where(encoded_test2 > 0, 1, 0))

xshape = X[:1].shape

encoded_test3 = sequence_autoencoder.predict(X[:1].reshape(X[:1].shape[0],-1))
encoded_test3_out = np.where(encoded_test3 > 0, 1, 0)

encoded_test4 = sequence_autoencoder.predict(X[1500:1501].reshape(X[:1].shape[0],-1))
encoded_test4_out = np.where(encoded_test3 > 0, 1, 0)

encoded_test5 = sequence_autoencoder.predict(X[3:4].reshape(X[:1].shape[0],-1))
encoded_test5_out = np.where(encoded_test3 > 0, 1, 0)

print(encoded_test5_out.reshape(-1,32) == encoded_test4_out.reshape(-1,32))

print(xshape)
print(np.in1d(encoded_test3_out, X[:1]))

[[[ True  True  True ...  True False False]
  [ True  True  True ...  True False False]
  [ True  True  True ...  True False False]
  ...
  [ True  True  True ...  True  True  True]
  [ True  True  True ...  True  True  True]
  [ True  True  True ...  True  True  True]]]
[[ True False False False False False False False False  True False False
  False  True  True False False  True  True  True  True  True False  True]]
[[ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 ...
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]]
(1, 3200, 32)
[ True  True  True ...  True  True  True]


In [None]:
'''9, 100, 185, 424, 487, 547, 601, 697, 766, 767, 776, 1323, 1354, 1380,'''