In [None]:
import glob
import os
import msgpack
from tqdm.auto import tqdm

from silence_tensorflow import silence_tensorflow
silence_tensorflow()
import tensorflow as tf
import tensorflow.keras as keras
import keras.models as models
import keras.layers as layers
from keras.preprocessing.text import Tokenizer
import numpy as np
import pandas as pd
import parmap

In [None]:
# Ask the user for the training-set directory, then list every *.msgpack file
# directly inside it.
train_dataset_path = input("Enter the path of the train dataset: ")
train_pattern = os.path.join(train_dataset_path, '*.msgpack')
files = glob.glob(train_pattern)

In [None]:
# Read every training sample into memory. Each msgpack file is decoded and
# stored as a (data[0], int(data[1])) tuple; data[0] is presumably the token
# list and data[1] the class label -- confirm against the dataset writer.
dataset = []
for path in tqdm(files):
    # A context manager closes each handle as soon as the file is decoded,
    # instead of leaving one open descriptor per sample for the GC to reclaim.
    with open(path, 'rb') as sample_file:
        data = msgpack.load(sample_file)
    dataset.append((data[0], int(data[1])))

In [None]:
# Turn each sample into one space-separated string and collect its label in
# a parallel list.
features, labels = [], []

for tokens, label in tqdm(dataset):
    features.append(' '.join(tokens))
    labels.append(label)

# The raw tuples are not used past this point; drop the reference.
del dataset

In [None]:
tokenizer = Tokenizer()

# Reuse the vocabulary cached in word_index.opcode.msgpack when that file
# exists; otherwise fit the tokenizer on the training texts.
# flag_load records whether the vocabulary came from the cache file.
flag_load = False
if not os.path.isfile('word_index.opcode.msgpack'):
    tokenizer.fit_on_texts(tqdm(features))
else:
    with open('word_index.opcode.msgpack', 'rb') as cache_file:
        tokenizer.word_index = msgpack.load(cache_file)
    flag_load = True

In [None]:
# Write the word index to word_index.opcode.msgpack, but only when it was
# fitted in this session rather than loaded from that same file.
if not flag_load:
    with open('word_index.opcode.msgpack', 'wb') as cache_file:
        msgpack.dump(tokenizer.word_index, cache_file)

In [None]:
# Convert each text to its sequence of word indices, then force every
# sequence to a fixed length of 100 (padding is appended at the end).
sequences = tokenizer.texts_to_sequences(tqdm(features))
padded_sequences = keras.preprocessing.sequence.pad_sequences(
    tqdm(sequences),
    maxlen=100,
    padding='post',
)

In [None]:
# Keras Tokenizer word indices start at 1 (0 is never assigned to a word and
# doubles as the padding value), so the largest index equals
# len(word_index). Embedding's input_dim must be "largest index + 1";
# without the +1 the last vocabulary entry falls outside the lookup table.
vocab_size = len(tokenizer.word_index) + 1

# Embedding -> Conv1D -> MaxPooling1D -> LSTM -> single sigmoid unit.
model = models.Sequential()
# input_length=100 matches the maxlen used when padding the sequences.
model.add(layers.Embedding(vocab_size, output_dim=100, input_length=100))
model.add(layers.Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(layers.MaxPooling1D(pool_size=2))
model.add(layers.LSTM(128))
# One sigmoid output for the binary label.
model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

In [None]:
# Configure training (Nadam optimizer, binary cross-entropy loss, accuracy
# metric) and fit for 20 epochs on the padded training sequences.
model.compile(
    loss='binary_crossentropy',
    optimizer='Nadam',
    metrics=['accuracy'],
)
model.fit(
    padded_sequences,
    np.array(labels),
    epochs=20,
)

In [None]:
# Model evaluation: load the test samples.
test_dataset_path = input("Enter the path of the test dataset: ")
# Search the directory the user entered rather than a fixed folder name, so
# the prompt above actually controls which data is evaluated.
test_files = glob.glob(os.path.join(test_dataset_path, '*.msgpack'))

# Each msgpack file is decoded and stored as a (data[0], int(data[1])) tuple.
test_dataset = []
for path in tqdm(test_files):
    # Close each handle as soon as its contents are decoded.
    with open(path, 'rb') as sample_file:
        data = msgpack.load(sample_file)
    test_dataset.append((data[0], int(data[1])))

In [None]:
# Turn each test sample into one space-separated string and collect its
# label in a parallel list.
test_features, test_labels = [], []

for tokens, label in tqdm(test_dataset):
    test_features.append(' '.join(tokens))
    test_labels.append(label)

# The raw tuples are not used past this point; drop the reference.
del test_dataset

# Encode with the already-prepared tokenizer and force every sequence to a
# fixed length of 100 (padding is appended at the end).
test_sequences = tokenizer.texts_to_sequences(tqdm(test_features))
test_padded_sequences = keras.preprocessing.sequence.pad_sequences(
    tqdm(test_sequences),
    maxlen=100,
    padding='post',
)

In [None]:
# Evaluate on the padded test sequences; with a single 'accuracy' metric
# compiled in, evaluate() yields the loss followed by the accuracy.
loss, accuracy = model.evaluate(x=test_padded_sequences, y=np.array(test_labels))

print(f'Loss: {loss}')
# Keras reports accuracy as a fraction in [0, 1]; scale it by 100 so the
# value printed in front of the '%' sign really is a percentage.
print(f'Accuracy: {accuracy * 100:.2f}%')

In [None]:
# Save the model to the relative path models/opcode.h5.
model.save('models/opcode.h5')