In [None]:
import glob
import os
import msgpack
from tqdm.auto import tqdm

from silence_tensorflow import silence_tensorflow
silence_tensorflow()
import tensorflow as tf
import tensorflow.keras as keras
import keras.models as models
import keras.layers as layers
from keras.preprocessing.text import Tokenizer
import numpy as np
import pandas as pd

In [None]:
# Ask for the training-set directory, then collect every '*.ascii.msgpack'
# file that sits directly inside it.
train_dataset_path = input("Enter the path of the train dataset: ")
train_pattern = os.path.join(train_dataset_path, '*.ascii.msgpack')
files = glob.glob(train_pattern)

# Report how many sample files were found.
print(f'Dataset Lenght: {len(files)}')

In [None]:
# Maximum number of leading items (characters, for strings) kept by truncate().
truncate_len = 128


def truncate(x):
    """Return at most the first ``truncate_len`` items of ``x``.

    ``truncate_len`` is read at call time, so rebinding the module-level
    value changes the cut-off for later calls.

    Args:
        x: Any sliceable sequence (the notebook passes strings).

    Returns:
        ``x[:truncate_len]`` -- the same type as ``x``, unchanged when it is
        already short enough.
    """
    return x[:truncate_len]

In [None]:
# Decode every training sample file into memory, one msgpack object per file.
dataset = []
for sample_path in tqdm(files):
    # A context manager closes each handle right after decoding; a bare
    # open() would leave one descriptor open per file until garbage
    # collection, which can exhaust the OS limit on large datasets.
    with open(sample_path, 'rb') as sample_file:
        dataset.append(msgpack.load(sample_file))

In [None]:
# Split each decoded record into a model input and its label:
#   record[0] -> iterable of strings; each one is truncated and the results
#                are joined with single spaces into one text
#   record[1] -> label, stored as-is
features, labels = [], []

for record in tqdm(dataset):
    features.append(' '.join(map(truncate, record[0])))
    labels.append(record[1])

# Release the decoded records; they are not needed any more, which saves memory.
del dataset

# Show the first sample as a sanity check.
print(f'Features: {features[0]}')
print(f'Labels: {labels[0]}')

In [None]:
# Vocabulary cap. It must stay equal to the Embedding layer's input size
# (100000 in the model cell): the Embedding can only look up ids below that
# value. With num_words set, texts_to_sequences() skips any word whose id is
# >= the cap, so an oversized vocabulary can no longer produce out-of-range ids.
MAX_VOCAB_SIZE = 100000
WORD_INDEX_PATH = 'word_index.ascii.msgpack'

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)

# flag_load records whether the vocabulary came from the on-disk cache; the
# following cell uses it to decide whether the vocabulary still has to be saved.
flag_load = False
if os.path.isfile(WORD_INDEX_PATH):
    # A cached word index exists: reuse it instead of re-fitting.
    with open(WORD_INDEX_PATH, 'rb') as f:
        tokenizer.word_index = msgpack.load(f)
        flag_load = True
else:
    # No cache yet: build the vocabulary from the training texts.
    tokenizer.fit_on_texts(tqdm(features))

In [None]:
# Persist the vocabulary as msgpack, but only when it was fitted in this run
# (i.e. it was not loaded from the cache file).
if not flag_load:
    with open('word_index.ascii.msgpack', 'wb') as cache_file:
        msgpack.dump(tokenizer.word_index, cache_file)

In [None]:
# Encode each training text as a list of word ids.
seq = tokenizer.texts_to_sequences(tqdm(features))

# Bring every sequence to exactly 100 ids, padding at the end ('post').
# The truncation side for longer sequences is left at the library default.
pad_seq = keras.preprocessing.sequence.pad_sequences(
    tqdm(seq),
    maxlen=100,
    padding='post',
)

In [None]:
# Binary text classifier: embedding -> 1D convolution -> max pooling -> LSTM
# -> single sigmoid output unit.
model = models.Sequential([
    # Accepts word ids below 100000, 100-dimensional vectors, inputs of length 100.
    layers.Embedding(100000, output_dim=100, input_length=100),
    layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    layers.LSTM(128),
    layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Print the model summary, then the shapes of the padded training inputs and
# of the label array.
model.summary()
for checked_array in (pad_seq, np.array(labels)):
    print(checked_array.shape)

In [None]:
# Compile and train the model (this cell does not write the model to disk).
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
train_labels = np.array(labels)
model.fit(pad_seq, train_labels, epochs=20)

In [None]:
# Load the evaluation data.
# Pressing Enter without typing a path falls back to 'test_set/strings', the
# directory this cell previously always read from.
test_dataset_path = input("Enter the path of the test dataset: ") or 'test_set/strings'
# Glob inside the directory that was actually entered; before, the answer to
# the prompt was ignored and a hard-coded directory was always scanned.
test_files = glob.glob(os.path.join(test_dataset_path, '*.ascii.msgpack'))

test_dataset = []
for test_path in tqdm(test_files):
    # Close each file as soon as it has been decoded instead of leaking the handle.
    with open(test_path, 'rb') as test_file:
        test_dataset.append(msgpack.load(test_file))

test_features = []
test_labels = []

for data in tqdm(test_dataset):
    # data[0]: iterable of strings -> truncated and joined into one text.
    test_features.append(' '.join([truncate(s) for s in data[0]]))
    # data[1]: label, coerced to int.
    test_labels.append(int(data[1]))
# Release the decoded records; they are not needed any more, which saves memory.
del test_dataset

In [None]:
# Encode the test texts with the same tokenizer used for training.
test_seq = tokenizer.texts_to_sequences(tqdm(test_features))

# Bring every test sequence to exactly 100 ids, padding at the end ('post').
# The truncation side for longer sequences is left at the library default.
test_pad_seq = keras.preprocessing.sequence.pad_sequences(
    tqdm(test_seq),
    maxlen=100,
    padding='post',
)

In [None]:
# Evaluate on the held-out test set. With metrics=['accuracy'] the model
# returns the loss followed by the accuracy.
loss, accuracy = model.evaluate(x=test_pad_seq, y=np.array(test_labels))

print(f'Loss: {loss}')
# Keras reports accuracy as a fraction in [0, 1]; scale it to a percentage so
# the '%' suffix is truthful (0.93 used to be printed as "0.93%").
print(f'Accuracy: {accuracy * 100:.2f}%')

In [None]:
# Save the trained model in HDF5 format.
# Create the target directory first so saving also works on a fresh checkout;
# whether model.save() creates it on its own varies by Keras version -- TODO confirm.
os.makedirs('models', exist_ok=True)
model.save('models/ascii.h5')