In [1]:
import tensorflow
import numpy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import preprocessing
from tensorflow.keras.layers import  Embedding, Dense, Flatten, LSTM, Dropout, BatchNormalization, Bidirectional, Conv1D, MaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
import io
import json
import os

In [2]:
# Load (author id, message) pairs from train.txt.
#
# Each line is split on a backslash: the part before the first backslash is
# treated as the author id, the part after it as the message text.

# The loop stops when its 1-based counter reaches this value, so at most
# MAX_LINES - 1 unique lines are examined.
MAX_LINES = 20000

samples = []
ids = []

# NOTE(review): no encoding is given, so the platform default is used --
# confirm the encoding of train.txt.
with open('train.txt') as text:
    # dict.fromkeys drops duplicate lines while keeping first-seen order.
    lines = list(dict.fromkeys(text.readlines()))

for line in lines[:MAX_LINES - 1]:
    splittedLines = line.split('\\')
    # Skip lines without a backslash separator and lines written by 'TradeBot'.
    if len(splittedLines) == 1 or splittedLines[0] == 'TradeBot':
        continue
    # Keep only alphanumeric characters so every id is a single token.
    author_id = ''.join(ch for ch in splittedLines[0] if ch.isalnum())
    if not author_id:
        # An id with no alphanumeric characters would presumably tokenize to
        # an empty sequence and make the label array ragged -- skip the row.
        continue
    samples.append(splittedLines[1])
    ids.append(author_id)

In [3]:
# Number of message texts collected by the loading cell.
len(samples)

19613

In [4]:
# Number of author ids collected; should equal len(samples) since both lists are appended together.
len(ids)

19613

In [5]:
# Build one vocabulary for message text and a separate one for author ids.
tokenizer = Tokenizer(num_words=10000)
tokenizerForNames = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)
tokenizerForNames.fit_on_texts(ids)

# Persist both tokenizers. The string returned by to_json() is passed through
# json.dumps once more before writing; that on-disk format is kept unchanged.
tokenizer_json = tokenizer.to_json()
tokenizerForNames_json = tokenizerForNames.to_json()

for json_path, serialized in (
    ('tokenizer.json', tokenizer_json),
    ('tokenizerForNames.json', tokenizerForNames_json),
):
    with io.open(json_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(serialized, ensure_ascii=False))

In [6]:
# Encode every message as a list of word indices.
sequences = tokenizer.texts_to_sequences(samples)

# Encode every author id with the names tokenizer.
names_array = numpy.asarray(ids)
tokenizedNames = tokenizerForNames.texts_to_sequences(names_array)

# Sanity check: the difference is 0 when inputs and labels line up.
len(sequences) - len(tokenizedNames)


0

In [7]:
# Inputs: pad/truncate every message to 40 tokens.
x_train = preprocessing.sequence.pad_sequences(sequences, maxlen=40)
# Labels: one-hot encode the tokenized author ids.
y_train = to_categorical(tokenizedNames, dtype=int)

# Row count, then the input/label row-count difference (expected to be 0).
print(len(x_train), len(x_train) - len(y_train), sep='\n')

19613
0


In [8]:
# Evaluated but not displayed: only the last expression of a cell is shown.
x_train.shape
# Number of label columns once column 0 of the one-hot matrix is dropped.
y_train[:,1:].shape[1]

57

In [13]:
# Author classifier: embedding -> 1D convolution -> stacked recurrent layers
# -> dense softmax over the label columns.
model = Sequential()
# Vocabulary size 10000 and sequence length 40 match the values used for the
# text Tokenizer (num_words) and pad_sequences (maxlen).
model.add(Embedding(10000, 64, input_length=40))
model.add(Conv1D(32, 8, activation='relu'))
model.add(MaxPooling1D(5))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(LSTM(64, return_sequences=True))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(LSTM(64, return_sequences=True))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(100, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))
# Collapse the remaining time steps before the classification layer.
model.add(Flatten())
# One output unit per label column; column 0 of y_train is excluded.
model.add(Dense(y_train[:,1:].shape[1], activation='softmax'))
# Fix: metrics=['acc'] used to be a stray assignment on its own line after
# compile(), so accuracy was never tracked. It is now passed to compile().
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])
model.summary()

checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tensorflow.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 40, 64)            640000    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 33, 32)            16416     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 6, 32)             0         
_________________________________________________________________
batch_normalization_10 (Batc (None, 6, 32)             128       
_________________________________________________________________
dropout_10 (Dropout)         (None, 6, 32)             0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 6, 128)            49664     
_________________________________________________________________
batch_normalization_11 (Batc (None, 6, 128)           

In [14]:
# Train on the padded messages; column 0 of the one-hot labels is dropped so
# the targets match the width of the softmax layer. The last 20% of the rows
# are held out for validation and weights are checkpointed via cp_callback.
train_labels = y_train[:, 1:]
history = model.fit(
    x=x_train,
    y=train_labels,
    batch_size=32,
    epochs=10,
    callbacks=[cp_callback],
    validation_split=0.2,
)

Epoch 1/10
Epoch 00001: saving model to training_1\cp.ckpt
Epoch 2/10
Epoch 00002: saving model to training_1\cp.ckpt
Epoch 3/10
Epoch 00003: saving model to training_1\cp.ckpt
Epoch 4/10
Epoch 00004: saving model to training_1\cp.ckpt
Epoch 5/10
Epoch 00005: saving model to training_1\cp.ckpt
Epoch 6/10
Epoch 00006: saving model to training_1\cp.ckpt
Epoch 7/10
Epoch 00007: saving model to training_1\cp.ckpt
Epoch 8/10
Epoch 00008: saving model to training_1\cp.ckpt
Epoch 9/10
Epoch 00009: saving model to training_1\cp.ckpt
Epoch 10/10
Epoch 00010: saving model to training_1\cp.ckpt


In [269]:
# Pair each tokenized id with its original id string for side-by-side inspection.
numpy.transpose([tokenizedNames,ids])

array([[list([2]), 'Giallar'],
       [list([6]), 'TBPSS'],
       [list([6]), 'TBPSS'],
       ...,
       [list([2]), 'Giallar'],
       [list([1]), 'nnnn1111'],
       [list([1]), 'nnnn1111']], dtype=object)

In [270]:
# Softmax outputs for the training inputs (one row per sample, one column per label).
# Note: this scores the same data the model was fitted on.
model.predict(x_train)

array([[1.30268678e-01, 1.59834355e-01, 1.46082804e-01, ...,
        7.94296328e-04, 4.03043145e-04, 4.17473639e-04],
       [1.21235862e-01, 1.53325483e-01, 1.19691685e-01, ...,
        8.48133350e-04, 5.48269076e-04, 5.14014275e-04],
       [1.21235862e-01, 1.53325483e-01, 1.19691685e-01, ...,
        8.48133350e-04, 5.48269076e-04, 5.14014275e-04],
       ...,
       [1.21920735e-01, 1.08901031e-01, 1.56914830e-01, ...,
        8.43893096e-04, 4.26757295e-04, 4.41761833e-04],
       [6.69955373e-01, 3.61766405e-02, 1.59100667e-01, ...,
        3.13478144e-04, 1.75582449e-04, 1.80902425e-04],
       [6.69955373e-01, 3.61766405e-02, 1.59100667e-01, ...,
        3.13478144e-04, 1.75582449e-04, 1.80902425e-04]], dtype=float32)

In [271]:
# Most probable class per sample. A single vectorised argmax along the class
# axis replaces the per-row Python map(); the result is kept as a column
# vector of shape (len(x_train), 1).
predictions = model.predict(x_train)
output = numpy.argmax(predictions, axis=1).reshape(len(x_train), 1)
print(output)

[[3]
 [1]
 [1]
 ...
 [3]
 [0]
 [0]]


In [272]:
# The model was trained on y_train[:, 1:] (column 0 dropped), so predicted
# class k corresponds to tokenizer index k + 1; add 1 before decoding to names.
tokenizerForNames.sequences_to_texts(output + 1)

['penspam',
 'giallar',
 'giallar',
 'kolm5',
 'kolm5',
 'byfreeze',
 'byfreeze',
 'byfreeze',
 'byfreeze',
 'giallar',
 'byfreeze',
 'byfreeze',
 'giallar',
 'byfreeze',
 'byfreeze',
 'byfreeze',
 'tbpss',
 'byfreeze',
 'byfreeze',
 'byfreeze',
 'byfreeze',
 'byfreeze',
 'tbpss',
 'nnnn1111',
 'byfreeze',
 'byfreeze',
 'tbpss',
 'penspam',
 'nnnn1111',
 'tbpss',
 'nnnn1111',
 'tbpss',
 'tbpss',
 'tbpss',
 'tbpss',
 'tbpss',
 'kolm5',
 'tbpss',
 'giallar',
 'tbpss',
 'nnnn1111',
 'giallar',
 'byfreeze',
 'giallar',
 'tbpss',
 'giallar',
 'nnnn1111',
 'byfreeze',
 'tbpss',
 'giallar',
 'byfreeze',
 'byfreeze',
 'nnnn1111',
 'tbpss',
 'nnnn1111',
 'byfreeze',
 'giallar',
 'byfreeze',
 'giallar',
 'giallar',
 'byfreeze',
 'byfreeze',
 'giallar',
 'tbpss',
 'tbpss',
 'tbpss',
 'tbpss',
 'tbpss',
 'byfreeze',
 'tbpss',
 'giallar',
 'byfreeze',
 'byfreeze',
 'nnnn1111',
 'byfreeze',
 'tbpss',
 'byfreeze',
 'nnnn1111',
 'byfreeze',
 'giallar',
 'giallar',
 'nnnn1111',
 'giallar',
 'byfreeze',

In [273]:
# Look up which name the names tokenizer maps index 1 to.
tokenizerForNames.sequences_to_texts([[1]])


['nnnn1111']