In [None]:
# If running with Google Colab: mount Google Drive into the runtime so that
# files stored on Drive can be accessed through the local file system.
from google.colab import drive
# Drive contents become available under the /content/drive mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Work from the project folder on the mounted Drive so that every relative
# path used afterwards resolves inside it.
dataFolder = '/content/drive/MyDrive/Colab Notebooks/LM'
os.chdir(dataFolder)

In [None]:
import os
import re
import numpy as np
import argparse
import pickle
import tqdm
%tensorflow_version 1.x
import tensorflow as tf
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM, Embedding
from keras.callbacks import ModelCheckpoint
from keras.backend.tensorflow_backend import set_session

TensorFlow 1.x selected.


Using TensorFlow backend.


In [None]:
# TensorFlow 1.x session setup (tf.ConfigProto / tf.Session are TF1 APIs).
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
sess = tf.Session(config=config)
# Hand the configured session to the Keras TensorFlow backend.
set_session(sess)
# Sanity check: report the GPU device name and how many GPUs are visible.
print("use-gpu:", tf.test.gpu_device_name())
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

use-gpu: /device:GPU:0
Num GPUs Available:  1


In [None]:
# --- Run configuration -------------------------------------------------------
# Raw corpus file, read relative to the working directory.
corpusFile = "dataset2.txt"
# Cache file holding the character sequences derived from the corpus. The name
# is the corpus name without its extension (of any length) plus a fixed
# suffix, e.g. "dataset2.txt" -> "dataset2_char_sequences.txt".
corpusSequenceFile = os.path.splitext(corpusFile)[0] + "_" + "char_sequences.txt"
# Number of input characters per sequence (the character after them is the target).
seq_length = 16
# Number of epochs passed to model.fit for every training part.
epochs = 20
# Number of samples per training part; the data set is trained slice by slice.
part_size = 1024000
# Mini-batch size passed to model.fit.
batch_size = 256
# Checkpoint interval, in epochs, passed to ModelCheckpoint.
period = 5

In [None]:
def checkCorpus(string):
    """Return `string` if it names a regular file in the working directory, else -1.

    The name has to appear verbatim in the listing of the current directory,
    so directories and names that are not listed there both yield -1.
    """
    is_listed_file = string in os.listdir() and os.path.isfile(string)
    return string if is_listed_file else -1

def raw_text_cleaner(text):
    """Remove bracketed spans and drop blank or very short lines.

    Every ``[...]`` span (non-greedy, within one line) is replaced by a space.
    Each line is then trimmed, and only lines with at least 10 characters after
    trimming are kept. Every kept line is terminated by a newline, and runs of
    spaces are finally collapsed into a single space.
    """
    without_brackets = re.sub(r"\[.*?\]", " ", text)
    kept = []
    for raw_line in without_brackets.split("\n"):
        trimmed = raw_line.strip()
        if trimmed != "" and len(trimmed) >= 10:
            kept.append(trimmed + "\n")
    return re.sub(r" +", " ", "".join(kept))

def text_cleaner(text):
    """Normalise text into a single stream of marker-delimited lines.

    The text is lower-cased and every ``'s`` that ends a word is removed. Each
    line is wrapped as ``{line}`` (``{`` marks the beginning, ``}`` the end) and
    the wrapped lines are joined with single spaces. Every character outside
    the allowed set (the two markers, ASCII letters and the Vietnamese letters
    listed in the pattern) is replaced by a space, and whitespace runs are
    collapsed to one space.
    """
    lowered = re.sub(r"'s\b", "", text.lower())
    wrapped_lines = ["{" + line + "}" for line in lowered.split("\n")]
    joined = " ".join(wrapped_lines).strip()
    # Anything that is not a marker or an allowed letter becomes a space.
    letters_only = re.sub("[^{}a-zA-ZạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđẠẢÃÀÁÂẬẦẤẨẪĂẮẰẶẲẴÓÒỌÕỎÔỘỔỖỒỐƠỜỚỢỞỠÉÈẺẸẼÊẾỀỆỂỄÚÙỤỦŨƯỰỮỬỪỨÍÌỊỈĨÝỲỶỴỸĐ]", " ", joined)
    # split() without arguments never yields empty tokens, so every token is kept.
    tokens = [token for token in letters_only.split() if len(token) >= 1]
    return " ".join(tokens).strip()

def load_data(filename):
    """Read a UTF-8 text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the whole file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()

def save_data(lines, filename):
    """Write strings to a UTF-8 text file, one per line.

    The items are joined with ``"\\n"``; no newline is written after the last
    item, so splitting the file on ``"\\n"`` gives back the same items.

    Args:
        lines: Iterable of strings to store.
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    data = '\n'.join(lines)
    # The context manager flushes and closes the handle even if write() raises.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(data)
 
def create_seq(orgText, length, reverted=False):
    """Cut text into overlapping windows of ``length + 1`` characters.

    Consecutive windows start one character apart. The text is reversed first
    when ``reverted`` equals ``True``. The number of windows is printed before
    the list is returned; text with at most ``length`` characters yields an
    empty list.
    """
    # Compared with ``== True`` rather than by truthiness, so only values
    # equal to True trigger the reversal.
    if reverted == True:
        source = orgText[::-1]
    else:
        source = orgText
    windows = [source[end - length:end + 1] for end in range(length, len(source))]
    print('Total Sequences: %d' % len(windows))
    return windows

def constrain(x, min, max):
    """Clamp ``x`` to the range bounded by ``min`` and ``max``.

    The lower bound is tested first, so a value below ``min`` always returns
    ``min``. Note that the parameter names shadow the built-in ``min`` and
    ``max`` inside this function.
    """
    if x < min:
        return min
    return max if x > max else x

In [None]:
# Build the character-sequence cache only when it does not exist yet.
if not os.path.exists(corpusSequenceFile):
    # Load the corpus, strip raw-text noise, then normalise the characters.
    raw_text = text_cleaner(raw_text_cleaner(load_data(corpusFile)))

    # Organise the cleaned text into overlapping character sequences.
    sequences = create_seq(raw_text, seq_length)

    # Store the sequences, one per line, for later runs.
    save_data(sequences, corpusSequenceFile)

In [None]:
# Load the cached sequences: one character window per line.
raw_data = load_data(corpusSequenceFile)
lines = raw_data.split('\n')

# Character vocabulary and the char -> integer id mapping.
# NOTE(review): the vocabulary is built from raw_data, which still contains the
# "\n" separators, so "\n" receives an id although no encoded line contains it.
# Left as is because vocab_size fixes the layer shapes of any saved weights.
chars = sorted(set(raw_data))
mapping = {c: i for i, c in enumerate(chars)}

# Save the mapping; the context manager makes sure the pickle is flushed and
# the file handle is closed.
with open('name_data_mapping.pkl', 'wb') as mapping_file:
    pickle.dump(mapping, mapping_file)

# Integer-encode every line. Empty lines (e.g. from a trailing newline in the
# cache file) are skipped, since they would make the array below ragged.
sequences = [[mapping[char] for char in line] for line in lines if line]

# Vocabulary size
vocab_size = len(mapping)
print('Vocabulary Size: %d' % vocab_size)

# The last character of each window is the target, the rest is the input.
sequences = np.array(sequences)
X_train, y_train = sequences[:,:-1].copy(), sequences[:,-1].copy()

input_shape = (seq_length, vocab_size)
current_part = 0
# Ceiling division: number of slices of at most part_size samples. Unlike
# int(n / part_size) + 1, this adds no empty part when n is an exact multiple
# of part_size.
max_part = (len(X_train) + part_size - 1) // part_size
lastEpoch = 0
# Resume from the part recorded by an earlier, interrupted run, if any.
if os.path.exists('savedEpochs/current_part.txt'):
    with open('savedEpochs/current_part.txt', 'r', encoding='utf8') as f:
        current_part = int(f.read())

Vocabulary Size: 97


In [None]:
# Define the model: it maps seq_length character ids to a probability
# distribution over the vocabulary.
model = Sequential()
# Learn a 64-dimensional vector for every character id.
model.add(Embedding(vocab_size, 64, input_length=seq_length, trainable=True))
model.add(LSTM(512))
model.add(Dropout(0.12))
# One softmax output per vocabulary entry.
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped in
# print(), which would add a stray "None" line to the output.
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 16, 64)            6208      
_________________________________________________________________
lstm_1 (LSTM)                (None, 512)               1181696   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 97)                49761     
Total params: 1,237,665
Trainable params: 1,237,665
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
def _latest_checkpoint(part):
    """Return ``(epoch, path)`` of the newest checkpoint saved for ``part``.

    Checkpoints are files named ``model-epoch-<digits>.h5`` inside
    ``savedEpochs/part_<part>``. Returns ``None`` when that directory is
    missing or holds no such file.
    """
    part_dir = 'savedEpochs/part_%d' % part
    if not os.path.isdir(part_dir):
        return None
    found = []
    for name in os.listdir(part_dir):
        # Requiring a numeric epoch field skips look-alike files that would
        # otherwise make int() raise.
        matched = re.fullmatch(r'model-epoch-(\d+)\.h5', name)
        if matched:
            found.append((int(matched.group(1)), os.path.join(part_dir, name)))
    return max(found) if found else None


# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

lastEpoch = 0
resume_point = _latest_checkpoint(current_part)
if resume_point is not None:
    # The current part already has checkpoints: continue after the newest one.
    lastEpoch, lastEpochFile = resume_point
    model.load_weights(lastEpochFile)
    print("CONTINUE TRAINING FROM PART %d EPOCH %03d......" % (current_part, lastEpoch))
else:
    # The current part has no checkpoint yet (e.g. the run stopped before its
    # first save). Restore the newest weights of an earlier part so that the
    # training done so far is not replaced by a randomly initialised model.
    for previous_part in range(current_part - 1, -1, -1):
        previous_point = _latest_checkpoint(previous_part)
        if previous_point is not None:
            model.load_weights(previous_point[1])
            print("NO CHECKPOINT FOR PART %d YET, RESTORED WEIGHTS FROM PART %d EPOCH %03d......"
                  % (current_part, previous_part, previous_point[0]))
            break

In [None]:
# Report the number of training samples and the number of parts they form.
print('total-train-data: %d' % len(X_train))
print('total-part: %d' % max_part)

total-train-data: 18641799
total-part: 19


In [None]:
# Train part by part: each part is a contiguous slice of at most part_size
# samples, trained for `epochs` epochs with a checkpoint every `period` epochs.

# The progress file and the per-part folders live in savedEpochs/, which does
# not exist on a fresh run.
os.makedirs('savedEpochs', exist_ok=True)

for i in range(current_part, max_part):
    start_point = i * part_size
    # Clamp the slice end to the size of the data set.
    end_point = min((i + 1) * part_size, len(X_train))
    if start_point >= end_point:
        # No samples left for this part; do not call fit on empty data.
        break

    # Record the part in progress so an interrupted run can resume from it.
    with open('savedEpochs/current_part.txt', 'w', encoding='utf8') as f:
        f.write(str(i))

    print("\n")
    print("====================================================================")
    print("=                       TRAINING PART %03d                          =" % i)
    print("====================================================================")

    os.makedirs('savedEpochs/part_%d' % i, exist_ok=True)

    # Only the resumed part starts from a saved epoch; later parts start at 0.
    if (i > current_part):
        lastEpoch = 0

    X = X_train[start_point:end_point]
    # One-hot encode the target characters of this part only.
    y = to_categorical(y_train[start_point:end_point], num_classes=vocab_size)

    # Save the model every `period` epochs into the folder of this part.
    checkpoint = ModelCheckpoint('savedEpochs/part_%d/model-epoch-{epoch:03d}.h5' % i, period=period)

    # fit model
    model.fit(X, y, epochs=epochs, initial_epoch = lastEpoch, callbacks=[checkpoint], batch_size = batch_size)

# Save the final model once all parts have been trained.
model.save('model1.h5')



=                       TRAINING PART 000                          =

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


=                       TRAINING PART 001                          =
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


=                       TRAINING PART 002                          =
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


=                       TRAINING PART 003                          =
Epoch 1/20
Epoch 2/20
