In [1]:
#load data into memory
def load_data(filename):
    """Read an entire text file into memory and return its contents.

    Args:
        filename: Path of the text file to read.

    Returns:
        The full contents of the file as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which the previous open()/close() pair did not guarantee.
    with open(filename, 'r') as file:
        return file.read()

In [2]:
# Load the cleaned text of the Republic into memory as a single string.
input_filename="republic_clean.txt"
doc = load_data(input_filename)
# Sanity check: display the first 200 characters of the loaded text.
doc[:200]

'BOOK I. The Republic opens with a truly Greek scene--a festival in\nhonour of the goddess Bendis which is held in the Piraeus; to this is\nadded the promise of an equestrian torch-race in the evening. T'

In [3]:
#cleaning data
import string
#cleaning document to clean tokens
def clean_data(data):
    """Turn raw document text into a list of lower-case alphabetic tokens.

    Carriage returns, newlines and '--' dashes are treated as word
    separators. Punctuation is stripped from inside each token (so
    "torch-race" becomes "torchrace"), and any token that is not purely
    alphabetic after stripping (numbers, empty strings, ...) is dropped.

    Args:
        data: The raw text as a single string.

    Returns:
        A list of lower-cased word tokens in document order.
    """
    # Replace each separator with a space (not an empty string) so the
    # words on either side of it stay separate tokens.
    for separator in ('\r', '\n', '--'):
        data = data.replace(separator, ' ')
    # Build the punctuation-removal table once instead of once per token.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stripped = (word.translate(strip_punctuation) for word in data.split())
    # Keep only purely alphabetic tokens, normalised to lower case.
    return [word.lower() for word in stripped if word.isalpha()]

In [4]:
# Tokenize the loaded document into clean lower-case word tokens.
tokens = clean_data(doc)
# Report corpus size: total token count and number of distinct tokens.
print('Total tokens %d'% len(tokens))
print('Unique tokens %d'% len(set(tokens)))
# Display the first 200 tokens as a spot check of the cleaning step.
tokens[:200]

Total tokens 210245
Unique tokens 10270


['book',
 'i',
 'the',
 'republic',
 'opens',
 'with',
 'a',
 'truly',
 'greek',
 'scene',
 'a',
 'festival',
 'in',
 'honour',
 'of',
 'the',
 'goddess',
 'bendis',
 'which',
 'is',
 'held',
 'in',
 'the',
 'piraeus',
 'to',
 'this',
 'is',
 'added',
 'the',
 'promise',
 'of',
 'an',
 'equestrian',
 'torchrace',
 'in',
 'the',
 'evening',
 'the',
 'whole',
 'work',
 'is',
 'supposed',
 'to',
 'be',
 'recited',
 'by',
 'socrates',
 'on',
 'the',
 'day',
 'after',
 'the',
 'festival',
 'to',
 'a',
 'small',
 'party',
 'consisting',
 'of',
 'critias',
 'timaeus',
 'hermocrates',
 'and',
 'another',
 'this',
 'we',
 'learn',
 'from',
 'the',
 'first',
 'words',
 'of',
 'the',
 'timaeus',
 'when',
 'the',
 'rhetorical',
 'advantage',
 'of',
 'reciting',
 'the',
 'dialogue',
 'has',
 'been',
 'gained',
 'the',
 'attention',
 'is',
 'not',
 'distracted',
 'by',
 'any',
 'reference',
 'to',
 'the',
 'audience',
 'nor',
 'is',
 'the',
 'reader',
 'further',
 'reminded',
 'of',
 'the',
 'extrao

In [5]:
# Slide a fixed-size window over the tokens, advancing one token at a time.
length = 50 + 1  # 50 context words plus the 1 word to be predicted
# Possible extension: build another model on self-contained sentences using
# truncation and padding, or on non-overlapping windows (0-50, 51-101, ...).
# NOTE(review): range(length, len(tokens)) stops one short, so the window
# ending at the very last token is never produced -- confirm whether
# range(length, len(tokens) + 1) was intended.
sequences = [
    ' '.join(tokens[end - length:end])  # each window is stored as one line
    for end in range(length, len(tokens))
]
print('Total sequences %d' % len(sequences))

Total sequences 210194


In [6]:
# Spot check: the first window, stored as one space-joined line of words.
sequences[0]

'book i the republic opens with a truly greek scene a festival in honour of the goddess bendis which is held in the piraeus to this is added the promise of an equestrian torchrace in the evening the whole work is supposed to be recited by socrates on the day after'

In [11]:
# Each sequence is a string, so this is its length in characters,
# not the number of words it contains.
len(sequences[0])

263

In [7]:
#saving the sequences to a file for re-use
def save_doc(lines, filename):
    """Write the given lines to a text file, one per line.

    Any existing file at ``filename`` is overwritten. No trailing newline
    is written after the last line.

    Args:
        lines: Iterable of strings to store.
        filename: Path of the file to write.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    doc = '\n'.join(lines)
    # The context manager closes the handle even when write() raises.
    with open(filename, 'w') as file:
        file.write(doc)
# Persist the windowed sequences, one per line, so later runs can reuse them.
output_filename='republic_sequences.txt'
save_doc(sequences,output_filename)

In [8]:
#Training our language model
#We are using Neural language model
from pickle import dump
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding

In [9]:
# Integer-encode the text sequences.
tokenizer = Tokenizer()
# Fit the tokenizer on the sequence lines; its word_index is read below
# to derive the vocabulary size.
tokenizer.fit_on_texts(sequences)
# `sequence` (singular) holds the integer-encoded form of `sequences`.
sequence = tokenizer.texts_to_sequences(sequences)

In [13]:
# Vocabulary size: number of distinct words known to the tokenizer plus one.
# NOTE(review): the +1 presumably accounts for Keras word indices starting
# at 1 (index 0 unused) -- confirm against the Tokenizer documentation.
vocab_size = len(tokenizer.word_index)+1

In [16]:
# Separate each encoded sequence into input (all but the last word)
# and output (the last word, i.e. the word to predict).
import numpy as np
sequence = np.asarray(sequence)
x,y = sequence[:,:-1], sequence[:,-1]
# One-hot encode the target word ids into vocab_size columns.
# NOTE(review): with the ~210k samples and vocab_size of 10271 shown in this
# notebook's outputs, the dense one-hot matrix is very large; integer labels
# with a sparse categorical loss may be worth considering -- confirm memory budget.
y = to_categorical(y, num_classes = vocab_size) #converting numerical single column data into a one-hot encoder
seq_length = x.shape[1] #defining sequence length parameter with the number of columns

In [19]:
# Sanity check: number of input words per sample, then the overall
# (samples, words) shape of the input array.
print(len(x[0]))
x.shape

50


(210194, 50)

In [20]:
#define model
model = Sequential()
# Map each word id to a 50-dimensional vector; inputs are seq_length words long.
model.add(Embedding(vocab_size,50,input_length=seq_length))
# Two stacked LSTM layers of 100 units; the first returns the full sequence
# so the second LSTM receives one vector per time step.
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
# Softmax over the whole vocabulary: one probability per candidate next word.
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            513550    
_________________________________________________________________
lstm (LSTM)                  (None, 50, 100)           60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 10271)             1037371   
Total params: 1,701,821
Trainable params: 1,701,821
Non-trainable params: 0
_________________________________________________________________
None


In [23]:
# Compile: categorical cross-entropy against the one-hot targets, Adam
# optimizer, and accuracy reported during training.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Train the model.
# NOTE(review): the recorded run of this cell ends in a KeyboardInterrupt
# during epoch 14 of 100, so a model saved afterwards from that session is
# only partially trained.
model.fit(
    x,
    y,
    batch_size=128,
    epochs=100,
)

Train on 210194 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100

KeyboardInterrupt: 

In [None]:
#save the model to a file. This can be used later
# The filename must be a string literal; the unquoted language_model.h5
# raised NameError because no variable named language_model exists.
model.save('language_model.h5')
# Save the fitted tokenizer as well, since the same word-to-integer mapping
# is needed to encode text for the saved model. The context manager closes
# the pickle file once the dump completes.
with open('tokenizer.pkl','wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)