In [2]:
#model implemented from https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/

import tensorflow as tf

import numpy as np
import os
import time
import string
import glob


from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.models import load_model
from pickle import load
from random import randint
from keras.preprocessing.sequence import pad_sequences


# load
#path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
#print(path_to_file)
text = open('politics.txt', 'rb').read().decode(encoding='utf-8')
print(f'Length of text: {len(text)} characters')
print(text[:250])
# The unique characters in the file
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')



Length of text: 1119265 characters
Defection timed to hit tax pledge

With impeccable and precisely-calculated timing, Tory defector Robert Jackson and his new Labour bosses have attempted to overshadow Michael Howard's latest announcement on taxation and spending.

With just about ev
86 unique characters


In [3]:
#clean text

def clean_text(text):

  # replace '--' with a space ' '
  doc = text.replace('--', ' ')
	# split into tokens by white space
  tokens = doc.split()
	# remove punctuation from each token
  table = str.maketrans('', '', string.punctuation)
  tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
  tokens = [word for word in tokens if word.isalpha()]
	# make lower case
  tokens = [word.lower() for word in tokens]


  #add 'START_TOKEN' in front of the sentences




  return tokens


# clean document
tokens = clean_text(text)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))


['defection', 'timed', 'to', 'hit', 'tax', 'pledge', 'with', 'impeccable', 'and', 'preciselycalculated', 'timing', 'tory', 'defector', 'robert', 'jackson', 'and', 'his', 'new', 'labour', 'bosses', 'have', 'attempted', 'to', 'overshadow', 'michael', 'howards', 'latest', 'announcement', 'on', 'taxation', 'and', 'spending', 'with', 'just', 'about', 'everyone', 'in', 'westminster', 'now', 'working', 'towards', 'a', 'may', 'general', 'election', 'mr', 'howard', 'is', 'eager', 'to', 'map', 'out', 'some', 'clear', 'and', 'distinctive', 'policies', 'aimed', 'at', 'finally', 'shifting', 'the', 'tories', 'resolutely', 'depressing', 'poll', 'showings', 'the', 'big', 'idea', 'is', 'his', 'savings', 'on', 'waste', 'and', 'bureaucracy', 'which', 'mr', 'howard', 'has', 'pledged', 'to', 'plough', 'back', 'into', 'public', 'services', 'and', 'tax', 'cuts', 'and', 'it', 'was', 'virtually', 'certain', 'his', 'pledge', 'on', 'tax', 'cuts', 'was', 'meant', 'to', 'be', 'the', 'core', 'message', 'from', 'his

In [4]:

# organize into sequences of tokens
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
	# select sequence of tokens
	seq = tokens[i-length:i]
	# convert into a line
	line = ' '.join(seq)
	# store
	sequences.append(line)
print('Total Sequences: %d' % len(sequences))


Total Sequences: 186236


In [5]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()


# save sequences to file
out_filename = 'politics_token_clean.txt'
save_doc(sequences, out_filename)

In [8]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# load
in_filename = 'politics_token_clean.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# separate into input and output
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

# define model
model = Sequential()
model.add(Embedding(vocab_size, 300, input_length=seq_length))
model.add(LSTM(512, return_sequences=True))
model.add(LSTM(512))
model.add(Dense(512, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=50)

# save the model to file
model.save('word_rnn.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 300)           3433800   
_________________________________________________________________
lstm_2 (LSTM)                (None, 50, 512)           1665024   
_________________________________________________________________
lstm_3 (LSTM)                (None, 512)               2099200   
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_3 (Dense)              (None, 11446)             5871798   
Total params: 13,332,478
Trainable params: 13,332,478
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50

KeyboardInterrupt: ignored

In [7]:
# load
in_filename = 'politics_token_clean.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')


seq_length = len(lines[0].split()) - 1

#load model 
model=load_model('model.h5')
tokenizer = load(open('tokenizer.pkl', 'rb'))

# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
  result = list()
  in_text = seed_text
	# generate a fixed number of words
  
  for _ in range(n_words):
		# encode the text as integer
    encoded = tokenizer.texts_to_sequences([in_text])[0]
		# truncate sequences to a fixed length
    encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
		# predict probabilities for each word
    yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
    out_word = ''
    for word, index in tokenizer.word_index.items():
      if index == yhat:
        out_word = word
        break
		# append to input
    in_text += ' ' + out_word 
    result.append(out_word)
    if out_word=='ENDTOKEN':
      break;
  
  return ' '.join(result)


seed_text = lines[randint(0,len(lines))]
print(seed_text + '\n')

generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)

OSError: ignored

In [None]:
# load
in_filename = 'politics_token_clean.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')


seq_length = len(lines[0].split()) - 1

#load model 
model=load_model('model.h5')
tokenizer = load(open('tokenizer.pkl', 'rb'))

# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
  result = list()
  in_text = seed_text
	# generate a fixed number of words
  
  for _ in range(n_words):
		# encode the text as integer
    encoded = tokenizer.texts_to_sequences([in_text])[0]
		# truncate sequences to a fixed length
    encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
		# predict probabilities for each word
    yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
    out_word = ''
    for word, index in tokenizer.word_index.items():
      if index == yhat:
        out_word = word
        break
		# append to input
    in_text += ' ' + out_word 
    result.append(out_word)
    if out_word=='ENDTOKEN':
      break;
  
  return ' '.join(result)


seed_text = lines[randint(0,len(lines))]
print(seed_text + '\n')

generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)