# PROJECT FINAL #

In [14]:
# import librairy
import tensorflow as tf
import string
import requests
import numpy as np
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.utils import to_categorical
from random import randint
import re

## Extrat Data

In [15]:
# import corpus
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg as gut

#print(gut.fileids())

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/charlotteportenseigne/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [16]:
# get text
emma_text = nltk.corpus.gutenberg.raw('austen-emma.txt')
emma_text = emma_text[28:-2]

persuasion_text = nltk.corpus.gutenberg.raw('austen-persuasion.txt')
persuasion_text = persuasion_text[35:-6]

sense_text = nltk.corpus.gutenberg.raw('austen-sense.txt')
sense_text = sense_text[45:-3]

bible_text = nltk.corpus.gutenberg.raw('bible-kjv.txt')
bible_text = bible_text[24:-3]

blake_text = nltk.corpus.gutenberg.raw('blake-poems.txt')
blake_text = blake_text[33:-3]

bryant_text = nltk.corpus.gutenberg.raw('bryant-stories.txt')
bryant_text = bryant_text[61:-3]

busterbrown_text = nltk.corpus.gutenberg.raw('burgess-busterbrown.txt')
busterbrown_text = busterbrown_text[63:-4]

alice_text = nltk.corpus.gutenberg.raw('carroll-alice.txt')
alice_text = alice_text[58:]

ball_text = nltk.corpus.gutenberg.raw('chesterton-ball.txt')
ball_text = ball_text[51:-67]

brown_text = nltk.corpus.gutenberg.raw('chesterton-brown.txt')
brown_text = brown_text[56:-2]

thursday_text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
thursday_text = thursday_text[53:-2]

parents_text = nltk.corpus.gutenberg.raw('edgeworth-parents.txt')
parents_text = parents_text[50:-2]

moby_dick_text = nltk.corpus.gutenberg.raw('melville-moby_dick.txt')
moby_dick_text = moby_dick_text[41:-2]

paradise_text = nltk.corpus.gutenberg.raw('milton-paradise.txt')
paradise_text = paradise_text[41:-15]

caesar_text = nltk.corpus.gutenberg.raw('shakespeare-caesar.txt')
caesar_text = caesar_text[62:-3]

hamlet_text = nltk.corpus.gutenberg.raw('shakespeare-hamlet.txt')
hamlet_text = hamlet_text[55:]

macbeth_text = nltk.corpus.gutenberg.raw('shakespeare-macbeth.txt')
macbeth_text = macbeth_text[56:]

leaves_text = nltk.corpus.gutenberg.raw('whitman-leaves.txt')
leaves_text = leaves_text[41:-2]

## Cleaning Data + Sentences to list of words

In [17]:
#clean function
def clean_text(doc):
    tokens = doc.split()
    table = str.maketrans('', '', string.punctuation) # take off punctuation
    tokens = [w.translate(table) for w in tokens]
    tokens = [ word for word in tokens if word.isalpha()] # select only alphannumeric letter
    tokens = [word.lower() for word in tokens] # converte letter into lower case
    return tokens

In [18]:
emma_tokens = clean_text(emma_text)
persuasion_tokens = clean_text(persuasion_text)
sense_tokens = clean_text(sense_text)
bible_tokens = clean_text(bible_text) 
blake_tokens = clean_text(blake_text) 
bryant_tokens = clean_text(bryant_text) 
busterbrown_tokens = clean_text(busterbrown_text)
alice_tokens = clean_text(alice_text)
ball_tokens = clean_text(ball_text) 
brown_tokens = clean_text(brown_text)
thursday_tokens = clean_text(thursday_text)
parents_tokens = clean_text(parents_text)
moby_dick_tokens = clean_text(moby_dick_text)
paradise_tokens = clean_text(paradise_text)
caesar_tokens = clean_text(caesar_text) 
hamlet_tokens = clean_text(hamlet_text)
macbeth_tokens = clean_text(macbeth_text)
leaves_tokens = clean_text(leaves_text)
#print(emma_tokens[:50])

In [19]:
# merge lists together
gutemberg_tokens = emma_tokens + persuasion_tokens + sense_tokens + bible_tokens + blake_tokens + bryant_tokens + busterbrown_tokens + alice_tokens + ball_tokens + brown_tokens + thursday_tokens + parents_tokens + moby_dick_tokens + paradise_tokens + caesar_tokens + hamlet_tokens + macbeth_tokens + leaves_tokens 

In [20]:
# number of words
n_words = len(gutemberg_tokens)
unique_words = len(set(gutemberg_tokens))
print(f'Total words : ', n_words)
print(f'Unique words : ', unique_words)

Total words :  2102324
Unique words :  53322


## Prepare for train model ##

In [21]:
# take 50 words and predict the 51 words
lenght = 50 + 1
lines = []

for i in range(lenght, len(gutemberg_tokens)):
    seq = gutemberg_tokens[i-lenght:i]
    line = ' '.join(seq)
    lines.append(line)
    if i > 200000:  # we only do 200,000 words because of the RAM
        break


In [22]:
import pickle
# saving
with open('lines.pickle', 'wb') as handle:
    pickle.dump(lines, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
with open('lines.pickle', 'rb') as handle:
    lines = pickle.load(handle)
# print(len(lines))

## Build LSTM Model and Prepare `x` and `y`

In [23]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [24]:
# tokenize sentence
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines) # kind of words embedding into integer
sequences = tokenizer.texts_to_sequences(lines)

In [25]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [26]:
sequences = np.array(sequences)
X, y = sequences[:, :-1], sequences[:, -1] # select all the column except the last one, select only the last column


In [27]:
X[0] # vector, kind of word embedding, the digit represent a word

array([ 2415,     6,   351,     6,    43,   112,   491,   754,     3,
         960,    22,     5,   528,   157,     3,   183,   777,   139,
           2,  3862,    94,     4,     1,   242,  2162,     4,  2759,
           3,    14,   585,   919, 10474,   274,    11,     1,   269,
          22,    24,    70,     2,   817,    53, 10472,     8,    10,
           7,     1,  1949,     4,     1])

In [28]:
# index for each word
tokenizer.word_index 

{'the': 1,
 'to': 2,
 'and': 3,
 'of': 4,
 'a': 5,
 'i': 6,
 'was': 7,
 'her': 8,
 'it': 9,
 'she': 10,
 'in': 11,
 'not': 12,
 'be': 13,
 'had': 14,
 'he': 15,
 'that': 16,
 'you': 17,
 'as': 18,
 'for': 19,
 'have': 20,
 'but': 21,
 'with': 22,
 'his': 23,
 'very': 24,
 'is': 25,
 'at': 26,
 'mr': 27,
 'so': 28,
 'all': 29,
 'could': 30,
 'been': 31,
 'would': 32,
 'him': 33,
 'on': 34,
 'no': 35,
 'were': 36,
 'mrs': 37,
 'by': 38,
 'my': 39,
 'they': 40,
 'any': 41,
 'which': 42,
 'emma': 43,
 'do': 44,
 'from': 45,
 'must': 46,
 'there': 47,
 'miss': 48,
 'this': 49,
 'will': 50,
 'more': 51,
 'me': 52,
 'or': 53,
 'what': 54,
 'an': 55,
 'much': 56,
 'such': 57,
 'them': 58,
 'if': 59,
 'than': 60,
 'said': 61,
 'one': 62,
 'are': 63,
 'being': 64,
 'every': 65,
 'when': 66,
 'their': 67,
 'am': 68,
 'should': 69,
 'little': 70,
 'think': 71,
 'only': 72,
 'never': 73,
 'well': 74,
 'we': 75,
 'did': 76,
 'how': 77,
 'might': 78,
 'thing': 79,
 'your': 80,
 'who': 81,
 'own': 82,

In [29]:
vocab_size = len(tokenizer.word_index) +1

In [30]:
y = to_categorical(y, num_classes=vocab_size)

In [31]:
seq_length = X.shape[1]

In [32]:
# saving
import pickle
with open('seq_length.pickle', 'wb') as handle:
    pickle.dump(seq_length, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
with open('seq_length.pickle', 'rb') as handle:
    seq_length = pickle.load(handle)

## LSTM Model ##

In [33]:
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(100, activation = 'relu'))
model.add(Dense(vocab_size, activation = 'softmax'))
#model.compile(loss='categorical_crossentropy', optimizer='adam')

In [34]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            523750    
_________________________________________________________________
lstm (LSTM)                  (None, 50, 100)           60400     
_________________________________________________________________
dropout (Dropout)            (None, 50, 100)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 10475)             1

In [35]:
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])

In [55]:
# train
model.fit(X, y, batch_size = 100, epochs = 4) # We can change the batch and the epoch
# epochs 25 takes 2h30, for /199951 and 100 epochs 0,46 accurancy

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7ff111d70fd0>

In [None]:
# batch_size = 100, epochs = 4 - accuracy 0.1279, time: 22min 40
# batch_size = 100, epochs = 5 - accuracy 0.1362 , time:32min 11
# batch_size = 150, epochs = 5 - accuracy 0.1278 , time:30min 28
# bach_size = 50, epochs = 5 - accuracy 0.1255, time: 40m
# batch_size = 100, epochs = 10 - accuracy: 0.1565 - 59min
# batch_size = 200, epochs = 10 - accuracy: 0.1440 - 59min
# batch_size = 250, epochs = 20 - accuracy: 0.1758 - 95min
# batch_size = 100, epochs = 20 - accuracy: 0.1721 - 117min

In [60]:
#import joblib
from keras.models import load_model

#joblib.dump(gutemberg_tokens, "texts_model.joblib")
model.save("prediction_model.h5")

## Predict text ##

In [57]:
# function to predict text
def generate_text_seq(model, tokenizer, text_seq_length, seed_text, n_words):
    text = []

    for _ in range(n_words): # how many words you want to generate
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        encoded = pad_sequences([encoded], maxlen = text_seq_length, truncating = 'pre')

        y_predict = model.predict_classes(encoded) # return the interger value of the word

        predicted_word = ''
        for word, index in tokenizer.word_index.items(): # find the word by the integer word
            if index == y_predict:
                predicted_word = word
                break
        seed_text = seed_text + ' ' + predicted_word
        text.append(predicted_word)
    return ' '.join(text)

In [58]:
# predict sentence
# generate_text_seq(model, tokenizer, seq_length, seed_text = lines[12343], n_words)
generate_text_seq(model, tokenizer, seq_length, lines[0], 3)

'days of the'

## Test the model ##

In [61]:
lines[3]

'i emma woodhouse handsome clever and rich with a comfortable home and happy disposition seemed to unite some of the best blessings of existence and had lived nearly twentyone years in the world with very little to distress or vex her she was the youngest of the two daughters of a'

In [62]:
# compare part of speech
import spacy
nlp = spacy.load("en_core_web_sm")

doc = nlp(lines[3][-14:])
for token in doc:
    POS = token.pos_
    print(token.text, token.pos_)

# predict text
pred_doc = nlp(generate_text_seq(model, tokenizer, seq_length, lines[0], 3))
for token in pred_doc:
    pred_POS = token.pos_
    print(token.text, token.pos_)

if POS == pred_POS:
    print("They have the same Part Of Speech")
else:
    print("They have different Part Of Speech")

daughters NOUN
of ADP
a PRON
days NOUN
of ADP
the PRON
They have the same Part Of Speech


## Visualization of the prediction ##

In [63]:
# visualization of the actual sentence and the sentence with the predicted words
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
# sentence with the predicted words
pred_text = lines[0]+ ' ' + generate_text_seq(model, tokenizer, seq_length, lines[0], 3)
pred_doc = nlp(pred_text)
pred_sentence_spans = list(pred_doc.sents)
displacy.render(pred_sentence_spans, style="dep")

# actual sentence in the text
text = lines[0] + ' ' + lines[3][-14:]
doc = nlp(text)
sentence_spans = list(doc.sents)
displacy.render(sentence_spans, style="dep")