In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from bs4 import BeautifulSoup
import requests

## Using `Tokenizer` class

* ### Get some data

In [5]:
# Define url
url = "https://madridnyc.es/blogs-interesantes-e-influyentes-espanol/"

# Define headers for requests
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}


In [6]:
# Make a request
r = requests.get(url, headers = headers)

# Create a beutiful soup
soup = BeautifulSoup(r.content)

* ### Transform the data

In [7]:
# Get all HTML <p>'s
some_texts = soup.find_all('p')

In [8]:
# Clean the text - remove HTML tags
training_text = []

for text in some_texts:
    training_text.append(text.get_text())

* ### Tokenize

In [9]:
# Instantiate a tokenizer
tokenizer = Tokenizer(num_words = 3000, oov_token = "<OOV>")

In [10]:
# Fit the tokenizer
tokenizer.fit_on_texts(training_text)

In [11]:
# How many tokens do we have?
tokens = tokenizer.word_index
len(tokens)

1114

* ### Let's see how it works on test data!

In [12]:
# Define some test data
new_text = ['Estoy muy cansado, pero no se porque', 
            'No tenemos mucho tiempo', 
            'grandes o pequeños?', 
            'Quiero alcanzar algo increíble contigo!']

In [13]:
# Transform texts to sequences of numbers
sequences = tokenizer.texts_to_sequences(training_text)

In [14]:
# Examine the transformed output - it should be a list of numbers (tokens)
for i in sequences[1]:
    print(i, end = ' ')

371 64 81 2 35 82 3 102 2 58 6 59 31 83 372 1 204 32 373 2 1 37 134 60 24 3 38 205 61 374 84 375 6 376 377 1 135 101 1 378 379 35 27 136 

In [15]:
# Produce a test sequence
test_seq = tokenizer.texts_to_sequences(new_text)

In [16]:
# Examine the test seq
test_seq

[[1114, 32, 1114, 60, 59, 23, 1114],
 [59, 1114, 40, 202],
 [323, 61, 501],
 [1114, 912, 740, 1114, 1114]]

In [17]:
# Translate the test seq back to words
for i, sentence in enumerate(test_seq):
    print(f'Case {i + 1}')
    print(f'Original sentence: {new_text[i]}')
    print('Recreated sentence: ', end = '')
    for word in sentence:
        print(list(tokens.keys())[word - 1], end = ' ')
    print('\n')

Case 1
Original sentence: Estoy muy cansado, pero no se porque
Recreated sentence: <OOV> muy <OOV> pero no se <OOV> 

Case 2
Original sentence: No tenemos mucho tiempo
Recreated sentence: no <OOV> mucho tiempo 

Case 3
Original sentence: grandes o pequeños?
Recreated sentence: grandes o pequeños 

Case 4
Original sentence: Quiero alcanzar algo increíble contigo!
Recreated sentence: <OOV> alcanzar algo <OOV> <OOV> 



In [18]:
# As the texts come from an article about the best Spanish blogs, 
# which is written in not very personal tone, it's not surprising
# that first-person verb forms are missing in our dict

* ### Using padding 

In [21]:
# Padding adds zeros in front of the shorter sequences, so they become as long as the longest seq
pad_sequences(test_seq)

array([[1114,   32, 1114,   60,   59,   23, 1114],
       [   0,    0,    0,   59, 1114,   40,  202],
       [   0,    0,    0,    0,  323,   61,  501],
       [   0,    0, 1114,  912,  740, 1114, 1114]])

In [22]:
padded_test = pad_sequences(test_seq)

In [24]:
# Padding does not affect back-translation to words
for i, sentence in enumerate(padded_test):
    print(f'Case {i + 1}')
    print(f'Original sentence: {new_text[i]}')
    print('Recreated sentence: ', end = '')
    for word in sentence:
        print(list(tokens.keys())[word - 1], end = ' ')
    print('\n')

Case 1
Original sentence: Estoy muy cansado, pero no se porque
Recreated sentence: <OOV> muy <OOV> pero no se <OOV> 

Case 2
Original sentence: No tenemos mucho tiempo
Recreated sentence: <OOV> <OOV> <OOV> no <OOV> mucho tiempo 

Case 3
Original sentence: grandes o pequeños?
Recreated sentence: <OOV> <OOV> <OOV> <OOV> grandes o pequeños 

Case 4
Original sentence: Quiero alcanzar algo increíble contigo!
Recreated sentence: <OOV> <OOV> <OOV> alcanzar algo <OOV> <OOV> 



* ### Other parameters

#### `padding = post` & other options

In [27]:
# You can also add zeros at the end of your sentences
pad_sequences(test_seq, padding = 'post')

array([[1114,   32, 1114,   60,   59,   23, 1114],
       [  59, 1114,   40,  202,    0,    0,    0],
       [ 323,   61,  501,    0,    0,    0,    0],
       [1114,  912,  740, 1114, 1114,    0,    0]])

In [29]:
# You can also specify maximum length of a sequence
pad_sequences(test_seq, padding = 'post', maxlen = 10)

array([[1114,   32, 1114,   60,   59,   23, 1114,    0,    0,    0],
       [  59, 1114,   40,  202,    0,    0,    0,    0,    0,    0],
       [ 323,   61,  501,    0,    0,    0,    0,    0,    0,    0],
       [1114,  912,  740, 1114, 1114,    0,    0,    0,    0,    0]])

In [33]:
# ...and specify where the sentence should be truncated if it's longer than `maxlen`
pad_sequences(test_seq, padding = 'post', maxlen = 3, truncating = 'post')

array([[1114,   32, 1114],
       [  59, 1114,   40],
       [ 323,   61,  501],
       [1114,  912,  740]])