In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

sentences = [
    'I love my dog',
    'I love my cat',
    'Is it sunny today?'
]

tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# คลังคำศัพท์ Corpus
word_index = tokenizer.word_index
print(word_index)

sequences = tokenizer.texts_to_sequences(sentences)

print(sequences)

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5, 'is': 6, 'it': 7, 'sunny': 8, 'today': 9}
[[1, 2, 3, 4], [1, 2, 3, 5], [6, 7, 8, 9]]


---

### OOV (Out-of-vocabulary Token)

In [4]:
test_data = [
    'Today is the snowy day',
    'Will it be rainy tomorrow?'
]

test_sequences = tokenizer.texts_to_sequences(test_data)
print(test_sequences)

[[9, 6], [7]]


จะเห็นว่า Token มันจะหายไป เพราะมันไม่รู้จักคำศัพท์นั้นๆ เราจึงใช้ OOV แทนจุดที่มันหายไป เพื่อคงไว้ซึ่งบริบท

In [6]:
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')

tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

test_sequences = tokenizer.texts_to_sequences(test_data)
print(test_sequences)
print(word_index)

[[10, 7, 1, 1, 1], [1, 8, 1, 1, 1]]
{'<OOV>': 1, 'i': 2, 'love': 3, 'my': 4, 'dog': 5, 'cat': 6, 'is': 7, 'it': 8, 'sunny': 9, 'today': 10}


### Padding เวลาที่เราจะเทรนมักจะต้องการให้ความยาวตัวอักษรที่ Train เท่ากัน

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'Today is a sunny day',
    'Today is a rainy day',
    'Is it sunny today?',
    'I really enjoyed walking in the snow today'
]

tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)

sequences = tokenizer.texts_to_sequences(sentences)
print('Before padded : ')
print(sequences)

# 0 ด้านหน้า
padded = pad_sequences(sequences)
print('After padded : ')
print(padded)

# 0 ด้านหลัง
padded = pad_sequences(sequences, padding='post')
print('After padded (post) : ')
print(padded)

Before padded : 
[[2, 3, 4, 5, 6], [2, 3, 4, 7, 6], [3, 8, 5, 2], [9, 10, 11, 12, 13, 14, 15, 2]]
After padded : 
[[ 0  0  0  2  3  4  5  6]
 [ 0  0  0  2  3  4  7  6]
 [ 0  0  0  0  3  8  5  2]
 [ 9 10 11 12 13 14 15  2]]
After padded (post) : 
[[ 2  3  4  5  6  0  0  0]
 [ 2  3  4  7  6  0  0  0]
 [ 3  8  5  2  0  0  0  0]
 [ 9 10 11 12 13 14 15  2]]
After padded (post, maxlen=5) : 
[[ 2  3  4  5  6]
 [ 2  3  4  7  6]
 [ 3  8  5  2  0]
 [12 13 14 15  2]]


In [14]:
# 0 ด้านหลัง และ จำกัดความยาว
padded = pad_sequences(sequences, padding='post', maxlen=5)
print('After padded (post, maxlen=5) : ')
print(padded)

# เกิดปัญหาตัดข้อมูลด้านหน้าแทน แต่เราต้องการจะตัดหลัง
padded = pad_sequences(sequences, padding='post', maxlen=5, truncating='post')
print('After padded (post, maxlen=5, truncating=post) : ')
print(padded)

After padded (post, maxlen=5) : 
[[ 2  3  4  5  6]
 [ 2  3  4  7  6]
 [ 3  8  5  2  0]
 [12 13 14 15  2]]
After padded (post, maxlen=5, truncating=post) : 
[[ 2  3  4  5  6]
 [ 2  3  4  7  6]
 [ 3  8  5  2  0]
 [ 9 10 11 12 13]]


---

### Remove the HTML with BeautifulSoup

In [15]:
from bs4 import BeautifulSoup
soup = BeautifulSoup('<p>Hello, <b>World</b></p>')
sentence = soup.get_text()
print(sentence)

Hello, World


### Remove stopword and comma วรรคตอน

In [16]:
import string
table = str.maketrans('', '', string.punctuation)
words = sentence.split()
filtered_sentence = ""
for word in words:
    word = word.translate(table)
    if word.isalpha():
        filtered_sentence += word + " "
sentences.append(filtered_sentence)

---

### IMDB Reviews

In [18]:
from bs4 import BeautifulSoup
import tensorflow_datasets as tfds
import string

table = str.maketrans('', '', string.punctuation)

stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren"]

imdb_sentences = []
train_data = tfds.as_numpy(tfds.load('imdb_reviews', split='train'))
for item in train_data:
    sentence = str(item['text'].decode('utf-8').lower())
    soup = BeautifulSoup(sentence)
    sentence = soup.get_text()
    sentence = sentence.replace(",", " , ")
    sentence = sentence.replace(".", " . ")
    sentence = sentence.replace("-", " - ")
    sentence = sentence.replace("/", " / ")
    words = sentence.split()
    filtered_sentence = ""
    for word in words:
        word = word.translate(table)
        if word not in stopwords:
            filtered_sentence += word + " "
    imdb_sentences.append(filtered_sentence)
    
tokenizer = Tokenizer(num_words=25000)
tokenizer.fit_on_texts(imdb_sentences)
sequences = tokenizer.texts_to_sequences(imdb_sentences)

print(tokenizer.word_index)

  soup = BeautifulSoup(sentence)




In [19]:
(train_data, test_data), info = tfds.load('imdb_reviews/subwords8k', split=(tfds.Split.TRAIN, tfds.Split.TEST), with_info=True, as_supervised=True)

encoder = info.features['text'].encoder
print('Vocabulary size: {}'.format(encoder.vocab_size))

print(encoder.subwords)



[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\folks\tensorflow_datasets\imdb_reviews\subwords8k\1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling C:\Users\folks\tensorflow_datasets\imdb_reviews\subwords8k\1.0.0.incompleteY3POLP\imdb_reviews-train…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling C:\Users\folks\tensorflow_datasets\imdb_reviews\subwords8k\1.0.0.incompleteY3POLP\imdb_reviews-test.…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling C:\Users\folks\tensorflow_datasets\imdb_reviews\subwords8k\1.0.0.incompleteY3POLP\imdb_reviews-unsup…



[1mDataset imdb_reviews downloaded and prepared to C:\Users\folks\tensorflow_datasets\imdb_reviews\subwords8k\1.0.0. Subsequent calls will reuse this data.[0m
Vocabulary size: 8185


---