In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

In [4]:
data = """In the town of Athy one Jeremy Lanigan
Battered away 'til he hadn't a pound.
His father he died and made him a man again
Left him a farm and ten acres of ground.
He gave a grand party to friends and relations
Who didn't forget him when it comes to the will,
If you'll but listen I'll make your eyes glisten
Of the rows and the ructions of Lanigan's Ball.
Six long months I spent in Dublin,
Six long months doing nothing at all.
Six long months I spent in Dublin,
Learning to dance for Lanigan's ball."""

In [5]:
print(data)

In the town of Athy one Jeremy Lanigan
Battered away 'til he hadn't a pound.
His father he died and made him a man again
Left him a farm and ten acres of ground.
He gave a grand party to friends and relations
Who didn't forget him when it comes to the will,
If you'll but listen I'll make your eyes glisten
Of the rows and the ructions of Lanigan's Ball.
Six long months I spent in Dublin,
Six long months doing nothing at all.
Six long months I spent in Dublin,
Learning to dance for Lanigan's ball.


In [6]:
tokenizer = Tokenizer()

corpus = data.lower().split("\n")

tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

### ทำให้เป็นชุดของลำดับย่อย

In [7]:
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    print(token_list)
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

print(input_sequences[:5])

[5, 1, 17, 2, 18, 19, 20, 21]
[22, 23, 24, 6, 25, 3, 26]
[27, 28, 6, 29, 4, 30, 7, 3, 31, 32]
[33, 7, 3, 34, 4, 35, 36, 2, 37]
[6, 38, 3, 39, 40, 8, 41, 4, 42]
[43, 44, 45, 7, 46, 47, 48, 8, 1, 49]
[50, 51, 52, 53, 54, 55, 56, 57, 58]
[2, 1, 59, 4, 1, 60, 2, 12, 13]
[9, 10, 11, 14, 15, 5, 16]
[9, 10, 11, 61, 62, 63, 64]
[9, 10, 11, 14, 15, 5, 16]
[65, 8, 66, 67, 12, 13]
[[5, 1], [5, 1, 17], [5, 1, 17, 2], [5, 1, 17, 2, 18], [5, 1, 17, 2, 18, 19]]


### Prepadding ข้อมูล

In [8]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_sequence_len = max([len(x) for x in input_sequences])

input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

### Input sequences and label

In [9]:
xs, labels = input_sequences[:,:-1],input_sequences[:,-1]

### Label to be the categorical

In [10]:
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

### Model

In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential

model = Sequential()
model.add(tf.keras.layers.Embedding(total_words, 8))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(max_sequence_len-1)))
model.add(tf.keras.layers.Dense(total_words, activation='softmax'))

In [12]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
history = model.fit(xs, ys, epochs=1500, verbose=1)

Epoch 1/1500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - accuracy: 0.0000e+00 - loss: 4.2192
Epoch 2/1500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0155 - loss: 4.2162     
Epoch 3/1500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0331 - loss: 4.2137 
Epoch 4/1500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.0292 - loss: 4.2114 
Epoch 5/1500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0331 - loss: 4.2104 
Epoch 6/1500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0467 - loss: 4.2075 
Epoch 7/1500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.0370 - loss: 4.2054 
Epoch 8/1500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0253 - loss: 4.2005     
Epoch 9/1500
[1m3/3[0m [32m━━━━━━

Predict next word

In [14]:
seed_text = "in the town of athy"

token_list = tokenizer.texts_to_sequences([seed_text])[0]
token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

predicted = np.argmax(model.predict(token_list), axis=-1)
print(predicted)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 237ms/step
[19]


In [15]:
for word, index in tokenizer.word_index.items():
    if index == predicted:
        print(word)
        break

one


In [17]:
seed_text = "sweet jeremy saw dublin"

token_list = tokenizer.texts_to_sequences([seed_text])[0]
token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

predicted = np.argmax(model.predict(token_list), axis=-1)
print(predicted)

for word, index in tokenizer.word_index.items():
    if index == predicted:
        print(word)
        break

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[46]
when


### ต่อไปแต่งเป็นประโยค ทำซ้ำเพื่อเดาไปเรื่อยๆ

In [18]:
seed_text = "sweet jeremy saw dublin"
next_words = 10

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""

    for word , index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break

    seed_text += " " + output_word

print(seed_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
sweet jeremy saw dublin when when of of lanigan's ball eyes ball eyes ball


---

In [19]:
data = open('irish-lyrics-eof.txt').read()
corpus = data.lower().split("\n")

tokenizer = Tokenizer()

tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

In [20]:
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    print(token_list)
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

print(input_sequences[:5])

[51, 12, 96, 1217, 48, 2, 69]
[2, 11, 15, 31, 361, 8, 24, 1218]
[272, 798, 2, 204, 24, 579, 69]
[118, 35, 119, 799, 56, 24, 184]
[10, 184, 25, 23, 5, 580, 456]
[2, 184, 800, 12, 801, 4, 7, 235]
[1219, 17, 12, 75, 1220, 236, 17, 12, 75, 802]
[184, 800, 12, 801, 4, 7, 235]
[152, 3, 2, 5, 803, 6, 184]
[3, 362, 25, 49, 83, 457]
[52, 108, 5, 1221, 804]
[106, 805, 4, 806, 7, 164]
[2, 309, 7, 803, 6, 184, 56]
[1, 804, 141, 4, 9, 5, 102]
[5, 102, 15, 49, 83, 457]
[33, 141, 25, 4, 9, 4, 204, 9, 1222]
[6, 21, 33, 309, 7, 184, 56]
[458, 7, 581, 2, 363, 1223, 142]
[12, 88, 1, 65]
[1224, 807, 310, 26, 808, 142]
[364, 1, 809, 459, 31, 1225]
[311, 2, 1226, 8, 582, 583]
[3, 7, 460, 1227, 810]
[109, 1, 165, 18, 584, 23, 810]
[109, 1, 461, 153, 23, 583]
[92, 185, 811, 585, 812]
[1228, 6, 154, 1229]
[1230, 5, 365, 2, 1231, 813]
[273, 3, 814, 5, 586, 312]
[7, 74, 366, 120, 587, 186, 143]
[205, 48, 367, 368, 23, 84]
[369, 7, 1232, 6, 22, 120, 1233]
[274, 1, 815, 6, 7, 313]
[1234, 5, 1235, 588, 23, 462]
[37

In [21]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_sequence_len = max([len(x) for x in input_sequences])

input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

xs, labels = input_sequences[:,:-1],input_sequences[:,-1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [23]:
model = Sequential()

model.add(tf.keras.layers.Embedding(total_words, 8 ,input_length=max_sequence_len-1))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(max_sequence_len-1)))
model.add(tf.keras.layers.Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(xs, ys, epochs=1000, verbose=1)

Epoch 1/1000




[1m377/377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.0591 - loss: 7.2030
Epoch 2/1000
[1m377/377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0662 - loss: 6.4054
Epoch 3/1000
[1m377/377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0652 - loss: 6.2828
Epoch 4/1000
[1m377/377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0636 - loss: 6.2451
Epoch 5/1000
[1m377/377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0668 - loss: 6.2039
Epoch 6/1000
[1m377/377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0661 - loss: 6.1706
Epoch 7/1000
[1m377/377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0724 - loss: 6.0481
Epoch 8/1000
[1m377/377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0751 - loss: 6.0354
Epoch 9/1000
[1m377/377[0m [32m━━━

KeyboardInterrupt: 

### Windowing ทำให้ได้ข้อมูลมากขึ้น เป็น window ไป

In [None]:
window_size = 10
sentences = []
alltext = []
data = open('irish-lyrics-eof.txt').read()
corpus = data.lower()
words = corpus.split(" ")

range_size = len(words) - max_sequence_len
for i in range(0, range_size):
    thissentence = ""
    for word in range(0, window_size-1):
        word = words[i+word]
        thissentence += thissentence + word
        thissentence += thissentence + " "
    sentences.append(thissentence)