# 1. Import Library

In [1]:
import tensorflow as tf

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
import numpy as np

# 2. Mengunduh dataset shakesphere

In [3]:
!wget --no-check-certificate \
  https://huggingface.co/arnavmahapatra/gpt2-sonnet-generators/blob/main/shakespeare.txt \
  -O sonnets.txt

--2024-11-15 08:29:47--  https://huggingface.co/arnavmahapatra/gpt2-sonnet-generators/blob/main/shakespeare.txt
Resolving huggingface.co (huggingface.co)... 18.238.109.102, 18.238.109.92, 18.238.109.52, ...
Connecting to huggingface.co (huggingface.co)|18.238.109.102|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1094068 (1.0M) [text/html]
Saving to: ‘sonnets.txt’


2024-11-15 08:29:47 (4.38 MB/s) - ‘sonnets.txt’ saved [1094068/1094068]



# 3. Mendefinisikan Tokenizer dan Menyiapkan Training Data

Langkah  selanjutnya  yaitu  melakukan  proses  tokenisasi  dan  menyiapkan  data  yang  akan dilatih.  tokenisasi  adalah  proses  untuk  membagi  teks  yang  dapat  berupa  kalimat,  paragraf atau dokumen, menjadi token-token/bagian-bagian tertentu. Pada proses ini suatu kata yang ada pada  data sheet akan disimbolkan dengan angka secara acak sampai jumlah  kata yang ada  pada  datasheet  tersebut.  Pada  datasheet  ini  jumlah  katanya  yaitu  sampai  2900. Contohnya yaitu angka 1 didefinisikan untuk kata “and”, 2 untuk kata “the” dan seterusnya. Berikut adalah kodingan serta output dari proses tokenizer.

In [3]:
tokenizer = Tokenizer()

data = open('sonnets.txt').read()

corpus = data.lower().split('\n')

tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

print(tokenizer.word_index)
print(total_words)

2900


In [4]:
input_sequences = []

for line in corpus:
  token_list = tokenizer.texts_to_sequences ([line])[0]
  #print("LIST"+str(token_list))
  for i in range(1, len(token_list)):
    n_gram_sequence = token_list[:i+1]
    input_sequences.append(n_gram_sequence)


##pad sequences
max_sequence_len = max([len(seq) for seq in input_sequences])

print(max_sequence_len, total_words)

input_sequences = np.array(pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len))

#create predictors and labels
xs, labels = input_sequences[:,:-1], input_sequences[:,-1]

ys = tf.keras.utils.to_categorical (labels, num_classes=total_words)

11 2900


# 4. Mendefinisikan Arsitektur Model

Langkah selanjutnya yaitu mendefinisikan arsitektur , pada tahap ini kita menentukan berapa total kata yang ada pada artikel tersebut kemudian menentukan urutan setiap kata yang ada pada  artikel  tersebut.  Berikut  adalah  kodingan  serta  output  dari  mendefinisikan  arsitektur modelnya.

In [15]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(96)))
model.add(Dense(total_words//2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(total_words, activation='softmax'))

print(model.summary())




None


# 5. Training Data

In [16]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(xs, ys, epochs=100, verbose=1)

Epoch 1/10
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 143ms/step - accuracy: 0.0237 - loss: 7.7045
Epoch 2/10
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 145ms/step - accuracy: 0.0209 - loss: 6.4439
Epoch 3/10
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 139ms/step - accuracy: 0.0265 - loss: 6.3765
Epoch 4/10
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 138ms/step - accuracy: 0.0268 - loss: 6.3082
Epoch 5/10
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 137ms/step - accuracy: 0.0316 - loss: 6.2169
Epoch 6/10
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 139ms/step - accuracy: 0.0419 - loss: 6.0480
Epoch 7/10
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 142ms/step - accuracy: 0.0393 - loss: 5.9812
Epoch 8/10
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 147ms/step - accuracy: 0.0433 - loss: 5.9233
Epoch 9/10
[1m3

# 6. Membuat Perintah Untuk 100 Kata Selanjutnya

Langkah selanjutnya yaitu membuat perintah untuk RNN supaya dapat memprediksi 100kata selanjutnya pada artikel yang ingin dibuat berdasarkan datasheet yang sudah diunduh pada langkah sebelumnya. Pada langkah ini RNN akan dipancing untuk memprediksi kataselanjutnya berdasarkan seed textyang sudah ditentukan sebelumnya.

In [19]:
def predict_next_words(seed_text, next_words):
  for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predict_x = model.predict(token_list)
    predicted = np.argmax(predict_x,axis=1)
    #predicted model.predict_classes(token list, verbose-8)
    output_word = ""
    for word, index in tokenizer.word_index.items():
      if index == predicted:
        output_word = word
        break
    seed_text += " " + output_word

  print(seed_text)
  return seed_text

# 7. Output Artikel dengan Seed Text

In [22]:
seed_text = '1THREEPIO'
next_words = 100

generated_text = predict_next_words(seed_text, next_words)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27

In [21]:
seed_text = 'why lovest thou that which not gladly'
generated_text = predict_next_words(seed_text, next_words)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45