In [None]:
!pip install tensorflow tensorflowjs datasets

Collecting tensorflowjs
  Downloading tensorflowjs-4.20.0-py3-none-any.whl (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.1/89.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-decision-forests>=1.5.0 (from tensorflowjs)
  Downloading tensorflow_decision_forests-1.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
Collecting packaging (from tensorflow)
  Downloading packaging-23.2-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp3

In [None]:
from datasets import load_dataset

# Load the RACE dataset, requesting its "all" configuration
dataset = load_dataset("race", "all")

In [None]:
import tqdm.notebook as tq
from tqdm.notebook import tqdm
import pandas as pd

def create_dataset(dataset_split, num_samples=1000):
    """Flatten the first ``num_samples`` examples of a split into a DataFrame.

    Args:
        dataset_split: Indexable, sized collection of examples. Each example
            must provide the keys ``'article'``, ``'question'``, ``'options'``
            (a sequence of candidate answers) and ``'answer'`` (a single
            letter, where ``'A'`` designates the first option).
        num_samples: Maximum number of examples to convert. ``None`` converts
            the whole split.

    Returns:
        A ``pandas.DataFrame`` with the columns ``context``, ``question``,
        ``correct``, ``incorrect1``, ``incorrect2`` and ``incorrect3``. The
        columns are present even when no example was converted.

    Raises:
        IndexError: If the answer letter does not designate one of the
            options, or if fewer than three options remain as distractors.
    """
    columns = ['context', 'question', 'correct',
               'incorrect1', 'incorrect2', 'incorrect3']

    total = len(dataset_split)
    limit = total if num_samples is None else min(total, num_samples)

    data_rows = []
    for i in tqdm(range(limit)):
        # Fetch the example once instead of re-indexing the split per field.
        example = dataset_split[i]

        # Work on a copy so the caller's option list is never modified.
        options = list(example['options'])
        correct_answer_index = ord(example['answer']) - ord('A')
        # A negative index would silently select an option from the end.
        if not 0 <= correct_answer_index < len(options):
            raise IndexError(
                f"answer {example['answer']!r} does not match any of the "
                f"{len(options)} options of example {i}"
            )

        curr_correct = options.pop(correct_answer_index)

        data_rows.append({
            'context': example['article'],
            'question': example['question'],
            'correct': curr_correct,
            'incorrect1': options[0],
            'incorrect2': options[1],
            'incorrect3': options[2],
        })

    return pd.DataFrame(data_rows, columns=columns)

# Build DataFrames from the RACE dataset, keeping at most 1000 examples each.
# Note: the "train" frame is built from the validation split.
race_train_df, race_test_df = (
    create_dataset(dataset[split_name], num_samples=1000)
    for split_name in ('validation', 'test')
)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
# Rebuild both frames, this time relying on create_dataset's default num_samples.
race_train_df, race_test_df = map(
    create_dataset, (dataset['validation'], dataset['test'])
)

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, TimeDistributed
from tensorflow.keras.models import Model

# Hyperparameters (all kept deliberately small)
VOCAB_SIZE = 1000  # reduced vocabulary size
EMBEDDING_DIM = 32  # reduced embedding dimension
LSTM_UNITS = 32  # reduced number of LSTM units
MAX_LEN = 4  # reduced maximum sequence length

# Shared trunk: token ids -> embeddings -> one LSTM state per time step
input_layer = Input(shape=(MAX_LEN,))
embedding_layer = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM)(input_layer)
lstm_layer = LSTM(LSTM_UNITS, return_sequences=True)(embedding_layer)


def _vocab_head(name):
    """Attach a per-time-step softmax over the vocabulary to the shared LSTM output."""
    return TimeDistributed(Dense(VOCAB_SIZE, activation='softmax'), name=name)(lstm_layer)


# One head per generated field, created in the same order as the model outputs
question_output = _vocab_head('question_output')
answer_output = _vocab_head('answer_output')
distractor1_output = _vocab_head('distractor1_output')
distractor2_output = _vocab_head('distractor2_output')
distractor3_output = _vocab_head('distractor3_output')

# Assemble the five-output model
model = Model(
    inputs=input_layer,
    outputs=[
        question_output,
        answer_output,
        distractor1_output,
        distractor2_output,
        distractor3_output,
    ],
)

# Same loss for every head; one accuracy metric list per output
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=[['accuracy'] for _ in range(5)],
)

model.summary()


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Build the tokenizer; its vocabulary is fitted on the article text only
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(race_train_df['context'])


def _encode_column(column):
    """Tokenize one text column of race_train_df.

    Returns the raw id sequences together with the same sequences
    post-padded and post-truncated to MAX_LEN.
    """
    ids = tokenizer.texts_to_sequences(race_train_df[column])
    return ids, pad_sequences(ids, maxlen=MAX_LEN, padding='post', truncating='post')


# Model input: the tokenized, padded articles
train_sequences, train_padded = _encode_column('context')

# Targets: question, correct answer and the three distractors
question_sequences, question_padded = _encode_column('question')
answer_sequences, answer_padded = _encode_column('correct')
distractor1_sequences, distractor1_padded = _encode_column('incorrect1')
distractor2_sequences, distractor2_padded = _encode_column('incorrect2')
distractor3_sequences, distractor3_padded = _encode_column('incorrect3')

# Append a trailing axis to each target, the shape passed to sparse_categorical_crossentropy here
question_padded = question_padded[..., np.newaxis]
answer_padded = answer_padded[..., np.newaxis]
distractor1_padded = distractor1_padded[..., np.newaxis]
distractor2_padded = distractor2_padded[..., np.newaxis]
distractor3_padded = distractor3_padded[..., np.newaxis]

# Train the model; targets are keyed by the output layer names
training_targets = {
    'question_output': question_padded,
    'answer_output': answer_padded,
    'distractor1_output': distractor1_padded,
    'distractor2_output': distractor2_padded,
    'distractor3_output': distractor3_padded,
}
model.fit(train_padded, training_targets, epochs=3, batch_size=16)


Epoch 1/3
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - answer_output_accuracy: 0.1065 - distractor1_output_accuracy: 0.1713 - distractor2_output_accuracy: 0.1618 - distractor3_output_accuracy: 0.1425 - loss: 34.2818 - question_output_accuracy: 0.1078
Epoch 2/3
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - answer_output_accuracy: 0.2390 - distractor1_output_accuracy: 0.2251 - distractor2_output_accuracy: 0.2369 - distractor3_output_accuracy: 0.2262 - loss: 27.0092 - question_output_accuracy: 0.2164
Epoch 3/3
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - answer_output_accuracy: 0.2412 - distractor1_output_accuracy: 0.2224 - distractor2_output_accuracy: 0.2335 - distractor3_output_accuracy: 0.2341 - loss: 23.0539 - question_output_accuracy: 0.2120


<keras.src.callbacks.history.History at 0x7f05d060c310>

In [None]:
def _declared_input_length(model):
    """Return the fixed sequence length of the model's (first) input, or None if unknown."""
    shape = getattr(model, 'input_shape', None)
    if isinstance(shape, list):  # multi-input models expose a list of shapes
        shape = shape[0] if shape else None
    if isinstance(shape, tuple) and len(shape) >= 2 and isinstance(shape[1], int):
        return shape[1]
    return None


def predict_from_paragraph(paragraph, tokenizer, model, max_len=None):
    """Generate a question, its answer and three distractors for one paragraph.

    Args:
        paragraph: Raw text to encode.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``sequences_to_texts``.
        model: Model whose ``predict`` returns five arrays, in the order
            question, answer, distractor 1, distractor 2, distractor 3, each
            holding per-position vocabulary scores on its last axis.
        max_len: Length the encoded paragraph is padded/truncated to. When
            ``None`` the length declared by ``model.input_shape`` is used so
            the batch always matches the model; if the model declares no
            fixed length, 24 (the former default) is used.

    Returns:
        A dict with the keys ``'question'``, ``'answer'``, ``'distractor1'``,
        ``'distractor2'`` and ``'distractor3'`` mapping to the decoded text.
    """
    if max_len is None:
        declared = _declared_input_length(model)
        max_len = declared if declared is not None else 24

    # Tokenize and pad the new paragraph
    sequence = tokenizer.texts_to_sequences([paragraph])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')

    # Run the model; one score array per output head
    predictions = model.predict(padded_sequence)

    output_keys = ('question', 'answer', 'distractor1', 'distractor2', 'distractor3')
    decoded = {}
    for position, key in enumerate(output_keys):
        # Most probable token id at every time step, shape (1, max_len)
        token_ids = np.argmax(predictions[position], axis=-1)
        # Id 0 is the padding value, not a word: drop it so padding is not
        # rendered as text (e.g. as the OOV token) in the decoded output.
        word_ids = [[int(t) for t in row if t != 0] for row in token_ids]
        decoded[key] = tokenizer.sequences_to_texts(word_ids)[0]

    return decoded

In [None]:
# New paragraph to run through the model
new_paragraph = "Last week I talked with some of my students about what they wanted to do after they graduated, and what kind of job prospects  they thought they had.\nGiven that I teach students who are training to be doctors, I was surprised do find that most thought that they would not be able to get the jobs"

# Predict all five outputs for the new paragraph, padded to the model's MAX_LEN
predictions = predict_from_paragraph(new_paragraph, tokenizer, model, max_len=MAX_LEN)

# Show the predictions, one labelled line per field
for label, field in (
    ("Generated Question:", 'question'),
    ("Correct Answer:", 'answer'),
    ("Distractor 1:", 'distractor1'),
    ("Distractor 2:", 'distractor2'),
    ("Distractor 3:", 'distractor3'),
):
    print(label, predictions[field])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Generated Question: <OOV> <OOV> <OOV> <OOV>
Correct Answer: <OOV> <OOV> <OOV> <OOV>
Distractor 1: <OOV> <OOV> <OOV> <OOV>
Distractor 2: <OOV> <OOV> <OOV> <OOV>
Distractor 3: <OOV> <OOV> <OOV> <OOV>


In [None]:
# Save the trained model to my_model.h5; the converter and download cells read this file
model.save('my_model.h5')



In [None]:
# Convert the saved Keras file my_model.h5 into a TensorFlow.js model written to tfjs_model/
!tensorflowjs_converter --input_format=keras my_model.h5 tfjs_model

failed to lookup keras version from the file,
    this is likely a weight only file


In [None]:
from google.colab import files

In [None]:
# Download the saved Keras model file via the Colab files helper
files.download('my_model.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Archive the converted tfjs_model directory, then download the archive
!zip -r tfjs_model.zip tfjs_model
files.download('tfjs_model.zip')

  adding: tfjs_model/ (stored 0%)
  adding: tfjs_model/model.json (deflated 85%)
  adding: tfjs_model/group1-shard1of1.bin (deflated 8%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>