In [None]:
#Install required package
!pip install PyMuPDF python-docx keybert transformers h5py



In [None]:
#Install required packages
!pip install tensorflowjs



In [None]:
# Import required packages
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflowjs as tfjs

In [None]:
# Import required packages
import fitz  # PyMuPDF
from docx import Document
import random
from keybert import KeyBERT
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import json
from google.colab import files
import os
import gzip
import numpy as np
import h5py
from google.colab import files


  from tqdm.autonotebook import tqdm, trange


In [None]:
# Define the model
class QuestionGenerator:
    """Generate multiple-choice questions from a passage of text.

    KeyBERT proposes keywords, a T5 model produces one question per chosen
    answer keyword, and the remaining keywords are used as distractors.
    Helpers for reading PDF/DOCX input and for exporting the model weights
    are included.
    """

    # Word budget applied to text read from PDF/DOCX files.
    MAX_WORDS = 350
    # Upper bound on encoder input tokens; longer prompts are truncated.
    MAX_INPUT_TOKENS = 512
    # Minimum number of keywords requested so that distractors are available.
    MIN_KEYWORDS = 5

    def __init__(self, t5_qg_model_name='t5-small'):
        """Load KeyBERT plus the T5 tokenizer/model and quantize the model.

        Args:
            t5_qg_model_name: Hugging Face checkpoint name or path for the T5
                model and its tokenizer.
        """
        self.kw_model = KeyBERT()
        self.tokenizer_qg = T5Tokenizer.from_pretrained(t5_qg_model_name)
        self.model_qg = T5ForConditionalGeneration.from_pretrained(t5_qg_model_name)

        # Dynamic int8 quantization of the Linear layers. After this call the
        # state_dict also contains non-tensor entries (dtype markers and
        # (weight, bias) tuples), which the save helpers below account for.
        self.model_qg = torch.quantization.quantize_dynamic(
            self.model_qg, {torch.nn.Linear}, dtype=torch.qint8
        )

    def extract_keywords(self, text, num_keywords=5):
        """Return up to ``num_keywords`` keyword strings extracted from ``text``."""
        keywords = self.kw_model.extract_keywords(text, top_n=num_keywords)
        # KeyBERT yields (keyword, score) pairs; only the keyword is kept.
        return [keyword[0] for keyword in keywords]

    def generate_question_with_choices(self, keywords, context, answer_keyword=None):
        """Generate one question and its answer choices.

        Args:
            keywords: Candidate keywords; one is the answer and the others
                are the pool of distractors.
            context: Passage the question is generated from.
            answer_keyword: Keyword to use as the correct answer. When
                omitted, one is picked at random from ``keywords``.

        Returns:
            Tuple ``(question, answer_keyword, choices)`` where ``choices`` is
            a shuffled list holding the answer and up to three distractors.

        Raises:
            IndexError: If ``keywords`` is empty and no ``answer_keyword`` is
                given (raised by ``random.choice``).
        """
        if answer_keyword is None:
            answer_keyword = random.choice(keywords)
        other_keywords = [kw for kw in keywords if kw != answer_keyword]

        # NOTE(review): presumably the checkpoint should be fine-tuned for this
        # "generate question:" prompt; the stock default may emit statements
        # rather than questions - confirm the intended checkpoint.
        input_text = f"generate question: {answer_keyword} context: {context}"
        # Truncate so an over-long passage cannot exceed the encoder budget.
        input_ids = self.tokenizer_qg.encode(
            input_text,
            return_tensors='pt',
            truncation=True,
            max_length=self.MAX_INPUT_TOKENS,
        )

        outputs = self.model_qg.generate(input_ids, max_length=50, num_beams=4, num_return_sequences=1, early_stopping=True)
        question = self.tokenizer_qg.decode(outputs[0], skip_special_tokens=True)

        distractors = random.sample(other_keywords, min(len(other_keywords), 3))
        choices = [answer_keyword] + distractors
        random.shuffle(choices)

        return question, answer_keyword, choices

    def generate_questions(self, passage, num_questions=5):
        """Generate up to ``num_questions`` questions with distinct answers.

        Args:
            passage: Text to generate questions from.
            num_questions: Desired number of questions. Fewer are returned
                when fewer distinct keywords can be extracted.

        Returns:
            List of dicts with the keys ``'question'``, ``'answer'`` and
            ``'choices'``.
        """
        # Ask for enough keywords to cover every requested question while
        # keeping a pool of distractors for small requests.
        keywords = self.extract_keywords(
            passage, num_keywords=max(num_questions, self.MIN_KEYWORDS)
        )
        # Drop repeated keywords (order preserved) so answers stay distinct.
        keywords = list(dict.fromkeys(keywords))
        num_questions = max(0, min(num_questions, len(keywords)))

        questions = []
        # Sampling without replacement prevents the same answer keyword (and
        # therefore the same question) from being produced more than once.
        for chosen_answer in random.sample(keywords, num_questions):
            question, answer, choices = self.generate_question_with_choices(
                keywords, passage, answer_keyword=chosen_answer
            )
            questions.append({
                'question': question,
                'answer': answer,
                'choices': choices
            })

        return questions

    @staticmethod
    def _flatten_state_entry(name, value):
        """Yield ``(name, numpy_array)`` pairs for one ``state_dict`` entry.

        Tensors are yielded under ``name`` (quantized tensors are dequantized
        to float first). Tuples/lists, such as the packed ``(weight, bias)``
        of a quantized Linear layer, are expanded to ``name.0``, ``name.1``...
        Entries with no numeric payload (``None``, ``torch.dtype``) yield
        nothing.
        """
        if isinstance(value, torch.Tensor):
            if value.is_quantized:
                value = value.dequantize()
            yield name, value.detach().cpu().numpy()
        elif isinstance(value, (tuple, list)):
            for position, item in enumerate(value):
                yield from QuestionGenerator._flatten_state_entry(
                    f"{name}.{position}", item
                )

    def save_model_to_json(self, json_path, full_weights=False):
        """Write the model state, tokenizer vocabulary and config to JSON.

        Args:
            json_path: Destination file path.
            full_weights: When False (default) every state entry is stored as
                its ``str()`` representation, which keeps the file small but
                is abbreviated for large tensors and cannot be used to restore
                the weights. When True the numeric values are written as
                nested lists (quantized tensors are dequantized), which is
                reloadable but can produce a very large file.
        """
        state = self.model_qg.state_dict()
        if full_weights:
            model_state_dict = {}
            for key, value in state.items():
                arrays = list(self._flatten_state_entry(key, value))
                if arrays:
                    for name, array in arrays:
                        model_state_dict[name] = array.tolist()
                else:
                    # Non-numeric entries (e.g. dtype markers) are kept as text.
                    model_state_dict[key] = str(value)
        else:
            model_state_dict = {k: str(v) for k, v in state.items()}
        tokenizer_state_dict = self.tokenizer_qg.get_vocab()

        save_dict = {
            'model_state_dict': model_state_dict,
            'tokenizer_state_dict': tokenizer_state_dict,
            'config': self.model_qg.config.to_dict()
        }

        with open(json_path, 'w') as f:
            json.dump(save_dict, f, separators=(',', ':'))

        print(f"Model and tokenizer saved to {json_path}")

    def save_model_to_h5(self, h5_file):
        """Write the model weights to an HDF5 file, one dataset per array.

        Plain tensors are stored under their ``state_dict`` key. Packed
        parameters of quantized layers are dequantized and stored as
        ``<key>.0`` (weight) and ``<key>.1`` (bias, when present).
        ``torch.dtype`` entries are stored as string attributes on the file
        root. The file is a raw weight dump, not a Keras model file.
        """
        with h5py.File(h5_file, 'w') as f:
            for key, value in self.model_qg.state_dict().items():
                if isinstance(value, torch.dtype):
                    f.attrs[key] = str(value)
                    continue
                for name, array in self._flatten_state_entry(key, value):
                    f.create_dataset(name, data=array)

        print(f"Model saved to {h5_file}")

    def truncate_text(self, text, max_words=MAX_WORDS):
        """Return the first ``max_words`` whitespace-separated words of ``text``."""
        words = text.split()
        truncated_text = ' '.join(words[:max_words])
        return truncated_text

    def read_pdf(self, pdf_path):
        """Read text from a PDF, stopping once the word budget is reached."""
        doc = fitz.open(pdf_path)
        try:
            text = ""
            for page in doc:
                text += page.get_text()
                if len(text.split()) >= self.MAX_WORDS:
                    break
        finally:
            # Release the file handle even if text extraction fails.
            doc.close()
        return self.truncate_text(text, self.MAX_WORDS)

    def read_word(self, docx_path):
        """Read text from a DOCX file, stopping once the word budget is reached."""
        doc = Document(docx_path)
        text = ""
        for paragraph in doc.paragraphs:
            text += paragraph.text + "\n"
            if len(text.split()) >= self.MAX_WORDS:
                break
        return self.truncate_text(text, self.MAX_WORDS)

    def read_file(self, file_path):
        """Read a PDF or DOCX file, dispatching on the file extension.

        Raises:
            ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
        """
        file_extension = os.path.splitext(file_path)[1].lower()
        if file_extension == '.pdf':
            return self.read_pdf(file_path)
        elif file_extension == '.docx':
            return self.read_word(file_path)
        else:
            raise ValueError("Unsupported file format. Please provide a PDF or DOCX file.")

In [None]:
# Build the generator with the default T5 checkpoint spelled out explicitly.
qg = QuestionGenerator(t5_qg_model_name='t5-small')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Test the model on a sample document.
input_file = 'egypt.pdf'
file_text = qg.read_file(input_file)
# Seed Python's RNG so the randomly chosen answers and distractors are
# reproducible from one run to the next.
random.seed(42)
generated_questions = qg.generate_questions(file_text)

In [None]:
#Generate result
# Print each question with its correct answer and the shuffled choices.
for number, item in enumerate(generated_questions, start=1):
    print(
        f"\nQuestion {number}: {item['question']}",
        f"Correct Answer: {item['answer']}",
        f"Choices: {item['choices']}",
        sep="\n",
    )


Question 1: Egyptians lived in narrow bands of land on each side of the Nile
Correct Answer: egyptians
Choices: ['river', 'nile', 'ethiopia', 'egyptians']

Question 2: It begins near the equator in Africa and flows north to the Mediterranean Sea
Correct Answer: river
Choices: ['nile', 'egypt', 'river', 'egyptians']

Question 3: Ethiopia caused the Nile to flood every summer
Correct Answer: ethiopia
Choices: ['ethiopia', 'nile', 'egypt', 'river']

Question 4: The Nile River fed Egyptian civilization for hundreds of years
Correct Answer: egypt
Choices: ['nile', 'ethiopia', 'egypt', 'river']

Question 5: Egyptians lived in narrow bands of land on each side of the Nile
Correct Answer: egyptians
Choices: ['ethiopia', 'river', 'nile', 'egyptians']


In [None]:
# Save model
# NOTE(review): `model` is not defined in any cell visible in this notebook, so
# this line raises NameError on a fresh run unless it is created elsewhere.
# Confirm which object was meant: the QuestionGenerator instance above is `qg`,
# and it wraps a PyTorch module (`qg.model_qg`), not an object named `model`.
model.save('my_model.h5')

In [None]:
# Run the TensorFlow.js converter on my_model.h5 (declared as Keras input format)
# and write the result to the tfjs_model directory.
!tensorflowjs_converter --input_format=keras my_model.h5 tfjs_model


In [None]:
from google.colab import files


In [None]:
# Download my_model.h5 through the google.colab `files` helper.
files.download('my_model.h5')

In [None]:
# Zip the tfjs_model directory recursively, then download the archive
# through the google.colab `files` helper.
!zip -r tfjs_model.zip tfjs_model
files.download('tfjs_model.zip')

In [None]:
# Save the model to JSON with a managed file size.
qg.save_model_to_json('question_generator_model.json')