In [None]:
# Install requireds Packages
!pip install PyMuPDF python-docx

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.5-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.3 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx, PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.5 PyMuPDFb-1.24.3 python-docx-1.1.2


In [None]:
!pip install keybert

Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl (37 kB)
Collecting sentence-transformers>=0.3.8 (from keybert)
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━[0m [32m204.8/227.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.1

In [None]:
# Import Packages
import fitz  # PyMuPDF
from docx import Document
import random
from keybert import KeyBERT
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import json
from google.colab import files
import os

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Define the model
class QuestionGenerator:
    """Generate multiple-choice questions from a passage of text.

    Keywords are extracted from the passage with KeyBERT. Each selected
    keyword becomes the correct answer of one question, a T5 checkpoint
    writes the question text, and the remaining keywords are used as
    distractor choices.
    """

    # Cap, in whitespace-separated words, applied to text read from files.
    MAX_WORDS = 350

    # Label that the generation model may echo at the start of its output
    # (visible in this notebook's printed results); it is stripped from
    # returned questions.
    _QUESTION_PREFIX = "question:"

    def __init__(self, t5_qg_model_name='mrm8488/t5-base-finetuned-question-generation-ap'):
        """Load the keyword extractor and the question-generation model.

        Args:
            t5_qg_model_name: Name or path passed to ``from_pretrained`` for
                both the T5 tokenizer and the T5 model.
        """
        self.kw_model = KeyBERT()
        self.tokenizer_qg = T5Tokenizer.from_pretrained(t5_qg_model_name)
        self.model_qg = T5ForConditionalGeneration.from_pretrained(t5_qg_model_name)

    def extract_keywords(self, text, num_keywords=5):
        """Return up to ``num_keywords`` keywords extracted from ``text``.

        Only the first element of each entry returned by KeyBERT is kept.
        """
        keywords = self.kw_model.extract_keywords(text, top_n=num_keywords)
        return [keyword[0] for keyword in keywords]

    def generate_question_with_choices(self, keywords, context, answer_keyword=None):
        """Generate one question whose answer is one of ``keywords``.

        Args:
            keywords: Candidate keywords; the non-answer ones are the pool
                from which distractors are drawn.
            context: Passage the question should be about.
            answer_keyword: Keyword to use as the correct answer. When
                omitted, one is picked at random from ``keywords``.

        Returns:
            Tuple ``(question, answer_keyword, choices)`` where ``choices`` is
            a shuffled list holding the answer plus at most three distractors.

        Raises:
            IndexError: If ``keywords`` is empty and no ``answer_keyword`` is
                given (raised by ``random.choice``).
        """
        if answer_keyword is None:
            answer_keyword = random.choice(keywords)
        # dict.fromkeys drops repeated keywords while keeping their order, so
        # the same distractor cannot appear twice among the choices.
        other_keywords = [kw for kw in dict.fromkeys(keywords) if kw != answer_keyword]

        # NOTE(review): the model card of the default checkpoint appears to
        # document an "answer: ... context: ..." prompt; confirm whether this
        # prefix is intentional before changing it.
        input_text = f"generate question: {answer_keyword} context: {context}"
        input_ids = self.tokenizer_qg.encode(input_text, return_tensors='pt')

        outputs = self.model_qg.generate(
            input_ids,
            max_length=50,
            num_beams=4,
            num_return_sequences=1,
            early_stopping=True,
        )
        question = self.tokenizer_qg.decode(outputs[0], skip_special_tokens=True).strip()
        # Remove the echoed "question:" label so callers get only the question.
        if question.lower().startswith(self._QUESTION_PREFIX):
            question = question[len(self._QUESTION_PREFIX):].strip()

        distractors = random.sample(other_keywords, min(len(other_keywords), 3))
        choices = [answer_keyword] + distractors
        random.shuffle(choices)

        return question, answer_keyword, choices

    def generate_questions(self, passage, num_questions=5):
        """Generate up to ``num_questions`` questions about ``passage``.

        Each question uses a different keyword as its answer, so the same
        question is not produced repeatedly. The number of questions is
        limited by the number of distinct keywords found; a non-positive
        ``num_questions`` yields an empty list.

        Returns:
            List of dicts with the keys ``'question'``, ``'answer'`` and
            ``'choices'``.
        """
        keywords = list(dict.fromkeys(self.extract_keywords(passage)))
        num_questions = max(0, min(num_questions, len(keywords)))
        questions = []

        # Sampling without replacement guarantees distinct answers.
        for answer_keyword in random.sample(keywords, num_questions):
            question, answer, choices = self.generate_question_with_choices(
                keywords, passage, answer_keyword=answer_keyword
            )
            questions.append({
                'question': question,
                'answer': answer,
                'choices': choices
            })

        return questions

    def save_to_json(self, questions, json_file):
        """Write ``questions`` to ``json_file`` as indented JSON."""
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(questions, f, indent=4)
        print(f"Questions saved to {json_file}")

    def save_to_binary(self, questions, bin_file):
        """Serialize ``questions`` to ``bin_file`` with ``torch.save``."""
        torch.save(questions, bin_file)
        print(f"Questions saved to {bin_file}")

    def save_model_to_bin(self, bin_path):
        """Save model weights, tokenizer vocabulary and model config to ``bin_path``.

        The file is a ``torch.save`` dump of a dict with the keys
        ``'model_state_dict'``, ``'tokenizer_state_dict'`` (the mapping
        returned by ``get_vocab()``) and ``'config'``.
        """
        # NOTE(review): only the vocabulary mapping of the tokenizer is stored;
        # confirm that is enough for whatever code reloads this file.
        model_state_dict = self.model_qg.state_dict()
        tokenizer_state_dict = self.tokenizer_qg.get_vocab()

        save_dict = {
            'model_state_dict': model_state_dict,
            'tokenizer_state_dict': tokenizer_state_dict,
            'config': self.model_qg.config.to_dict()
        }

        torch.save(save_dict, bin_path)
        print(f"Model and tokenizer saved to {bin_path}")

    def save_model_to_json(self, json_path):
        """Save model weights, tokenizer vocabulary and model config as JSON.

        Same layout as ``save_model_to_bin``, except every tensor of the state
        dict is first converted to nested Python lists. All weights are held
        as Python objects before being written, so this can be slow and
        memory-hungry for large models.
        """
        model_state_dict = self.model_qg.state_dict()
        model_state_dict = {k: v.tolist() for k, v in model_state_dict.items()}  # Convert tensors to lists
        tokenizer_state_dict = self.tokenizer_qg.get_vocab()

        save_dict = {
            'model_state_dict': model_state_dict,
            'tokenizer_state_dict': tokenizer_state_dict,
            'config': self.model_qg.config.to_dict()
        }

        with open(json_path, 'w') as f:
            json.dump(save_dict, f)
        print(f"Model and tokenizer saved to {json_path}")

    def truncate_text(self, text, max_words=MAX_WORDS):
        """Return the first ``max_words`` whitespace-separated words of ``text``.

        Words are re-joined with single spaces, so original line breaks and
        repeated whitespace are not preserved.
        """
        words = text.split()
        truncated_text = ' '.join(words[:max_words])
        return truncated_text

    def read_pdf(self, pdf_path):
        """Read text from the PDF at ``pdf_path``, truncated to ``MAX_WORDS`` words.

        Pages are read in order and reading stops as soon as enough words
        have been collected.
        """
        doc = fitz.open(pdf_path)
        try:
            text = ""
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                text += page.get_text()
                if len(text.split()) >= self.MAX_WORDS:
                    break
        finally:
            # Release the file handle even if text extraction fails.
            doc.close()
        return self.truncate_text(text)

    def read_word(self, docx_path):
        """Read text from the DOCX at ``docx_path``, truncated to ``MAX_WORDS`` words.

        Paragraphs are read in order and reading stops as soon as enough
        words have been collected.
        """
        doc = Document(docx_path)
        text = ""
        for paragraph in doc.paragraphs:
            text += paragraph.text + "\n"
            # Same early-exit threshold as read_pdf; the result is truncated
            # to MAX_WORDS below either way.
            if len(text.split()) >= self.MAX_WORDS:
                break
        return self.truncate_text(text)

    def read_file(self, file_path):
        """Read truncated text from a ``.pdf`` or ``.docx`` file.

        The reader is chosen from the (case-insensitive) file extension.

        Raises:
            ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
        """
        file_extension = os.path.splitext(file_path)[1].lower()
        if file_extension == '.pdf':
            return self.read_pdf(file_path)
        elif file_extension == '.docx':
            return self.read_word(file_path)
        else:
            raise ValueError("Unsupported file format. Please provide a PDF or DOCX file.")

In [None]:
# Build the generator; the constructor loads KeyBERT and the T5 tokenizer/model.
qg = QuestionGenerator()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

In [None]:
# Test the model: read the sample document and generate questions from its text.
# 'egypt.pdf' is a relative path, so the file must be present in the working directory.
input_file = 'egypt.pdf'
file_text = qg.read_file(input_file)
generated_questions = qg.generate_questions(file_text)

In [None]:
# Show each generated question with its correct answer and the shuffled choices.
for idx, question_data in enumerate(generated_questions):
    print(
        f"\nQuestion {idx+1}: {question_data['question']}",
        f"Correct Answer: {question_data['answer']}",
        f"Choices: {question_data['choices']}",
        sep="\n",
    )


Question 1: question: The Nile is the world's longest what?
Correct Answer: river
Choices: ['egypt', 'egyptians', 'river', 'nile']

Question 2: question: The Nile is the world's longest what?
Correct Answer: river
Choices: ['river', 'nile', 'egyptians', 'ethiopia']

Question 3: question: Who stayed close to home?
Correct Answer: egyptians
Choices: ['river', 'egyptians', 'nile', 'egypt']

Question 4: question: What civilization was fed by the Nile River?
Correct Answer: egypt
Choices: ['ethiopia', 'nile', 'egyptians', 'egypt']

Question 5: question: Who stayed close to home?
Correct Answer: egyptians
Choices: ['egyptians', 'nile', 'ethiopia', 'river']


In [None]:
# Save the model weights, tokenizer vocabulary and config into one torch.save file.
qg.save_model_to_bin('question_generator_model.bin')

Model and tokenizer saved to question_generator_model.bin


In [None]:
# Same content as the .bin file, but written as JSON with every weight tensor
# converted to nested lists; expect this to be slow and the file to be large.
qg.save_model_to_json('question_generator_model.json')

Model and tokenizer saved to question_generator_model.json
