In [1]:
import pdfplumber
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

In [2]:
def preprocess_text(text):
    text = text.replace('\n', ' ')
    text = re.sub(r'[^a-zA-Z0-9\s.]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.lower()
    return text

def extract_and_preprocess_text(file_path):
    preprocessed_texts = [] 
    with pdfplumber.open(file_path) as pdf:
        num_pages = len(pdf.pages)

        for page_num in range(num_pages):
            page = pdf.pages[page_num]
            text = page.extract_text()
            preprocessed_text = preprocess_text(text)
            preprocessed_texts.append(preprocessed_text)

            print(f"{preprocessed_text}\n")

    return preprocessed_texts 


pdf_file_path = 'C:/Users/sriyo/Desktop/jets/NLP Bot/files/NCERT Biology.pdf'
preprocessed_texts = extract_and_preprocess_text(pdf_file_path)



chapter 2 human reproduction 2.1 the male reproductive system as you are aware humans are sexually reproducing and 2.2 the female reproductive viviparous. the reproductive events in humans include system formation of gametes gametogenesis i.e. sperms in males 2.3 gametogenesis and ovum in females transfer of sperms into the female 2.4 menstrual cycle genital tract insemination and fusion of male and female gametes fertilisation leading to formation of zygote. this 2.5 fertilisation and is followed by formation and development of blastocyst implantation and its attachment to the uterine wall implantation 2.6 pregnancy and embryonic embryonic development gestation and delivery of the development baby parturition. you have learnt that these reproductive events occur after puberty. there are remarkable 2.7 parturition and lactation differences between the reproductive events in the male and in the female for example sperm formation continues even in old men but formation of ovum ceases in 

In [8]:
def generate_questions_for_paragraphs(paragraphs, model, tokenizer, top_k=50, top_p=0.95):
    generated_questions = set()

    for paragraph in paragraphs:
        for _ in range(10):
            input_text_with_prefix = f"Generate a question for the following text: {paragraph}"
            inputs = tokenizer(input_text_with_prefix, return_tensors="pt")

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_length=50,
                    top_k=top_k,
                    top_p=top_p,
                    temperature=1.0,
                )

            generated_question = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            split_questions = [question.strip() + '?' for question in re.split(r'[.!?]', generated_question[0])]
            generated_questions.update(split_questions)
    return generated_questions


def save_questions_to_file(questions,output_file_path='generated_questions.txt'):
    with open(output_file_path,'w',encoding='utf-8') as file:
        for question in questions:
            file.write(question+'\n')

tokenizer=AutoTokenizer.from_pretrained("wiselinjayajos/t5-end2end-questions-generation")
model=AutoModelForSeq2SeqLM.from_pretrained("wiselinjayajos/t5-end2end-questions-generation")
all_generated_questions=generate_questions_for_paragraphs(preprocessed_texts,model,tokenizer,top_k=50,top_p=0.95)
save_questions_to_file(all_generated_questions, output_file_path='generated_questions.txt')
print("Generated Questions:")
for question in all_generated_questions:
    print(question)
print(f"Generated questions have been saved to 'generated_questions.txt' and printed.")

Token indices sequence length is longer than the specified maximum sequence length for this model (568 > 512). Running this sequence through the model will result in indexing errors


In [None]:
all_generated_questions

{'9?',
 '?',
 'How long is each testis in adults?',
 'How many chromosomes are in each spermatogonium?',
 'How many compartments are in each testis?',
 'How many mammary lobes are there?',
 'What are fimbriae?',
 'What hormones are lh and fsh?',
 'What induces the completion of the meiotic division of the secondary oocyte?',
 'What is characteristic of all female mammals?',
 'What is the cavity of the cervix called?',
 'What is the enlarged end of penis called?',
 'What is the female reproductive system located in?',
 'What is the female reproductive system?',
 'What is the first sign of growing foetus noticed by listening to the heart sound carefully through the steth?',
 'What is the fusion?',
 'What is the hormone secreted?',
 'What is the implantation of the embryo called?',
 'What is the inner cell mass called?',
 'What is the inner layer called?',
 'What is the last part of the oviduct called?',
 'What is the main event of the menstrual cycle shown in figure 2?',
 'What is the ma

# Task is to get important keywords

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("transformer3/H1-keywordextractor")
model = AutoModelForSeq2SeqLM.from_pretrained("transformer3/H1-keywordextractor")

def extract_keywords(text, max_length=100):

    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)

    predicted_keywords = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return predicted_keywords

keywords = extract_keywords(preprocessed_texts, max_length=150)

print("Extracted Keywords:", keywords)

In [17]:
import language_tool_python


tool = language_tool_python.LanguageTool('en-US')

def is_grammatically_correct(question):
    matches = tool.check(question)
    return len(matches) == 0

def filter_questions_by_context(questions, context_keywords):
    filtered_questions = []
    for question in questions:
        if is_grammatically_correct(question):
            if any(keyword in question.lower() for keyword in context_keywords):
                filtered_questions.append(question)
    return filtered_questions


context_keywords = ['ovum', 'follicle', 'testis', 'oviduct', 'menstrual', 'female reproductive system']


filtered_questions = filter_questions_by_context(all_generated_questions, context_keywords)


save_questions_to_file(filtered_questions, output_file_path='filtered_questions.txt')


print("Filtered Questions:")
for question in filtered_questions:
    print(question)

print(f"Filtered questions have been saved to 'filtered_questions.txt' and printed.")


Keywords extracted from the context: ['the age of fifty years', 'three', 'two', 'spermatogonia', 'sertoli', 'vasa efferentia', 'the vasa efferentia', 'the vasa efferentia', 'meatus', 'stroma', 'stroma', 'two', 'fallopian', 'three', 'labia majora labia', 'labia', 'the labia majora', 'two', 'labia', 'first', 'alveoli', 'i.e', 'spermatogonia', 'first', 'two', 'second', 'four', 'sertoli', 'two', 'sertoli', 'us', 'a couple of million gamete', 'theca', 'tertiary', 'theca', 'theca interna', 'tertiary', 'first', 'first', 'first', 'first', 'tertiary', 'graafian follicle', 'secondary', 'first', 'one', 'one', 'vagina', 'menarche', 'fallopian', 'second', 'second', 'three', 'various months', 'months', 'one month', 'first', 'the end of the second month', 'first', 'the fifth month', 'second', 'the end of nine months', 'the initial few days', 'three', 'spermatogonia', 'sertoli', 'spermatogonia', 'stroma', 'three', 'labia majora labia', 'clitoris', 'one', 'only one', 'ovarian hormones', 'nine months', 