In [None]:
def preprocess_text(text):
    text = text.replace('\n', ' ')
    text = re.sub(r'[^a-zA-Z0-9\s.]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.lower()
    return text

def extract_and_preprocess_text(file_path):
    preprocessed_texts = [] 
    with pdfplumber.open(file_path) as pdf:
        num_pages = len(pdf.pages)

        for page_num in range(num_pages):
            page = pdf.pages[page_num]
            text = page.extract_text()
            preprocessed_text = preprocess_text(text)
            preprocessed_texts.append(preprocessed_text)

            print(f"{preprocessed_text}\n")

    return preprocessed_texts 


pdf_file_path = 'C:/Users/sriyo/Desktop/jets/NLP Bot/files/NCERT Biology.pdf'
preprocessed_texts = extract_and_preprocess_text(pdf_file_path)

def generate_questions_for_sentences(sentences, model, tokenizer, top_k=50, top_p=0.95):
    generated_questions = set()

    num_sentences = len(sentences)

    for i in range(num_sentences - 1):

        selected_sentences = sentences[i:i+10]
        selected_text = ' '.join(selected_sentences)

        input_text_with_prefix = f"Generate a question for the following text: {selected_text}"
        inputs = tokenizer(input_text_with_prefix, return_tensors="pt")

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=50,
                top_k=top_k,
                top_p=top_p,
                temperature=1.0,
            )

        generated_question = tokenizer.decode(outputs[0], skip_special_tokens=True)
        split_questions = [question.strip() + '?' for question in re.split(r'[.!?]', generated_question)]
        generated_questions.update(split_questions)
    return generated_questions

def save_questions_to_file(questions, output_file_path='generated_questions.txt'):
    with open(output_file_path, 'w', encoding='utf-8') as file:
        for question in questions:
            file.write(question + '\n')

tokenizer = AutoTokenizer.from_pretrained("wiselinjayajos/t5-end2end-questions-generation")
model = AutoModelForSeq2SeqLM.from_pretrained("wiselinjayajos/t5-end2end-questions-generation")



all_generated_questions = set()

for input_paragraph in preprocessed_texts:
    sentences = re.split(r'[.!?]', input_paragraph)
    generated_questions = generate_questions_for_sentences(sentences, model, tokenizer, top_k=50, top_p=0.95)
    all_generated_questions.update(generated_questions)


save_questions_to_file(all_generated_questions, output_file_path='generated_questions.txt')


print("Generated Questions:")
for question in all_generated_questions:
    print(question)

print("Generated questions have been saved to 'generated_questions.txt' and printed.")

def extract_keywords_batch(texts, max_length=100):
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)
    predicted_keywords = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return predicted_keywords

batch_size = 2 
all_keywords = []

tokenizer = AutoTokenizer.from_pretrained("transformer3/H1-keywordextractor")
model = AutoModelForSeq2SeqLM.from_pretrained("transformer3/H1-keywordextractor")

for i in range(0, len(preprocessed_texts), batch_size):
    batch_texts = preprocessed_texts[i:i+batch_size]

    keywords_batch = extract_keywords_batch(batch_texts, max_length=512)

    all_keywords.extend(keywords_batch.split())

    print("Extracted Keywords (Batch {}):".format(i//batch_size + 1), keywords_batch)


print("All Extracted Keywords:", all_keywords)

nlp = spacy.load("en_core_web_sm")

def filter_keywords_spacy(keywords):
    doc = nlp(" ".join(keywords))


    filtered_keywords = [token.text for token in doc if not token.is_stop and not token.is_punct and not any(char.isdigit() for char in token.text) and token.text.lower() != 'figure' and token.text.lower() != 'diagram']

    return filtered_keywords

filtered_keywords_spacy = filter_keywords_spacy(all_keywords)

print("Filtered Keywords (spaCy):", filtered_keywords_spacy)
tool = language_tool_python.LanguageTool('en-US')
grammar_filtered=[]
def is_grammatically_correct(question):
    matches = tool.check(question)
    return len(matches) == 0

for question in generated_questions:
    if is_grammatically_correct(question):
         grammar_filtered.append(question)

file_path='generated_questions.txt'
with open(file_path, 'r', encoding='utf-8') as file:
        generated_questions = file.readlines()
def save_questions_to_file(questions, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        for question in questions:
            file.write(question)

output_file_path = 'output_questions.txt'

save_questions_to_file(filtered_questions, output_file_path)



In [4]:
import pdfplumber
import re
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import spacy
import language_tool_python

nlp = spacy.load("en_core_web_sm")
tool = language_tool_python.LanguageTool('en-US')

def preprocess_text(text):
    text = text.replace('\n', ' ')
    text = re.sub(r'[^a-zA-Z0-9\s.]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.lower()
    return text

def extract_and_preprocess_text(file_path):
    preprocessed_texts = [] 
    with pdfplumber.open(file_path) as pdf:
        num_pages = len(pdf.pages)

        for page_num in range(num_pages):
            page = pdf.pages[page_num]
            text = page.extract_text()
            preprocessed_text = preprocess_text(text)
            preprocessed_texts.append(preprocessed_text)

            print(f"{preprocessed_text}\n")

    return preprocessed_texts 

def extract_keywords_batch(texts, tokenizer, model, max_length=100):
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)
    predicted_keywords = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return predicted_keywords


def filter_keywords_spacy(keywords):
    doc = nlp(" ".join(keywords))
    filtered_keywords = [token.text for token in doc if not token.is_stop and not token.is_punct and not any(char.isdigit() for char in token.text) and token.text.lower() != 'figure' and token.text.lower() != 'diagram']
    return filtered_keywords

def generate_questions_for_sentences(sentences, model, tokenizer, top_k=50, top_p=0.95):
    generated_questions = set()
    num_sentences = len(sentences)

    for i in range(num_sentences - 1):
        selected_sentences = sentences[i:i+10]
        selected_text = ' '.join(selected_sentences)

        input_text_with_prefix = f"Generate a question for the following text: {selected_text}"
        inputs = tokenizer(input_text_with_prefix, return_tensors="pt")

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=50,
                top_k=top_k,
                top_p=top_p,
                temperature=1.0,
            )

        generated_question = tokenizer.decode(outputs[0], skip_special_tokens=True)
        split_questions = [question.strip() + '?' for question in re.split(r'[.!?]', generated_question)]
        generated_questions.update(split_questions)

    return generated_questions

def is_grammatically_correct(question):
    matches = tool.check(question)
    return len(matches) == 0

def save_questions_to_file(questions, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        for question in questions:
            file.write(question + '\n')

def generate_questions_from_pdf(pdf_file_path, output_file_path='output_questions.txt'):
    # Extract and preprocess text from PDF
    preprocessed_texts = extract_and_preprocess_text(pdf_file_path)

    # Initialize T5 model and tokenizer for generating questions
    tokenizer_t5 = AutoTokenizer.from_pretrained("wiselinjayajos/t5-end2end-questions-generation")
    model_t5 = AutoModelForSeq2SeqLM.from_pretrained("wiselinjayajos/t5-end2end-questions-generation")

    # Generate questions using T5 model
    all_generated_questions = set()
    for input_paragraph in preprocessed_texts:
        sentences = re.split(r'[.!?]', input_paragraph)
        generated_questions = generate_questions_for_sentences(sentences, model_t5, tokenizer_t5, top_k=50, top_p=0.95)
        all_generated_questions.update(generated_questions)

    # Initialize tokenizer and model for keyword extraction
    tokenizer_keyword = AutoTokenizer.from_pretrained("transformer3/H1-keywordextractor")
    model_keyword = AutoModelForSeq2SeqLM.from_pretrained("transformer3/H1-keywordextractor")

    # Extract keywords using another model
    batch_size = 2 
    all_keywords = []
    for i in range(0, len(preprocessed_texts), batch_size):
        batch_texts = preprocessed_texts[i:i+batch_size]
        keywords_batch = extract_keywords_batch(batch_texts, max_length=512, tokenizer=tokenizer_keyword, model=model_keyword)
        all_keywords.extend(keywords_batch.split())
        print("Extracted Keywords (Batch {}):".format(i//batch_size + 1), keywords_batch)

    # Filter keywords using spaCy
    filtered_keywords_spacy = filter_keywords_spacy(all_keywords)
    print("Filtered Keywords (spaCy):", filtered_keywords_spacy)

    # Filter questions based on grammar
    grammar_filtered = [question for question in all_generated_questions if is_grammatically_correct(question)]

    # Save filtered questions to a file
    save_questions_to_file(grammar_filtered, output_file_path)

    print("Filtered Questions have been saved to 'output_questions.txt' and printed.")

# Example Usage:
pdf_file_path = 'C:/Users/sriyo/Desktop/jets/NLP Bot/files/NCERT Biology.pdf'
generate_questions_from_pdf(pdf_file_path)


chapter 2 human reproduction 2.1 the male reproductive system as you are aware humans are sexually reproducing and 2.2 the female reproductive viviparous. the reproductive events in humans include system formation of gametes gametogenesis i.e. sperms in males 2.3 gametogenesis and ovum in females transfer of sperms into the female 2.4 menstrual cycle genital tract insemination and fusion of male and female gametes fertilisation leading to formation of zygote. this 2.5 fertilisation and is followed by formation and development of blastocyst implantation and its attachment to the uterine wall implantation 2.6 pregnancy and embryonic embryonic development gestation and delivery of the development baby parturition. you have learnt that these reproductive events occur after puberty. there are remarkable 2.7 parturition and lactation differences between the reproductive events in the male and in the female for example sperm formation continues even in old men but formation of ovum ceases in 

NameError: name 'tool' is not defined

In [3]:
import pdfplumber
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import random
import spacy
import language_tool_python
import csv

# Preprocessing the Given PDF

In [2]:
import pdfplumber
import re
import csv

def preprocess_text(text):
    text = text.replace('\n', ' ')
    text = re.sub(r'[^a-zA-Z0-9\s.]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.lower()
    return text

def extract_and_preprocess_text(file_path):
    preprocessed_texts = [] 
    with pdfplumber.open(file_path) as pdf:
        num_pages = len(pdf.pages)

        for page_num in range(num_pages):
            page = pdf.pages[page_num]
            text = page.extract_text()
            preprocessed_text = preprocess_text(text)
            preprocessed_texts.append(preprocessed_text)

            print(f"{preprocessed_text}\n")

    return preprocessed_texts 

def save_to_csv(file_path, preprocessed_texts):
    with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['Page', 'Preprocessed Text'])  # Header

        for i, text in enumerate(preprocessed_texts, start=1):
            csv_writer.writerow([i, text])

pdf_file_path = 'C:/Users/sriyo/Desktop/jets/NLP Bot/files/NCERT Biology.pdf'
csv_file_path = 'output.csv'

preprocessed_texts = extract_and_preprocess_text(pdf_file_path)
save_to_csv(csv_file_path, preprocessed_texts)


chapter 2 human reproduction 2.1 the male reproductive system as you are aware humans are sexually reproducing and 2.2 the female reproductive viviparous. the reproductive events in humans include system formation of gametes gametogenesis i.e. sperms in males 2.3 gametogenesis and ovum in females transfer of sperms into the female 2.4 menstrual cycle genital tract insemination and fusion of male and female gametes fertilisation leading to formation of zygote. this 2.5 fertilisation and is followed by formation and development of blastocyst implantation and its attachment to the uterine wall implantation 2.6 pregnancy and embryonic embryonic development gestation and delivery of the development baby parturition. you have learnt that these reproductive events occur after puberty. there are remarkable 2.7 parturition and lactation differences between the reproductive events in the male and in the female for example sperm formation continues even in old men but formation of ovum ceases in 

# Question Generation

In [4]:
def generate_questions_for_sentences(sentences, model, tokenizer, top_k=50, top_p=0.95):
    generated_questions = set()

    num_sentences = len(sentences)

    for i in range(num_sentences - 1):

        selected_sentences = sentences[i:i+10]
        selected_text = ' '.join(selected_sentences)

        input_text_with_prefix = f"Generate a question for the following text: {selected_text}"
        inputs = tokenizer(input_text_with_prefix, return_tensors="pt")

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=50,
                top_k=top_k,
                top_p=top_p,
                temperature=1.0,
            )

        generated_question = tokenizer.decode(outputs[0], skip_special_tokens=True)
        split_questions = [question.strip() + '?' for question in re.split(r'[.!?]', generated_question)]
        generated_questions.update(split_questions)
    return generated_questions

def save_questions_to_file(questions, output_file_path='generated_questions.txt'):
    with open(output_file_path, 'w', encoding='utf-8') as file:
        for question in questions:
            file.write(question + '\n')

tokenizer = AutoTokenizer.from_pretrained("wiselinjayajos/t5-end2end-questions-generation")
model = AutoModelForSeq2SeqLM.from_pretrained("wiselinjayajos/t5-end2end-questions-generation")



all_generated_questions = set()

for input_paragraph in preprocessed_texts:
    sentences = re.split(r'[.!?]', input_paragraph)
    generated_questions = generate_questions_for_sentences(sentences, model, tokenizer, top_k=50, top_p=0.95)
    all_generated_questions.update(generated_questions)


save_questions_to_file(all_generated_questions, output_file_path='generated_questions.txt')


print("Generated Questions:")
for question in all_generated_questions:
    print(question)

print("Generated questions have been saved to 'generated_questions.txt' and printed.")

def save_questions_to_file_csv(questions, output_file_path='generated_questions.csv'):
    with open(output_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['Generated Questions'])  # Header

        for question in questions:
            csv_writer.writerow([question])
            
save_questions_to_file_csv(all_generated_questions, output_file_path='generated_questions.csv')


From c:\Users\sriyo\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.



Generated Questions:
What is the inner cell mass called in human reproduction blastomeres?
What is the process called that produces sperms?
What year was the female reproductive system rationalised?
What is the uterus also called?
What is the name of the genital tract insemination and fusion of male and female gametes fertilisation leading to formation of zygote?
What is the secretions of?
What is the longest pregnancy in humans?
What is the seminal plasma rich in?
What hormone induces ovulation?
How many highly coiled seminiferous tubules are in each lobule?
What is the urethra called?
When does spermatogenesis start?
What is the female reproductive?
How long does human pregnancy last?
What type of cells are found in ovary?
What is the opening of the vagina often covered partially?
Where are truefalse c leydig cells found?
What is connected to lactiferous duct through which milk is sucked out?
What do sertoli cells provide?
What is the primary female sex organs that produce the female

# Keyword Extraction

In [77]:
def extract_keywords_batch(texts, max_length=100):
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)
    predicted_keywords = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return predicted_keywords

batch_size = 2 
all_keywords = []

tokenizer = AutoTokenizer.from_pretrained("transformer3/H1-keywordextractor")
model = AutoModelForSeq2SeqLM.from_pretrained("transformer3/H1-keywordextractor")

for i in range(0, len(preprocessed_texts), batch_size):
    batch_texts = preprocessed_texts[i:i+batch_size]

    keywords_batch = extract_keywords_batch(batch_texts, max_length=512)

    all_keywords.extend(keywords_batch.split())

    print("Extracted Keywords (Batch {}):".format(i//batch_size + 1), keywords_batch)


print("All Extracted Keywords:", all_keywords)


KeyboardInterrupt: 

In [1]:
all_keywords

NameError: name 'all_keywords' is not defined

In [67]:
len(all_keywords)

155

In [68]:
nlp = spacy.load("en_core_web_sm")

def filter_keywords_spacy(keywords):
    doc = nlp(" ".join(keywords))


    filtered_keywords = [token.text for token in doc if not token.is_stop and not token.is_punct and not any(char.isdigit() for char in token.text) and token.text.lower() != 'figure' and token.text.lower() != 'diagram']

    return filtered_keywords

filtered_keywords_spacy = filter_keywords_spacy(all_keywords)

print("Filtered Keywords (spaCy):", filtered_keywords_spacy)


Filtered Keywords (spaCy): ['human', 'reproduction', 'male', 'reproductive', 'system', 'female', 'reproductive', 'viviparous', 'gametes', 'gametogenesis', 'sperms', 'males', 'ovum', 'females', 'menstrual', 'cycle', 'genital', 'tract', 'insemination', 'fertilisation', 'blastocyst', 'implantation', 'uterine', 'wall', 'implantation', 'biology', 'seminiferous', 'tubule', 'penis', 'male', 'external', 'genitalia', 'foreskin', 'paired', 'seminal', 'vesicles', 'biology', 'oviduct', 'ampulla', 'isthmus', 'uterus', 'womb', 'inverted', 'pear', 'cervical', 'canal', 'birth', 'canal', 'external', 'thin', 'membranous', 'perimetrium', 'middle', 'thick', 'layer', 'smooth', 'muscle', 'myometrium', 'inner', 'glandular', 'layer', 'called', 'endometrium', 'biology', 'secretion', 'spermiogenesis', 'plasma', 'membrane', 'haploid', 'nucleus', 'caplike', 'structure', 'acrosome', 'mitochondria', 'biology', 'menstrual', 'cycle', 'endometrial', 'lining', 'pregnancy', 'stress', 'poor', 'health', 'follicular', 'pha

In [69]:
len(filtered_keywords_spacy)

145

# Grammar Checking

In [70]:
tool = language_tool_python.LanguageTool('en-US')
grammar_filtered=[]
def is_grammatically_correct(question):
    matches = tool.check(question)
    return len(matches) == 0

for question in generated_questions:
    if is_grammatically_correct(question):
         grammar_filtered.append(question)

file_path='generated_questions.txt'
with open(file_path, 'r', encoding='utf-8') as file:
        generated_questions = file.readlines()


In [71]:
grammar_filtered

['What percentage of sperms carry the x chromosome?\n',
 'What is the process called that produces sperms?\n',
 'When are the body is?\n',
 'How many days is the average interval of menstruation in human females?\n',
 'What can break the hymen?\n',
 'What are the compartments inside the testis called?\n',
 'What is the term for the cycle that ends around 50 years of age?\n',
 'What is the female reproductive?\n',
 'What is the process called that releases the secondary oocyte ovum from the ovary?\n',
 'What is the male sex accessory duct?\n',
 'What is the step of implantation that leads to pregnancy?\n',
 'What are the functions of male sex accessory ducts and glands maintained by?\n',
 'What are the hormones involved in regulation of spermatogenesis?\n',
 'What are the functions of the following?\n',
 'What is the width of the testis in adults?\n',
 'What is the structure of a sperm?\n',
 'What are the male sex accessory ducts?\n',
 'What is the name of the newborn baby fed by the mo

In [72]:
len(generated_questions)

531

In [73]:
len(grammar_filtered)

371

# Filtration using Keywords

In [74]:
def filter_questions_by_context(questions, context_keywords):
    filtered_questions = []
    for question in questions:
            if any(keyword in question.lower() for keyword in context_keywords):
                filtered_questions.append(question)
    return filtered_questions

filtered_questions = filter_questions_by_context(grammar_filtered, filtered_keywords_spacy)




print("Filtered Questions:")
for question in filtered_questions:
    print(question)

print(f"Filtered questions have been saved to 'filtered_questions.txt' and printed.")


Filtered Questions:
What percentage of sperms carry the x chromosome?

What is the process called that produces sperms?

When are the body is?

How many days is the average interval of menstruation in human females?

What are the compartments inside the testis called?

What is the term for the cycle that ends around 50 years of age?

What is the female reproductive?

What is the process called that releases the secondary oocyte ovum from the ovary?

What is the male sex accessory duct?

What is the step of implantation that leads to pregnancy?

What are the functions of male sex accessory ducts and glands maintained by?

What are the hormones involved in regulation of spermatogenesis?

What are the functions of the following?

What is the structure of a sperm?

What are the male sex accessory ducts?

What is often torn during the first coitus intercourse?

What is the primary follicle called?

What is the male external genitalia called?

What is the reproductive cycle in the female pri

In [75]:
len(filtered_questions)

308

# saved them 

In [76]:
def save_questions_to_file(questions, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        for question in questions:
            file.write(question)

output_file_path = 'output_questions.txt'

save_questions_to_file(filtered_questions, output_file_path)
