In [3]:
import pdfplumber
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import random
import spacy
import language_tool_python
import csv

# Preprocessing the Given PDF

In [1]:
import pdfplumber
import re
import csv

def preprocess_text(text):
    text = text.replace('\n', ' ')
    text = re.sub(r'[^a-zA-Z0-9\s.]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.lower()
    return text

def extract_and_preprocess_text(file_path):
    preprocessed_texts = [] 
    with pdfplumber.open(file_path) as pdf:
        num_pages = len(pdf.pages)

        for page_num in range(num_pages):
            page = pdf.pages[page_num]
            text = page.extract_text()
            preprocessed_text = preprocess_text(text)
            preprocessed_texts.append(preprocessed_text)

            print(f"Page {page_num + 1}: {preprocessed_text}\n")

    return preprocessed_texts 

def save_to_csv(file_path, preprocessed_texts):
    with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['Page', 'Preprocessed Text'])  # Header

        for i, text in enumerate(preprocessed_texts, start=1):
            csv_writer.writerow([i, text])

pdf_file_path = 'C:/Users/sriyo/Desktop/jets/NLP Bot/files/NCERT Biology.pdf'
csv_file_path = 'output.csv'

preprocessed_texts = extract_and_preprocess_text(pdf_file_path)
save_to_csv(csv_file_path, preprocessed_texts)


Page 1: chapter 2 human reproduction 2.1 the male reproductive system as you are aware humans are sexually reproducing and 2.2 the female reproductive viviparous. the reproductive events in humans include system formation of gametes gametogenesis i.e. sperms in males 2.3 gametogenesis and ovum in females transfer of sperms into the female 2.4 menstrual cycle genital tract insemination and fusion of male and female gametes fertilisation leading to formation of zygote. this 2.5 fertilisation and is followed by formation and development of blastocyst implantation and its attachment to the uterine wall implantation 2.6 pregnancy and embryonic embryonic development gestation and delivery of the development baby parturition. you have learnt that these reproductive events occur after puberty. there are remarkable 2.7 parturition and lactation differences between the reproductive events in the male and in the female for example sperm formation continues even in old men but formation of ovum ce

# Question Generation

In [9]:

def generate_questions_for_sentences(sentences, model, tokenizer, page_number, top_k=50, top_p=0.95):
    generated_questions = set()

    num_sentences = len(sentences)

    for i in range(num_sentences - 1):

        selected_sentences = sentences[i:i+10]
        selected_text = ' '.join(selected_sentences)

        input_text_with_prefix = f"Generate a question for the following text (Page {page_number}): {selected_text}"
        inputs = tokenizer(input_text_with_prefix, return_tensors="pt")

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=50,
                top_k=top_k,
                top_p=top_p,
                temperature=1.0,
            )

        generated_question = tokenizer.decode(outputs[0], skip_special_tokens=True)
        split_questions = [question.strip() + '?' for question in re.split(r'[.!?]', generated_question)]
        generated_questions.update(split_questions)

    return [{'page': page_number, 'question': question} for question in generated_questions]
def save_questions_to_file(questions, output_file_path='generated_questions.txt'):
    with open(output_file_path, 'w', encoding='utf-8') as file:
        for question_entry in questions:
            question = question_entry['question']
            file.write(question + '\n')


tokenizer = AutoTokenizer.from_pretrained("wiselinjayajos/t5-end2end-questions-generation")
model = AutoModelForSeq2SeqLM.from_pretrained("wiselinjayajos/t5-end2end-questions-generation")



all_generated_questions = []

for i, input_paragraph in enumerate(preprocessed_texts, start=1):
    sentences = re.split(r'[.!?]', input_paragraph)
    generated_questions = generate_questions_for_sentences(sentences, model, tokenizer, page_number=i, top_k=50, top_p=0.95)
    all_generated_questions.extend(generated_questions)


save_questions_to_file(all_generated_questions, output_file_path='generated_questions.txt')


print("Generated Questions:")
for question in all_generated_questions:
    print(question)

print("Generated questions have been saved to 'generated_questions.txt' and printed.")


def save_questions_to_file_csv_with_page(questions, output_file_path='generated_questions_with_page.csv'):
    with open(output_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['Page', 'Generated Questions'])  # Header

        for entry in questions:
            csv_writer.writerow([entry['page'], entry['question']])

save_questions_to_file_csv_with_page(all_generated_questions, output_file_path='generated_questions_with_page.csv')

Generated Questions:
{'page': 1, 'question': 'What is the reproductive event in humans?'}
{'page': 1, 'question': 'What is the male reproductive system located in?'}
{'page': 1, 'question': 'What is the external genitalia rationalised 202324?'}
{'page': 1, 'question': 'What is the name?'}
{'page': 1, 'question': 'Where is the male reproductive?'}
{'page': 1, 'question': 'What is the name of the sperms in males?'}
{'page': 1, 'question': 'What is the name of the text that includes a pair of testes along with accessory ducts glands?'}
{'page': 1, 'question': 'Where is the female reproductive system located?'}
{'page': 1, 'question': 'What does sperm formation continue even in old men?'}
{'page': 1, 'question': 'What are the differences between the reproductive events in the male and female?'}
{'page': 1, 'question': 'What is the female reproductive system?'}
{'page': 1, 'question': 'What is the term for transfer of sperms into the female?'}
{'page': 1, 'question': 'Where is the male repr

In [26]:
with open("generated_questions_with_page.csv", 'r', encoding='utf-8') as file:
        generated_questions = file.readlines()

In [29]:
questions_only = [line.split(',')[1].strip() for line in generated_questions[1:]]


In [31]:
len(questions_only)

581

# Keyword Extraction

In [37]:
def extract_keywords_batch(texts, max_length=100):
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)
    predicted_keywords = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return predicted_keywords

batch_size = 2 
all_keywords = []

tokenizer = AutoTokenizer.from_pretrained("transformer3/H1-keywordextractor")
model = AutoModelForSeq2SeqLM.from_pretrained("transformer3/H1-keywordextractor")

for i in range(0, len(preprocessed_texts), batch_size):
    batch_texts = preprocessed_texts[i:i+batch_size]

    keywords_batch = extract_keywords_batch(batch_texts, max_length=512)

    all_keywords.extend(keywords_batch.split())

    print("Extracted Keywords (Batch {}):".format(i//batch_size + 1), keywords_batch)


print("All Extracted Keywords:", all_keywords)


Extracted Keywords (Batch 1): human reproduction, male reproductive system, female reproductive viviparous, gametes gametogenesis, sperms in males, ovum in females, menstrual cycle genital tract insemination, fertilisation, blastocyst implantation, uterine wall implantation
Extracted Keywords (Batch 2): biology,  seminiferous tubule,  penis,  male external genitalia,  foreskin,  paired seminal vesicles
Extracted Keywords (Batch 3): biology,  oviduct,  ampulla,  isthmus,  uterus,  womb,  inverted pear,  cervical canal figure 2.3b,  birth canal,  external thin membranous perimetrium middle thick layer of smooth muscle myometrium and inner glandular layer called endometrium
Extracted Keywords (Batch 4): biology secretion,  spermiogenesis,  plasma membrane,  haploid nucleus,  caplike structure acrosome,  mitochondria
Extracted Keywords (Batch 5): biology, menstrual cycle, endometrial lining, pregnancy, stress poor health, follicular phase, pituitary and ovarian hormones
Extracted Keywords 

In [38]:
all_keywords

['human',
 'reproduction,',
 'male',
 'reproductive',
 'system,',
 'female',
 'reproductive',
 'viviparous,',
 'gametes',
 'gametogenesis,',
 'sperms',
 'in',
 'males,',
 'ovum',
 'in',
 'females,',
 'menstrual',
 'cycle',
 'genital',
 'tract',
 'insemination,',
 'fertilisation,',
 'blastocyst',
 'implantation,',
 'uterine',
 'wall',
 'implantation',
 'biology,',
 'seminiferous',
 'tubule,',
 'penis,',
 'male',
 'external',
 'genitalia,',
 'foreskin,',
 'paired',
 'seminal',
 'vesicles',
 'biology,',
 'oviduct,',
 'ampulla,',
 'isthmus,',
 'uterus,',
 'womb,',
 'inverted',
 'pear,',
 'cervical',
 'canal',
 'figure',
 '2.3b,',
 'birth',
 'canal,',
 'external',
 'thin',
 'membranous',
 'perimetrium',
 'middle',
 'thick',
 'layer',
 'of',
 'smooth',
 'muscle',
 'myometrium',
 'and',
 'inner',
 'glandular',
 'layer',
 'called',
 'endometrium',
 'biology',
 'secretion,',
 'spermiogenesis,',
 'plasma',
 'membrane,',
 'haploid',
 'nucleus,',
 'caplike',
 'structure',
 'acrosome,',
 'mitochond

In [39]:
len(all_keywords)

155

In [40]:
nlp = spacy.load("en_core_web_sm")

def filter_keywords_spacy(keywords):
    doc = nlp(" ".join(keywords))


    filtered_keywords = [token.text for token in doc if not token.is_stop and not token.is_punct and not any(char.isdigit() for char in token.text) and token.text.lower() != 'figure' and token.text.lower() != 'diagram']

    return filtered_keywords

filtered_keywords_spacy = filter_keywords_spacy(all_keywords)

print("Filtered Keywords (spaCy):", filtered_keywords_spacy)


Filtered Keywords (spaCy): ['human', 'reproduction', 'male', 'reproductive', 'system', 'female', 'reproductive', 'viviparous', 'gametes', 'gametogenesis', 'sperms', 'males', 'ovum', 'females', 'menstrual', 'cycle', 'genital', 'tract', 'insemination', 'fertilisation', 'blastocyst', 'implantation', 'uterine', 'wall', 'implantation', 'biology', 'seminiferous', 'tubule', 'penis', 'male', 'external', 'genitalia', 'foreskin', 'paired', 'seminal', 'vesicles', 'biology', 'oviduct', 'ampulla', 'isthmus', 'uterus', 'womb', 'inverted', 'pear', 'cervical', 'canal', 'birth', 'canal', 'external', 'thin', 'membranous', 'perimetrium', 'middle', 'thick', 'layer', 'smooth', 'muscle', 'myometrium', 'inner', 'glandular', 'layer', 'called', 'endometrium', 'biology', 'secretion', 'spermiogenesis', 'plasma', 'membrane', 'haploid', 'nucleus', 'caplike', 'structure', 'acrosome', 'mitochondria', 'biology', 'menstrual', 'cycle', 'endometrial', 'lining', 'pregnancy', 'stress', 'poor', 'health', 'follicular', 'pha

In [41]:
len(filtered_keywords_spacy)

145

# Grammar Checking

In [32]:
tool = language_tool_python.LanguageTool('en-US')
grammar_filtered=[]
def is_grammatically_correct(question):
    matches = tool.check(question)
    return len(matches) == 0

for question in questions_only:
    if is_grammatically_correct(question):
         grammar_filtered.append(question)



In [33]:
grammar_filtered

['What is the reproductive event in humans?',
 'What is the male reproductive system located in?',
 'What is the name?',
 'Where is the male reproductive?',
 'What is the name of the sperms in males?',
 'What is the name of the text that includes a pair of testes along with accessory ducts glands?',
 'Where is the female reproductive system located?',
 'What does sperm formation continue even in old men?',
 'What are the differences between the reproductive events in the male and female?',
 'What is the female reproductive system?',
 'What is the term for transfer of sperms into the female?',
 'Where is the male reproductive system located?',
 'What is the female reproductive viviparous?',
 'What is the female reproductive system located in?',
 '?',
 'What is the male reproductive system?',
 'What?',
 'What is the name of the sperms in females?',
 'What is the name of the genital tract insemination?',
 'What is the male reproductive viviparous?',
 'What does ovum cease to do in women a

In [36]:
len(questions_only)

581

In [35]:
len(grammar_filtered)

421

# Filtration using Keywords

In [42]:
def filter_questions_by_context(questions, context_keywords):
    filtered_questions = []
    for question in questions:
            if any(keyword in question.lower() for keyword in context_keywords):
                filtered_questions.append(question)
    return filtered_questions

filtered_questions = filter_questions_by_context(grammar_filtered, filtered_keywords_spacy)




print("Filtered Questions:")
for question in filtered_questions:
    print(question)

print(f"Filtered questions have been saved to 'filtered_questions.txt' and printed.")


Filtered Questions:
What is the reproductive event in humans?
What is the male reproductive system located in?
Where is the male reproductive?
What is the name of the sperms in males?
What is the name of the text that includes a pair of testes along with accessory ducts glands?
Where is the female reproductive system located?
What are the differences between the reproductive events in the male and female?
What is the female reproductive system?
What is the term for transfer of sperms into the female?
Where is the male reproductive system located?
What is the female reproductive viviparous?
What is the female reproductive system located in?
What is the male reproductive system?
What is the name of the sperms in females?
What is the name of the genital tract insemination?
What is the male reproductive viviparous?
What does ovum cease to do in women around fifty years?
What are interstitial spaces called?
What is the male sex accessory duct?
What is the reproductive system of males?
What 

In [46]:
filtered_questions

['What is the reproductive event in humans?',
 'What is the male reproductive system located in?',
 'Where is the male reproductive?',
 'What is the name of the sperms in males?',
 'What is the name of the text that includes a pair of testes along with accessory ducts glands?',
 'Where is the female reproductive system located?',
 'What are the differences between the reproductive events in the male and female?',
 'What is the female reproductive system?',
 'What is the term for transfer of sperms into the female?',
 'Where is the male reproductive system located?',
 'What is the female reproductive viviparous?',
 'What is the female reproductive system located in?',
 'What is the male reproductive system?',
 'What is the name of the sperms in females?',
 'What is the name of the genital tract insemination?',
 'What is the male reproductive viviparous?',
 'What does ovum cease to do in women around fifty years?',
 'What are interstitial spaces called?',
 'What is the male sex accessory

# saved them 

In [76]:
def save_questions_to_file(questions, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        for question in questions:
            file.write(question)

output_file_path = 'output_questions.txt'

save_questions_to_file(filtered_questions, output_file_path)


# For assigning page numbers

In [69]:
matched_strings = []
for filtered_question in filtered_questions:
    for entry in generated_questions:
        if filtered_question.lower() in entry.lower():
            matched_strings.append(entry)

print(matched_strings)

['1,What is the reproductive event in humans?\n', '1,What is the male reproductive system located in?\n', '1,Where is the male reproductive?\n', '1,What is the name of the sperms in males?\n', '1,What is the name of the text that includes a pair of testes along with accessory ducts glands?\n', '1,Where is the female reproductive system located?\n', '1,What are the differences between the reproductive events in the male and female?\n', '1,What is the female reproductive system?\n', '1,What is the term for transfer of sperms into the female?\n', '1,Where is the male reproductive system located?\n', '1,What is the female reproductive viviparous?\n', '1,What is the female reproductive system located in?\n', '1,What is the male reproductive system?\n', '1,What is the name of the sperms in females?\n', '1,What is the name of the genital tract insemination?\n', '1,What is the male reproductive viviparous?\n', '1,What does ovum cease to do in women around fifty years?\n', '2,What are interstit

In [76]:

# New lists to store page numbers and questions
page_numbers = []
questions = []

# Check if each element in filtered_questions is present in generated_questions
for filtered_question in filtered_questions:
    for entry in generated_questions:
        if filtered_question.lower() in entry.lower():
            # Split the entry to extract page number and question
            parts = entry.split(',')
            page_numbers.append(parts[0])
            questions.append(parts[1].strip())

# Save page_numbers and questions to a CSV file
output_csv_path = 'matched_strings_output.csv'
with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Page', 'Question'])  # Header

    # Write the data to the CSV file
    for page, question in zip(page_numbers, questions):
        csv_writer.writerow([page, question])

print(f"Page numbers and questions have been saved to '{output_csv_path}'.")


Page numbers and questions have been saved to 'matched_strings_output.csv'.


# Answer Generation

In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
df=pd.read_csv('matched_strings_output.csv')

In [3]:
df

Unnamed: 0,Page,Question
0,1,What is the reproductive event in humans?
1,1,What is the male reproductive system located in?
2,1,Where is the male reproductive?
3,1,What is the name of the sperms in males?
4,1,What is the name of the text that includes a p...
...,...,...
337,15,What type of cells are found in ovary?
338,15,What hormones are involved in induction of par...
339,15,What are the hormones involved in regulation o...
340,15,What is the name of the function of the follow...


In [4]:
contextt=pd.read_csv('output.csv')
contextt

Unnamed: 0,Page,Preprocessed Text
0,1,chapter 2 human reproduction 2.1 the male repr...
1,2,human reproduction the testes are situated out...
2,3,biology figure 2.2 diagrammatic sectional view...
3,4,human reproduction figure 2.3 a diagrammatic s...
4,5,biology part of the oviduct called ampulla. th...
5,6,human reproduction a functional mammary gland ...
6,7,biology secretion of some factors which help i...
7,8,human reproduction secondary oocyte retains bu...
8,9,biology figure 2.9 diagrammatic presentation o...
9,10,human reproduction gonadotropins lh and fsh in...


In [5]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

In [6]:
# Example 
context = "Mitochondria is the powerhouse of the cell"
question = "What is mitochondria?"

inputs = tokenizer(question, context, return_tensors="pt")


start_positions = torch.argmax(model(**inputs).start_logits)
end_positions = torch.argmax(model(**inputs).end_logits)


answer = tokenizer.decode(inputs["input_ids"][0][start_positions:end_positions+1])

print("Answer:", answer)



Answer:  the powerhouse of the cell


In [9]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load RoBERTa tokenizer and model
model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)



# Create a dictionary to store answers
answers_dict = {'Question': [], 'Answer': []}

# Loop through questions
for index, row in df.iterrows():
    page_number = row['Page']
    question = row['Question']

    # Find corresponding context based on page number
    context = contextt[contextt['Page'] == page_number]['Preprocessed Text'].values[0]

    # Tokenize input
    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)

    # Get the answer
    start_positions = torch.argmax(model(**inputs).start_logits)
    end_positions = torch.argmax(model(**inputs).end_logits)
    answer = tokenizer.decode(inputs["input_ids"][0][start_positions:end_positions + 1])

    # Store the answer in the dictionary
    answers_dict['Question'].append(question)
    answers_dict['Answer'].append(answer)

# Convert the dictionary to a dataframe
answers_df = pd.DataFrame(answers_dict)

# Save the dataframe to a CSV file
answers_df.to_csv('answers.csv', index=False)

# Display the answers dataframe
print(answers_df)


                                              Question  \
0            What is the reproductive event in humans?   
1     What is the male reproductive system located in?   
2                      Where is the male reproductive?   
3             What is the name of the sperms in males?   
4    What is the name of the text that includes a p...   
..                                                 ...   
337             What type of cells are found in ovary?   
338  What hormones are involved in induction of par...   
339  What are the hormones involved in regulation o...   
340  What is the name of the function of the follow...   
341           What are the functions of the following?   

                                                Answer  
0            system formation of gametes gametogenesis  
1                                        pelvis region  
2                                        pelvis region  
3                                gametes gametogenesis  
4                 

In [10]:
answers_df

Unnamed: 0,Question,Answer
0,What is the reproductive event in humans?,system formation of gametes gametogenesis
1,What is the male reproductive system located in?,pelvis region
2,Where is the male reproductive?,pelvis region
3,What is the name of the sperms in males?,gametes gametogenesis
4,What is the name of the text that includes a p...,<s>
...,...,...
337,What type of cells are found in ovary?,c leydig cells
338,What hormones are involved in induction of par...,<s>What hormones are involved in induction of ...
339,What are the hormones involved in regulation o...,<s>
340,What is the name of the function of the follow...,
