In [1]:
# Step 1: Install Necessary Libraries
# Install the required libraries for PDF extraction, transformers, and PyTorch
!pip install PyMuPDF transformers torch




In [2]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Step 2: Extract Text from the PDF
import fitz  # PyMuPDF is imported as fitz

# Path to the PDF file
pdf_path = 'Environmental Factors.pdf'

# Open the PDF file using PyMuPDF
pdf_document = fitz.open(pdf_path)

# Initialize an empty string to store the extracted text
pdf_text = ""
# Loop through each page in the PDF document
for page_num in range(pdf_document.page_count):
    page = pdf_document.load_page(page_num)  # Load the current page
    pdf_text += page.get_text()  # Append the text of the current page to pdf_text

# Print the first 2000 characters of the extracted text for verification
print(pdf_text[:2000])  # Printing only the first 2000 characters for brevity


Environmental Factors and Pollution in Egypt:  
Egypt, a land of towering pyramids and ancient wonders, boasts a unique environment 
shaped by a complex interplay of living (biotic) and nonliving (abiotic) factors. 
Understanding these factors is crucial for appreciating the delicate ecological balance of 
Egypt and the challenges it faces. 
A Land of Contrasts: Biotic Abundance and Aridity 
Despite the harsh desert climate, Egypt supports a diverse range of plant and animal 
life. Acacia trees with their water-conserving adaptations thrive in the desert sands, 
while along the life-giving Nile River, papyrus reeds and date palms flourish, providing 
food and shelter for a variety of animals. These vibrant ecosystems demonstrate the 
remarkable resilience of life in Egypt. 
The Nile River: A Lifeline for Egypt 
The Nile River is the heart of Egypt's ecosystem, a vital artery that has sustained human 
populations and agriculture for millennia. Its annual floods once deposited fertile si

In [4]:
# Step 3: Load and Prepare the Pre-trained QA Model
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import torch

# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the Model class to encapsulate the QA model and its functionality
class Model:
    def __init__(self, model_name='distilbert-base-uncased-distilled-squad'):
        self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)  # Load the tokenizer
        self.model = DistilBertForQuestionAnswering.from_pretrained(model_name).to(device)  # Load the model and move it to the device

    def get_best_answer(self, question, context, max_len=512):
        # Encode the question and context as inputs for the model
        inputs = self.tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt").to(device)
        input_ids = inputs["input_ids"].tolist()[0]

        if len(input_ids) > max_len:
            print(f"Input sequence is too long: {len(input_ids)} tokens. Splitting context into smaller chunks.")
            # Split the context into overlapping chunks if it exceeds max length
            chunk_size = max_len - len(self.tokenizer.encode(question, add_special_tokens=False)) - 3
            overlap = 50  # Define overlap between chunks
            chunks = [context[i:i+chunk_size] for i in range(0, len(context), chunk_size - overlap)]
            
            best_answer = ""
            highest_score = float('-inf')
            
            for chunk in chunks:
                # Encode each chunk along with the question
                inputs = self.tokenizer.encode_plus(question, chunk, add_special_tokens=True, return_tensors="pt").to(device)
                input_ids = inputs["input_ids"].tolist()[0]
                
                # Get model outputs for the current chunk
                outputs = self.model(**inputs)
                answer_start_scores = outputs.start_logits
                answer_end_scores = outputs.end_logits

                # Get the most likely beginning and end of the answer
                answer_start = torch.argmax(answer_start_scores)
                answer_end = torch.argmax(answer_end_scores) + 1

                # Calculate the confidence score
                confidence_score = torch.max(answer_start_scores) + torch.max(answer_end_scores)

                # Convert tokens to string to get the answer text
                answer = self.tokenizer.convert_tokens_to_string(self.tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

                # Update the best answer based on the highest confidence score
                if confidence_score > highest_score:
                    highest_score = confidence_score
                    best_answer = answer
            
            return best_answer
        else:
            # If the input length is within the limit, get model outputs directly
            outputs = self.model(**inputs)
            answer_start_scores = outputs.start_logits
            answer_end_scores = outputs.end_logits

            # Get the most likely beginning and end of the answer
            answer_start = torch.argmax(answer_start_scores)
            answer_end = torch.argmax(answer_end_scores) + 1

            # Convert tokens to string to get the answer text
            answer = self.tokenizer.convert_tokens_to_string(self.tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
            return answer

    def predict(self, context, question):
        # Method to get the best answer for a given question and context
        return {"answer": self.get_best_answer(question, context)}


In [5]:
# Step 4: Create an instance of the Model class
model = Model()

# Step 5: Interactively ask questions and get answers
while True:
    question = input("Please enter your question (or type 'exit' to quit): ")
    if question.lower() == 'exit':
        break
    answer = model.predict(pdf_text, question)  # Get the answer for the input question
    print("Question: " + question)
    print("Answer: " + answer["answer"] + "\n")  # Print the question and answer


Please enter your question (or type 'exit' to quit): What is the length of the Nile?


Token indices sequence length is longer than the specified maximum sequence length for this model (1280 > 512). Running this sequence through the model will result in indexing errors


Input sequence is too long: 1280 tokens. Splitting context into smaller chunks.
Question: What is the length of the Nile?
Answer: 6 , 650 kilometers

Please enter your question (or type 'exit' to quit): where is it located
Input sequence is too long: 1276 tokens. Splitting context into smaller chunks.
Question: where is it located
Answer: the red sea

Please enter your question (or type 'exit' to quit): exit


In [8]:
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 6: Prepare the validation dataset
validation_data = [
    {"question": "What is the length of the Nile?", "context": "The Nile is the longest river in the world, stretching for 6,650 kilometers.", "answer": "6,650 kilometers"},
    {"question": "What is the discharge of the Nile?", "context": "The Nile has an average discharge rate of 2,830 cubic meters per second.", "answer": "2,830 cubic meters per second"},
    {"question": "Where does the Nile originate?", "context": "The Nile originates from Lake Victoria in Uganda.", "answer": "Lake Victoria"},
    {"question": "What are the major tributaries of the Nile?", "context": "The two major tributaries of the Nile are the White Nile and the Blue Nile.", "answer": "White Nile and Blue Nile"},
    {"question": "What countries does the Nile flow through?", "context": "The Nile flows through multiple countries, including Uganda, Sudan, and Egypt.", "answer": "Uganda, Sudan, and Egypt"},
    {"question": "What is the length of the Amazon River?", "context": "The Amazon River is approximately 7,000 kilometers long.", "answer": "7,000 kilometers"},
    {"question": "What is the volume of water discharged by the Amazon River?", "context": "The Amazon River discharges around 209,000 cubic meters of water per second.", "answer": "209,000 cubic meters per second"},
    {"question": "What is the longest river in the United States?", "context": "The Missouri River is the longest river in the United States.", "answer": "Missouri River"},
    {"question": "What is the length of the Mississippi River?", "context": "The Mississippi River is approximately 3,730 kilometers long.", "answer": "3,730 kilometers"},
    {"question": "Where does the Mississippi River end?", "context": "The Mississippi River ends in the Gulf of Mexico.", "answer": "Gulf of Mexico"}
]
# Step 7: Predict answers for the validation dataset
true_answers = []
predicted_answers = []

for data in validation_data:
    question = data['question']
    context = data['context']
    true_answer = data['answer']
    predicted_answer = model.predict(context, question)["answer"]
    
    true_answers.append(true_answer)
    predicted_answers.append(predicted_answer)

    print(f"Question: {question}")
    print(f"True Answer: {true_answer}")
    print(f"Predicted Answer: {predicted_answer}\n")

# Step 8: Compute evaluation metrics
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to normalize answers
def normalize_answer(s):
    """Lowercase, remove punctuation, and extra whitespace."""
    s = s.lower()
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(\d)\s*,\s*(\d)', r'\1,\2', s)  # Remove spaces around commas in numbers
    s = re.sub(r'[^a-z0-9\s,]', '', s)
    return s

# Normalize true and predicted answers
true_answers_normalized = [normalize_answer(answer) for answer in true_answers]
predicted_answers_normalized = [normalize_answer(answer) for answer in predicted_answers]

# Calculate accuracy (exact match after normalization)
accuracy = accuracy_score(true_answers_normalized, predicted_answers_normalized)

# Calculate precision, recall, and F1 score
precision = precision_score(true_answers_normalized, predicted_answers_normalized, average='micro', zero_division=0)
recall = recall_score(true_answers_normalized, predicted_answers_normalized, average='micro', zero_division=0)
f1 = f1_score(true_answers_normalized, predicted_answers_normalized, average='micro', zero_division=0)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Question: What is the length of the Nile?
True Answer: 6,650 kilometers
Predicted Answer: 6 , 650 kilometers

Question: What is the discharge of the Nile?
True Answer: 2,830 cubic meters per second
Predicted Answer: 2 , 830 cubic meters per second

Question: Where does the Nile originate?
True Answer: Lake Victoria
Predicted Answer: lake victoria in uganda

Question: What are the major tributaries of the Nile?
True Answer: White Nile and Blue Nile
Predicted Answer: white nile and the blue nile

Question: What countries does the Nile flow through?
True Answer: Uganda, Sudan, and Egypt
Predicted Answer: uganda , sudan , and egypt

Question: What is the length of the Amazon River?
True Answer: 7,000 kilometers
Predicted Answer: 7 , 000 kilometers

Question: What is the volume of water discharged by the Amazon River?
True Answer: 209,000 cubic meters per second
Predicted Answer: 209 , 000 cubic meters

Question: What is the longest river in the United States?
True Answer: Missouri River
Pr