In [1]:
# Step 1: Install Necessary Libraries
!pip install transformers torch

data = [
    {
        "context": "The Nile River is the lifeblood of Egypt, providing water for agriculture, industry, and domestic use.",
        "question": "What is the importance of the Nile River to Egypt?",
        "answer": "lifeblood of Egypt"  # Exact match
    },
    {
        "context": "Egypt has implemented conservation projects to protect the Nile River's ecosystem and biodiversity.",
        "question": "What are some of Egypt's conservation initiatives?",
        "answer": "conservation projects" # Exact match
    }
    # Add more question-answer pairs, ensuring answers are exact substrings of the context.
]


import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_len=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        encoding = self.tokenizer.encode_plus(
            item['question'],
            item['context'],
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        # Ensuring the answer is within the context
        start_idx = 0
        end_idx = 0
        if item['answer'] in item['context']:
            start_idx = item['context'].index(item['answer'])
            end_idx = start_idx + len(item['answer'])
        else:
            print(f"Answer not found in context: {item['answer']}")

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'start_positions': torch.tensor(start_idx),
            'end_positions': torch.tensor(end_idx)
        }

# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

# Device configuration (for GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Create the dataset and dataloader
dataset = QADataset(data, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Set up the optimizer and scheduler (using torch.optim.AdamW)
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(dataloader) * 3  # Number of epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Fine-tune the model
model.train()
for epoch in range(3):
    for batch in dataloader:
        optimizer.zero_grad()
        
        #Move input data to the correct device (GPU or CPU)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, 
                        start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
    print(f"Epoch {epoch + 1} completed")

# Save the fine-tuned model
model.save_pretrained('fine-tuned-distilbert')
tokenizer.save_pretrained('fine-tuned-distilbert')



Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completed
Epoch 2 completed
Epoch 3 completed


('fine-tuned-distilbert\\tokenizer_config.json',
 'fine-tuned-distilbert\\special_tokens_map.json',
 'fine-tuned-distilbert\\vocab.txt',
 'fine-tuned-distilbert\\added_tokens.json')

In [2]:
pip install transformers torch nltk pymupdf fuzzywuzzy


Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from fuzzywuzzy import fuzz
import nltk

# Ensure the stopwords and wordnet are downloaded
nltk.download('stopwords')
nltk.download('wordnet')

class Model:
    def __init__(self, model_name='fine-tuned-distilbert', max_len=512):
        self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        self.model = DistilBertForQuestionAnswering.from_pretrained(model_name).to('cuda' if torch.cuda.is_available() else 'cpu')
        self.max_len = max_len

    def get_best_answer(self, question, context, stride=1):
        # Tokenize question and context
        question_encodings = self.tokenizer.encode_plus(
            question, add_special_tokens=True, return_tensors="pt"
        ).to('cuda' if torch.cuda.is_available() else 'cpu')
        context_tokens = self.tokenizer.tokenize(context)

        best_answer = ""
        highest_score = -float('inf')

        # Lemmatization of keywords
        lemmatizer = WordNetLemmatizer()
        expanded_keywords = set(lemmatizer.lemmatize(kw) for kw in question.lower().split())

        # Iterate over tokenized context with stride
        for i in range(0, len(context_tokens), stride):
            chunk_start = i
            chunk_end = min(i + self.max_len - len(question_encodings['input_ids'][0]) - 3, len(context_tokens))
            chunk_tokens = context_tokens[chunk_start:chunk_end]
            chunk = self.tokenizer.convert_tokens_to_string(chunk_tokens)

            inputs = self.tokenizer.encode_plus(
                question,
                chunk,
                add_special_tokens=True,
                return_tensors="pt",
                truncation="only_second",
                max_length=self.max_len,
                padding='max_length'
            ).to('cuda' if torch.cuda.is_available() else 'cpu')

            outputs = self.model(**inputs)
            start_logits, end_logits = outputs.start_logits, outputs.end_logits
            start_indexes = torch.topk(start_logits, k=3).indices[0].tolist()
            end_indexes = torch.topk(end_logits, k=3).indices[0].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Check for a valid answer span within the context (not including the question)
                    if start_index <= end_index < len(inputs["input_ids"][0]) and start_index >= len(question_encodings["input_ids"][0]):
                        answer_tokens = inputs["input_ids"][0][start_index:end_index + 1]
                        answer = self.tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()

                        # Answer validation
                        if len(answer) < 5 or answer.startswith(".") or answer.endswith(".") or "[PAD]" in answer:
                            continue

                        # Refined scoring
                        logit_score = (start_logits[0, start_index] + end_logits[0, end_index]).item()
                        keyword_score = sum(
                            max(fuzz.ratio(keyword, answer.lower()), fuzz.partial_ratio(keyword, answer.lower())) 
                            for keyword in expanded_keywords
                        ) / 100  # Normalize to 0-1 range

                        score = logit_score + keyword_score

                        if score > highest_score:
                            highest_score = score

                            # Post-processing: Find the first keyword and start the answer from there
                            for i in range(len(answer_tokens)):
                                if self.tokenizer.decode([answer_tokens[i]]).lower() in expanded_keywords:
                                    best_answer = self.tokenizer.decode(answer_tokens[i:], skip_special_tokens=True).strip()
                                    break  # Stop after the first keyword is found
                            else: 
                                best_answer = answer

        return best_answer

# Example usage
model = Model('fine-tuned-distilbert')
context = "The Nile River is the heart of Egypt's ecosystem. It has supported human populations and agriculture for millennia. Egypt has undertaken various conservation initiatives."
question = "What are the conservation initiatives in Egypt?"

answer = model.get_best_answer(question, context)
print("Question:", question)
print("Answer:", answer)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AMR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\AMR\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Question: What are the conservation initiatives in Egypt?
Answer: 
