# In [None]:
import pandas as pd
import re
from typing import List
from transformers import BertForQuestionAnswering, BertTokenizer, pipeline
import torch

# Function to load and clean the data
def load_and_clean_data(df: pd.DataFrame) -> List[str]:
    """Return one cleaned text per row of ``df['articleBody']``.

    Cleaning removes every character that is neither a word character nor
    whitespace, then collapses each run of whitespace (newlines included)
    into a single space and trims both ends.

    Args:
        df: Frame holding the article texts in an ``articleBody`` column.

    Returns:
        The cleaned texts in row order. A row whose body is not a string
        (for example a missing value) yields ``""`` so the result stays
        aligned with the rows of ``df``.

    Raises:
        KeyError: If ``df`` has rows but no ``articleBody`` column.
    """
    # A frame without rows has nothing to clean, even if the column is absent.
    if len(df.index) == 0:
        return []

    cleaned_texts = []
    for article_body in df['articleBody']:
        if not isinstance(article_body, str):
            cleaned_texts.append('')
            continue
        # Drop punctuation first: removing it after the whitespace has been
        # collapsed would leave doubled spaces behind ("a - b" -> "a  b").
        without_punctuation = re.sub(r'[^\w\s]', '', article_body)
        cleaned_texts.append(re.sub(r'\s+', ' ', without_punctuation).strip())
    return cleaned_texts

# Function to refine the context
def refine_context(df: pd.DataFrame, keyword: str) -> List[str]:
    """Return the cleaned bodies of the articles that mention ``keyword``.

    The keyword is matched case-insensitively against the raw
    ``articleBody`` text, before any cleaning. Each matching body then has
    every character that is neither a word character nor whitespace removed,
    each whitespace run collapsed into a single space, and both ends trimmed.

    Args:
        df: Frame holding the article texts in an ``articleBody`` column.
        keyword: Substring to look for; an empty keyword matches every body.

    Returns:
        The cleaned texts of the matching rows, in row order. Rows whose
        body is not a string (for example a missing value) are skipped.

    Raises:
        KeyError: If ``df`` has rows but no ``articleBody`` column.
    """
    # A frame without rows has nothing to match, even if the column is absent.
    if len(df.index) == 0:
        return []

    needle = keyword.lower()  # lower-cased once instead of once per row
    relevant_texts = []
    for article_body in df['articleBody']:
        if not isinstance(article_body, str):
            continue
        if needle not in article_body.lower():
            continue
        # Drop punctuation first: removing it after the whitespace has been
        # collapsed would leave doubled spaces behind ("a - b" -> "a  b").
        without_punctuation = re.sub(r'[^\w\s]', '', article_body)
        relevant_texts.append(re.sub(r'\s+', ' ', without_punctuation).strip())
    return relevant_texts

# Function to further refine the context
def further_refine_context(context: List[str], keyword: str) -> List[str]:
    """Keep the texts in ``context`` that mention ``keyword``.

    Matching is case-insensitive. A text is kept when it contains the keyword
    as given, or the keyword with its punctuation removed and whitespace runs
    collapsed. The second form is needed for texts that have already had
    their punctuation stripped: there "Al-Shifa" reads "AlShifa", and the
    literal keyword alone would never match.

    Args:
        context: Candidate texts.
        keyword: Substring to look for; an empty keyword matches every text.

    Returns:
        The matching texts, unchanged and in their original order.
    """
    literal_needle = keyword.lower()
    normalized_needle = re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', literal_needle))

    needles = [literal_needle]
    # Skip the normalized form when nothing but whitespace is left of it
    # (keyword made of punctuation only); it would match almost every text.
    if normalized_needle.strip() and normalized_needle != literal_needle:
        needles.append(normalized_needle)

    refined_texts = []
    for text in context:
        lowered = text.lower()
        if any(needle in lowered for needle in needles):
            refined_texts.append(text)
    return refined_texts

# Function to answer the question
def answer_question_bert(question: str, context: List[str]) -> str:
    """Extract an answer to ``question`` from the first five context texts.

    The first five entries of ``context`` are joined with spaces and encoded
    together with the question, truncated to 512 tokens. The answer is the
    token span from the highest-scoring start position to the highest-scoring
    end position predicted by the SQuAD-finetuned BERT model.

    The tokenizer and model are loaded via ``from_pretrained`` on every call.

    Args:
        question: The question to answer.
        context: Texts to search for the answer; only the first five are used.

    Returns:
        The answer text, or ``""`` when there is no context to read or the
        predicted end position lies before the predicted start position.
    """
    passage = ' '.join(context[:5])
    # Without any context the model could only echo the question back, so
    # return early and skip loading the model altogether.
    if not passage.strip():
        return ''

    model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForQuestionAnswering.from_pretrained(model_name)
    inputs = tokenizer(question, passage, return_tensors='pt', max_length=512, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits)) + 1  # exclusive bound
    # Start and end are picked independently and may not form a valid span.
    if answer_end <= answer_start:
        return ''

    token_ids = inputs['input_ids'][0][answer_start:answer_end].tolist()
    # Special tokens such as [CLS] / [SEP] are markers, not answer text.
    tokens = tokenizer.convert_ids_to_tokens(token_ids, skip_special_tokens=True)
    return tokenizer.convert_tokens_to_string(tokens)

# Load the articles and clean every article body
file_path = 'news.article.json'
df = pd.read_json(file_path)
context = load_and_clean_data(df)

# Keep only the articles that mention "Al-Shifa Hospital"
refined_context = refine_context(df, "Al-Shifa Hospital")

# Second keyword pass over the cleaned texts returned by refine_context
further_refined_context = further_refine_context(refined_context, "Al-Shifa Hospital")

# Summarize the relevant articles
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")
if further_refined_context:
    # truncation=True asks the pipeline to cut over-long articles down to the
    # model's input limit instead of feeding them in whole.
    summarized_context = summarizer(
        further_refined_context,
        max_length=512,
        min_length=30,
        do_sample=False,
        truncation=True,
    )
else:
    # Nothing matched the keyword, so there is nothing to summarize.
    summarized_context = []

# Extract the summarized text
summarized_texts = [summary['summary_text'] for summary in summarized_context]

# The question to answer from the summaries
question = "What happened at the Al-Shifa Hospital?"

# Answer from the summarized context; without summaries there is no text to
# extract an answer from, so the answer stays empty.
answer = answer_question_bert(question, summarized_texts) if summarized_texts else ""

# Print the answer
print(answer)