# Web Scraping for Question-Answer Extraction

This notebook demonstrates how to use Python to scrape web pages for question-answer pairs while skipping page sections that are identified by specific CSS class names. It uses the `requests` library for HTTP requests and `BeautifulSoup` for parsing the XML sitemap and the HTML pages.

The code performs the following steps:

1. **Fetch URLs from Sitemap**:
   - Requests the sitemap URL to get a list of URLs to scrape.

2. **Extract Question-Answer Pairs**:
   - Defines a function to extract question-answer pairs from the HTML content.
   - Removes specific sections and elements that should be excluded based on their class names.
   - Collects questions and their corresponding answers from the page.

3. **Process Each URL**:
   - Iterates through the list of URLs.
   - Requests each URL and processes the HTML content.
   - Saves the extracted question-answer pairs to text files in the `data` directory (one file per page, written only when at least one pair was found).


In [None]:
import os
from pathlib import Path
import requests
from bs4 import BeautifulSoup
import re

# 1. Fetch URLs from Sitemap
sitemap_url = 'https://www.telekom.de/ueber-das-unternehmen/robots/sitemap'
# requests has no default timeout; without one an unresponsive server
# would block this cell forever.
SITEMAP_TIMEOUT_SECONDS = 30

print(f"Requesting sitemap URL: {sitemap_url}")
try:
    response = requests.get(sitemap_url, timeout=SITEMAP_TIMEOUT_SECONDS)
    # Treat 4xx/5xx as a failure instead of parsing an error page as a sitemap.
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    # Leave an empty URL list so the rest of the notebook still runs.
    print(f"Could not load sitemap: {e}")
    urls = []
else:
    # Parse XML sitemap (the 'xml' parser requires lxml to be installed)
    soup = BeautifulSoup(response.content, 'xml')
    print("Processing sitemap XML...")

    # Extract URLs; <loc> values are often padded with whitespace or
    # newlines, which would otherwise end up in the request URL.
    urls = [loc.get_text(strip=True) for loc in soup.find_all('loc')]
    urls = [found_url for found_url in urls if found_url]
print(f"{len(urls)} URLs found.")

# 2. Extract Question-Answer Pairs
def extract_question_answer(soup):
    """Collect question-answer pairs from a parsed HTML page.

    A heading (``h1``, ``h2`` or ``h3``) whose text ends with ``?`` is
    treated as a question. Its answer is read from the next ``div`` with
    class ``outerRichtextDiv``; if that yields no text, the next ``div`` of
    any kind is used instead, unless it carries an excluded class.

    Note: ``soup`` is modified in place. Elements whose ``class`` attribute
    contains one of the excluded class strings are removed from the tree
    before any question is looked up.

    Args:
        soup: Parsed document (a ``BeautifulSoup`` object or tag).

    Returns:
        A list of ``{'question': str, 'answer': str}`` dicts, one for each
        question heading for which a non-empty answer was found.
    """
    # Class strings marking navigation/teaser sections that must not be
    # mistaken for answers. Matched as substrings of the class attribute.
    excluded_classes = (
        "chf-navigation-bar",
        "direct-access-container",
        "direct-access-content",
        "collection-wrapper collection collection-standard",
        "collection-wrapper collection collection-standard l-outer l-outer--solutionPage",
    )

    def has_excluded_class(element):
        """Return True if the element's class attribute contains an excluded string."""
        class_attr = element.get('class') or ''
        # html.parser yields a list of classes; XML parsers yield one string.
        if not isinstance(class_attr, str):
            class_attr = ' '.join(class_attr)
        return any(name in class_attr for name in excluded_classes)

    def remove_excluded_elements(tree):
        """Remove every element with an excluded class from the tree."""
        # Search again after each removal: decompose() also destroys the
        # element's descendants, so entries of a pre-computed result list
        # could refer to tags that no longer exist.
        element = tree.find(has_excluded_class)
        while element is not None:
            element.decompose()
            element = tree.find(has_excluded_class)

    def get_text_from_element(element):
        """Return paragraph texts followed by bulleted list items, one per line."""
        lines = [p.get_text(strip=True) for p in element.find_all('p')]
        # An <li> inside a nested <ul> is reachable through the outer and
        # the inner list; remember what was emitted to avoid duplicates.
        seen_items = set()
        for ul in element.find_all('ul'):
            for li in ul.find_all('li'):
                if id(li) in seen_items:
                    continue
                seen_items.add(id(li))
                lines.append(f"• {li.get_text(strip=True)}")
        return '\n'.join(lines).strip()

    remove_excluded_elements(soup)

    question_answer_pairs = []
    for heading in soup.find_all(['h1', 'h2', 'h3']):
        question_text = heading.get_text(strip=True)
        if not question_text.endswith('?'):
            continue

        answer_text = ''
        # Preferred source: the next rich-text container after the heading.
        rich_text_div = heading.find_next('div', class_='outerRichtextDiv')
        if rich_text_div is not None:
            answer_text = get_text_from_element(rich_text_div)

        # Fallback: whatever div follows the heading.
        if not answer_text:
            next_div = heading.find_next('div')
            if next_div is not None and not has_excluded_class(next_div):
                answer_text = get_text_from_element(next_div)

        if answer_text:
            question_answer_pairs.append({'question': question_text, 'answer': answer_text})

    return question_answer_pairs

# 3. Process each URL
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)  # Create directory

# requests has no default timeout; without one a single unresponsive page
# would stall the whole run.
PAGE_TIMEOUT_SECONDS = 30

for idx, url in enumerate(urls, 1):
    print(f"Processing URL {idx}/{len(urls)}: {url}")

    try:
        # Let requests follow redirects itself: it handles every 3xx status
        # and relative Location headers, and the page is downloaded once
        # instead of being probed first and fetched again.
        response = requests.get(url, timeout=PAGE_TIMEOUT_SECONDS)
    except requests.exceptions.RequestException as e:
        print(f"   Error occurred: {e}")
        continue

    # response.history holds the intermediate redirect responses, if any.
    if response.history:
        print("   Redirect detected, checking URL.")

    # Check the final response so error pages are never parsed for Q&A pairs.
    if response.status_code != 200:
        print("   Invalid URL or access problem.")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')
    qa_pairs = extract_question_answer(soup)
    print(f"   {len(qa_pairs)} question-answer pairs found.")

    if not qa_pairs:
        print("   No question-answer pairs found, file will not be created.")
        continue

    # Derive a file-system-safe name from the URL.
    file_name = re.sub(r'\W+', '_', url) + ".txt"
    output_file = output_dir / file_name

    try:
        with open(output_file, "w", encoding="utf-8") as file:
            file.write(f"Source URL: {url}\n\n")
            # A separate counter name keeps the outer loop's idx intact.
            for number, qa in enumerate(qa_pairs, 1):
                file.write(f"{number}. Question: {qa['question']}\n   Answer: {qa['answer']}\n\n")
    except OSError as e:
        # E.g. a file name that is too long for the file system; skip this
        # page rather than aborting the remaining URLs.
        print(f"   Could not write '{output_file}': {e}")
        continue

    print(f"   Results saved to '{output_file}'.")

print("Processing completed! Results for pages with question-answer pairs saved in 'data' directory.")
