# Web Scraping for Question-Answer Extraction

This notebook demonstrates how to scrape web pages for question-answer pairs in Python while excluding page sections identified by specific CSS classes. It uses the `requests` and `BeautifulSoup` libraries for HTTP requests and HTML parsing.

The code performs the following steps:

1. **Fetch URLs from Sitemap**:
   - Requests the sitemap URL to get a list of URLs to scrape.

2. **Extract Question-Answer Pairs**:
   - Defines a function to extract question-answer pairs from the HTML content.
   - Removes specific sections and elements that should be excluded based on their class names.
   - Collects questions and their corresponding answers from the page.

3. **Process Each URL**:
   - Iterates through the list of URLs.
   - Requests each URL and processes the HTML content.
   - Saves the extracted question-answer pairs to text files.


## Question-answer extraction
### Extracts the question-answer pairs and the URL from every web page that contains them, and writes each page's results to its own .txt file.

In [None]:
import os
from pathlib import Path
import requests
from bs4 import BeautifulSoup
import re
import logging
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import time

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Function to create a session with retry logic
def create_session():
    """
    Builds a requests session that retries transient gateway errors.

    One adapter is mounted for both the 'http://' and 'https://' prefixes. Its
    retry policy allows up to 3 retries with a backoff factor of 0.1 for
    responses with status 502, 503 or 504. A desktop Chrome User-Agent header
    is set on the session.

    Returns:
    - requests.Session: The configured session.
    """
    retry_policy = Retry(total=3, backoff_factor=0.1, status_forcelist=[502, 503, 504])
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http = requests.Session()
    for prefix in ('http://', 'https://'):
        http.mount(prefix, retrying_adapter)
    http.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    return http

# 1. Fetch URLs from Sitemap
sitemap_url = 'https://www.telekom.de/ueber-das-unternehmen/robots/sitemap'
logger.info(f"Requesting sitemap URL: {sitemap_url}")
session = create_session()

try:
    # requests applies no timeout unless one is given; bound the wait so a
    # stalled server cannot hang the whole run.
    response = session.get(sitemap_url, timeout=30)
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    logger.error(f"Error fetching sitemap: {e}")
    # `exit` is an interactive helper installed by the `site` module; raising
    # SystemExit stops the script with the same status in every environment.
    raise SystemExit(1) from e

# Parse XML sitemap
soup = BeautifulSoup(response.content, 'xml')
logger.info("Processing sitemap XML...")

# Extract URLs: strip the whitespace that pretty-printed sitemaps put around
# the <loc> text and drop empty entries, which could never be requested.
urls = [loc.get_text(strip=True) for loc in soup.find_all('loc')]
urls = [url for url in urls if url]
logger.info(f"{len(urls)} URLs found.")

# Function to extract question-answer pairs from the page
def extract_question_answer(soup):
    """
    Extracts question-answer pairs from a parsed HTML page.

    Every element whose class attribute contains one of the excluded class
    strings is removed from the tree first (the passed soup is modified in
    place). Pairs are then collected from two sources, in this order:

    1. h1/h2/h3 headings whose text ends with '?'. The answer is the text of
       the next 'div.outerRichtextDiv' or, when that yields no text, of the
       next 'div' that does not carry an excluded class.
    2. The items of the first 'ul.accordion-list' on the page.

    Parameters:
    - soup (BeautifulSoup): The parsed page; mutated by the exclusion step.

    Returns:
    - list of dict: One {'question': str, 'answer': str} entry per pair.
    """
    question_answer_pairs = []

    excluded_classes = [
        "chf-navigation-bar",
        "direct-access-container",
        "direct-access-content",
        "collection-wrapper collection collection-standard",
        "collection-wrapper collection collection-standard l-outer l-outer--solutionPage"
    ]

    def has_excluded_class(element):
        # An element is excluded when any excluded class string occurs in its
        # space-joined class attribute. This covers elements that carry extra
        # classes next to an excluded combination.
        classes = element.get('class') or []
        if isinstance(classes, str):
            # Some parsers keep the class attribute as one plain string.
            classes = [classes]
        joined = ' '.join(classes)
        return any(cls in joined for cls in excluded_classes)

    def get_text_from_element(element):
        # Paragraph texts come first, followed by bulleted list items.
        lines = [p.get_text(strip=True) for p in element.find_all('p')]
        lines.extend(
            f"• {li.get_text(strip=True)}"
            for ul in element.find_all('ul')
            for li in ul.find_all('li')
        )
        return '\n'.join(lines).strip()

    def extract_accordion_items(accordion_list):
        for item in accordion_list.find_all('li', class_='accordion-item'):
            question = item.find('p', class_='accordion-item__title')
            answer = item.find('div', class_='accordion-item__content')
            if question and answer:
                question_text = question.get_text(strip=True)
                answer_text = get_text_from_element(answer)
                question_answer_pairs.append({'question': question_text, 'answer': answer_text})

    def remove_excluded_elements(soup):
        # Re-scan after every removal instead of walking a pre-built list:
        # decompose() also destroys the element's descendants, and touching
        # those afterwards is not safe.
        element = soup.find(has_excluded_class)
        while element is not None:
            element.decompose()
            element = soup.find(has_excluded_class)

    remove_excluded_elements(soup)

    for question in soup.find_all(['h1', 'h2', 'h3']):
        question_text = question.get_text(strip=True)
        if not question_text.endswith('?'):
            continue

        answer_text = ''
        next_div = question.find_next('div', class_='outerRichtextDiv')
        if next_div:
            answer_text = get_text_from_element(next_div)
        if not answer_text:
            # Fall back to the nearest following div of any kind.
            next_div = question.find_next('div')
            if next_div and not has_excluded_class(next_div):
                answer_text = get_text_from_element(next_div)
        if answer_text:
            question_answer_pairs.append({'question': question_text, 'answer': answer_text})

    accordion_list = soup.find('ul', class_='accordion-list')
    if accordion_list:
        extract_accordion_items(accordion_list)

    return question_answer_pairs

# 3. Process each URL
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)

for idx, url in enumerate(urls, 1):
    logger.info(f"Processing URL {idx}/{len(urls)}: {url}")

    try:
        # Let requests follow the redirect chain itself: it resolves relative
        # Location headers and handles every 3xx code, and the status checked
        # below is that of the final response. The timeout keeps a stalled
        # server from blocking the loop.
        response = session.get(url, timeout=30)

        if response.history:
            logger.info("   Redirect detected, checking URL.")

        if response.status_code != 200:
            logger.warning("   Invalid URL or access problem.")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        qa_pairs = extract_question_answer(soup)
        logger.info(f"   {len(qa_pairs)} question-answer pairs found.")

        if qa_pairs:
            # The file name is the URL with every non-word run replaced by '_'.
            file_name = re.sub(r'\W+', '_', url) + ".txt"
            output_file = output_dir / file_name

            try:
                with open(output_file, "w", encoding="utf-8") as file:
                    file.write(f"Source URL: {url}\n\n")
                    for qa in qa_pairs:
                        file.write(f"Question: {qa['question']}\nAnswer: {qa['answer']}\n\n")

                logger.info(f"   Results saved to '{output_file}'.")
            except IOError as e:
                logger.error(f"   Error writing to file: {e}")
        else:
            logger.info("   No question-answer pairs found, file will not be created.")

    except requests.exceptions.RequestException as e:
        logger.error(f"   Error occurred: {e}")

    finally:
        # Sleep to avoid overwhelming the server; in `finally` so the pause
        # also applies after failed or skipped URLs.
        time.sleep(1)

logger.info("Processing completed! Results for pages with question-answer pairs saved in 'data' directory.")

## Creating navigation.txt
### Reads the .txt files in the input folder one by one, extracts the navigation
### information from each, and adds it to navigation.txt as a list.

In [None]:
import re
import os
import logging

# Send the log records of this step to a file
logging.basicConfig(
    filename='processing.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Fixed locations: the folder holding navigation.txt, the scraped input
# files, and the navigation output file itself
web_data_directory = "web_data"
directory_path = "web_data/scraped_data"
navigation_file = os.path.join(web_data_directory, "navigation.txt")

def format_navigation_text(text):
    """
    Turns whitespace-separated navigation text into a ' > ' separated path.

    Every run of whitespace (newlines included) is treated as one separator,
    so each word becomes its own path segment. The connector words '&', 'und'
    and 'bei' are the exception: when they stand alone between two separators
    they are re-joined with their neighbours by plain spaces. Runs of three or
    more consecutive '>' (as produced when the input already contains ' > ')
    collapse to a single separator.

    Parameters:
    - text (str): The text to be formatted.

    Returns:
    - str: The formatted navigation path without leading, trailing or
      doubled spaces.
    """
    # Collapse all whitespace to single spaces, then make each one a separator
    words = text.split()
    result = ' '.join(words).replace(' ', ' > ')

    # Re-join connector words with their neighbours (order matters)
    connector_rules = (
        (r'>\s*&\s*>', ' & '),
        (r'>\s*und\s*>', ' und '),
        (r'>\s*bei\s*>', ' bei '),
    )
    for connector_pattern, replacement in connector_rules:
        result = re.sub(connector_pattern, replacement, result)

    # Three or more separators in a row become one
    result = re.sub(r'(\s*>\s*){3,}', ' > ', result)

    # Exactly one space on each side of every '>'
    result = re.sub(r'\s*>\s*', ' > ', result).strip()

    # Squeeze the double spaces left behind by the connector rules
    return re.sub(r' {2,}', ' ', result)

def extract_navigation_data(directory_path, navigation_file):
    """
    Extracts navigation data from .txt files in the specified directory and writes it to an output file.

    A file counts as processed when it contains a section that starts with
    "...Telekom" and ends at the first two consecutive whitespace characters,
    and when at least one paragraph terminated by a blank line follows that
    section. Each processed file produces one record in navigation_file: the
    file name prefixed with "https_", the formatted navigation line, a blank
    line and a line of 40 '=' characters.

    Parameters:
    - directory_path (str): Path to the directory containing .txt files.
    - navigation_file (str): Path to the output file for navigation data.
      Its parent directory is created if it does not exist.

    Outputs:
    - Writes navigation data to navigation_file.
    - Prints and logs a summary of processed and unprocessed files.
    - Logs an error and returns without writing when directory_path is not
      an existing directory.
    """
    # Section starting with "...Telekom" and ending with two whitespace characters
    navigation_pattern = re.compile(r"\.\.\.Telekom.*?\s{2}", re.DOTALL)
    # A paragraph: consecutive non-empty lines followed by at least one blank line
    paragraph_pattern = re.compile(r'([^\n]+(?:\n[^\n]+)*)(?:\n{2,})')

    # Keep track of processed and unprocessed files
    processed_files_data = []
    unprocessed_files = []

    def process_file(filename):
        """
        Processes one .txt file and files it under processed or unprocessed.

        Parameters:
        - filename (str): The name of the file to be processed.
        """
        file_path = os.path.join(directory_path, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as source:
                content = source.read()
        except (OSError, UnicodeDecodeError) as e:
            logging.error(f"Error processing file {filename}: {e}")
            unprocessed_files.append(filename)
            return

        match = navigation_pattern.search(content)
        if match is None:
            unprocessed_files.append(filename)
            return

        # The navigation is only accepted when real content follows it
        post_pattern_text = content[match.end():]
        if paragraph_pattern.search(post_pattern_text) is None:
            unprocessed_files.append(filename)
            return

        # Drop the surrounding dots and turn the inner "..." runs into line
        # breaks, which the formatter then converts into separators
        cleaned_text = re.sub(r'\.\.\.+', '\n', match.group(0).strip(".").strip())
        processed_files_data.append((filename, format_navigation_text(cleaned_text)))

    # Ensure the input directory exists
    if not os.path.isdir(directory_path):
        logging.error(f"Directory {directory_path} does not exist.")
        return

    # Create the directory that will hold navigation_file. It is derived from
    # the parameter so the function does not depend on module-level globals.
    output_directory = os.path.dirname(navigation_file)
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    # Process each .txt file in the directory, in name order
    txt_files = sorted(name for name in os.listdir(directory_path) if name.endswith(".txt"))
    for filename in txt_files:
        process_file(filename)

    # Write navigation data to the navigation_file (records are already in name order)
    with open(navigation_file, "w", encoding="utf-8") as nav_file:
        for filename, navigation in processed_files_data:
            # Prepend "https_" to the filename
            nav_file.write(f"https_{filename}\n")
            nav_file.write(f"{navigation}\n")
            nav_file.write("\n" + "="*40 + "\n")

    # Summary. The total comes from the directory listing, so the "missing"
    # figure is a real consistency check rather than a value subtracted from itself.
    total_files = len(txt_files)
    num_processed_files = len(processed_files_data)
    num_unprocessed_files = len(unprocessed_files)
    missing_files = total_files - (num_processed_files + num_unprocessed_files)

    print("Process completed.")
    print(f"Total number of .txt files in the folder: {total_files}")
    print(f"Number of .txt files processed and written to {navigation_file}: {num_processed_files}")
    print(f"Unprocessed files: {num_unprocessed_files}")
    print(f"Number of missing or unaccounted files: {missing_files}")

    # Log the results
    logging.info(f"Total files: {total_files}")
    logging.info(f"Processed files: {num_processed_files}")
    logging.info(f"Unprocessed files: {num_unprocessed_files}")
    logging.info(f"Missing files: {missing_files}")

# Build navigation.txt from the scraped files configured above
extract_navigation_data(directory_path=directory_path, navigation_file=navigation_file)

## Adding the navigation paths
### For raw data that has no navigation information, inserts the navigation path
### as the second line of each .txt file. If the path is already present, nothing is added.

In [None]:
import os

# Input locations for this step
navigation_file = "web_data/navigation.txt"  # Filename -> navigation records
data_directory = "data"  # .txt files that receive the navigation line

def load_navigation_data(navigation_file):
    """
    Loads navigation data from the navigation_file into a dictionary.

    The file holds one record per entry: a filename line followed by its
    navigation line. Blank lines and separator lines made only of '='
    characters between records are ignored, so they never end up as
    dictionary keys. A filename on the last line without a navigation line
    maps to an empty string.

    Parameters:
    - navigation_file (str): Path to the file containing filenames and navigation data.

    Returns:
    - dict: A dictionary where keys are filenames and values are navigation data.
      Empty when the file cannot be read (the error is printed).
    """
    navigation_data = {}

    try:
        with open(navigation_file, "r", encoding="utf-8") as file:
            lines = [line.strip() for line in file]
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading navigation file: {e}")
        return navigation_data

    pending_filename = None
    for line in lines:
        if pending_filename is not None:
            # The line right after a filename is always its navigation data,
            # even when it is empty.
            navigation_data[pending_filename] = line
            pending_filename = None
        elif line.strip("="):
            # Not blank and not a '====' separator: this starts a new record.
            pending_filename = line

    if pending_filename is not None:
        navigation_data[pending_filename] = ""

    return navigation_data

def append_navigation_to_files(data_directory, navigation_data):
    """
    Inserts navigation data into .txt files in the data_directory based on the provided navigation_data mapping.

    A file is matched to its navigation entry by name, with or without the
    'https_' prefix. The navigation line is inserted directly after a first
    line containing "Source URL:", otherwise at the top of the file. A file is
    left untouched when its second line starts with "Telekom" or when the
    line at the insert position already equals the navigation, so running the
    step again does not add the line twice.

    Parameters:
    - data_directory (str): Path to the directory containing .txt files to be updated.
    - navigation_data (dict): Dictionary where keys are filenames and values are navigation data.
    """
    try:
        filenames = sorted(os.listdir(data_directory))
    except OSError as e:
        print(f"Error accessing directory or files: {e}")
        return

    for filename in filenames:
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(data_directory, filename)

        # Try the name with the 'https_' prefix first, then without it
        if filename.startswith("https_"):
            candidate_keys = (filename, filename[6:])
        else:
            candidate_keys = ("https_" + filename, filename)
        navigation_key = next((key for key in candidate_keys if key in navigation_data), None)

        if navigation_key is None:
            print(f"No navigation data found for {filename}")
            continue

        navigation = navigation_data[navigation_key]
        print(f"Appending navigation to {file_path}...")  # Debugging output
        try:
            with open(file_path, "r+", encoding="utf-8") as file:
                content = file.readlines()

                # Navigation goes after the "Source URL:" line when there is
                # one; an empty file has no first line to inspect.
                insert_at = 1 if content and "Source URL:" in content[0] else 0

                has_telekom_line = len(content) > 1 and content[1].startswith("Telekom")
                has_same_line = bool(
                    navigation
                    and insert_at < len(content)
                    and content[insert_at].strip() == navigation
                )
                if has_telekom_line or has_same_line:
                    print(f"Navigation data already present in {file_path}")
                    continue  # Skip if navigation data is already there

                # Keep the navigation on its own line when the first line
                # lacks a trailing newline.
                if insert_at == 1 and not content[0].endswith("\n"):
                    content[0] += "\n"
                content.insert(insert_at, navigation + "\n")

                # Rewrite the file in place
                file.seek(0)
                file.writelines(content)
                file.truncate()
                print(f"Successfully appended navigation to {file_path}")
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error appending to file {file_path}: {e}")

# Read the filename -> navigation mapping written by the previous step
navigation_data = load_navigation_data(navigation_file=navigation_file)

# Insert each file's navigation line into the files of the data directory
append_navigation_to_files(data_directory=data_directory, navigation_data=navigation_data)

print("Script completed.")

## Splitting into categories
### Important: these paths must be kept up to date: data_directory = "/Users/taha/Desktop/rag/rag_data/website/data", output_directory = "/Users/taha/Desktop/rag/data"
### Groups the .txt files by their category.

In [2]:
import os
import shutil

# Absolute locations of the annotated input files and of the categorized output.
# (A relative layout would be data_directory = "data" and
# output_directory = "organized_data".)
data_directory = "/Users/taha/Desktop/rag/rag_data/website/data"
output_directory = "/Users/taha/Desktop/rag/data"

# Category folders; the order is significant because the first name found in
# a file's navigation decides where the file goes
folder_names = [
    "Geräte & Zubehör", "Hilfe bei Störungen",
    "Internet & Telefonie", "MagentaEINS",
    "Mobilfunk", "TV",
    "Vertrag & Rechnung", "Apps & Dienste",
]

def setup_directories(base_directory, folders):
    """
    Makes sure the base directory, one subdirectory per folder name and an
    additional 'Others' subdirectory exist. Existing directories are left as
    they are.

    Parameters:
    - base_directory (str): The base directory where folders will be created.
    - folders (list of str): List of folder names to create.
    """
    os.makedirs(base_directory, exist_ok=True)
    # 'Others' is always created, after the requested folders
    for subdirectory in [*folders, "Others"]:
        os.makedirs(os.path.join(base_directory, subdirectory), exist_ok=True)

def extract_navigation_part(navigation_text):
    """
    Extracts the navigation part of the text up to and including the third '>' character.

    The first three segments are stripped and re-joined with ' > ', followed
    by a closing ' >'. Text with fewer than three '>' characters is returned
    stripped but otherwise unchanged, so no '>' is added that the input does
    not contain.

    Parameters:
    - navigation_text (str): The navigation text to extract part from.

    Returns:
    - str: The extracted navigation part.
    """
    # Three '>' characters yield four parts; nothing after the third '>' is
    # needed, so stop splitting there.
    parts = navigation_text.split('>', 3)

    if len(parts) < 4:
        return navigation_text.strip()

    return ' > '.join(part.strip() for part in parts[:3]) + ' >'

def organize_files(directory_path, output_directory, valid_folders):
    """
    Organizes .txt files into directories based on the navigation text up to and including the third '>' character.

    The second line of each file is taken as its navigation text. The file is
    copied into the first folder of valid_folders whose name occurs in the
    extracted navigation part, or into 'Others' when none does. Files with
    fewer than two lines are reported and skipped.

    Parameters:
    - directory_path (str): Path to the directory containing .txt files.
    - output_directory (str): Path to the directory where organized files will be saved.
    - valid_folders (list of str): List of valid folder names.

    Raises:
    - OSError: When directory_path cannot be listed (for example
      FileNotFoundError when it does not exist). Nothing is created in
      output_directory in that case.
    """
    # List the input before touching the output: a wrong input path then
    # fails without leaving empty category folders behind.
    filenames = sorted(os.listdir(directory_path))

    setup_directories(output_directory, valid_folders)

    for filename in filenames:
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(directory_path, filename)

        try:
            # Only the second line is needed, so do not read the whole file
            with open(file_path, "r", encoding="utf-8") as file:
                file.readline()
                second_line = file.readline()

            # readline() returns '' only at end of file, i.e. there is no second line
            if not second_line:
                print(f"File {filename} does not have enough lines to extract navigation data.")
                continue

            # Extract the part of the navigation text up to and including the third '>'
            navigation_part = extract_navigation_part(second_line.strip())

            # First matching folder wins; everything else goes to 'Others'
            target_folder = next(
                (folder for folder in valid_folders if folder in navigation_part),
                "Others",
            )

            # Copy the file to the appropriate folder
            target_directory = os.path.join(output_directory, target_folder)
            shutil.copy(file_path, os.path.join(target_directory, filename))
            print(f"Copied {filename} to {target_directory}")

        except (OSError, UnicodeDecodeError) as e:
            print(f"Error processing file {filename}: {e}")

# Sort the annotated files into their category folders
organize_files(
    directory_path=data_directory,
    output_directory=output_directory,
    valid_folders=folder_names,
)

FileNotFoundError: [Errno 2] No such file or directory: 'rag/rag_data/website/data'