# Web data from __[telekom.de](https://www.telekom.de/hilfe)__


### Description
This function processes `.txt` files in a specified directory to extract relevant information and write it to different output files based on their content. It categorizes the files into three categories:
1. **Files with valid questions and answers**: Written to `web_data.txt`.
2. **Files that could not be processed**: Written to `filtered.txt`.
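3. **Files with questions containing `?` but not ending with `?`**: Written to `inspecting.txt`.

Files whose extracted question contains no `?` at all are not written to any of the three output files; they only show up in the "missing or unaccounted" count of the printed summary.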

### Inputs
- **`directory_path`**: Path to the directory containing `.txt` files.
- **`web_data_file`**: Path to the output file for valid questions and answers.
- **`filtered_file`**: Path to the output file for files that couldn't be processed.
- **`inspecting_file`**: Path to the output file for questions containing `?` but not ending with `?`.

### Outputs
- **`web_data.txt`**: Contains the valid question and answer data.
- **`filtered.txt`**: Contains filenames of files that couldn't be processed.
- **`inspecting.txt`**: Contains questions with `?` but not ending with `?` along with their details.


In [None]:
import re
import os

# Fixed locations: the folder with the scraped .txt files and the folder receiving the reports
directory_path = "/Users/taha/Desktop/scrapeV2/output_folder"
web_data_directory = "web_data"

# The three report files live side by side inside web_data_directory
web_data_file, filtered_file, inspecting_file = (
    os.path.join(web_data_directory, report_name)
    for report_name in ("web_data.txt", "filtered.txt", "inspecting.txt")
)

def extract_and_write_data(directory_path, web_data_file, filtered_file, inspecting_file):
    """
    Extracts and categorizes data from .txt files in the specified directory.

    In every file the navigation section is located first: the text starting at
    "...Telekom" and running up to the first two consecutive whitespace characters.
    The first two paragraphs after that section (a paragraph only counts when it is
    followed by at least one blank line) are taken as the question and the answer.

    Parameters:
    - directory_path (str): Path to the directory containing .txt files.
    - web_data_file (str): Path to the output file for valid questions and answers.
    - filtered_file (str): Path to the output file for files that couldn't be processed.
    - inspecting_file (str): Path to the output file for questions containing `?` but not ending with `?`.

    Outputs:
    - Writes valid questions and answers to web_data_file.
    - Writes filenames of unprocessed files to filtered_file (unreadable file,
      no navigation section, or fewer than two paragraphs after it).
    - Writes questions with `?` but not ending with `?` to inspecting_file.
    - Prints a summary. Files whose question contains no `?` at all are written
      nowhere and only appear in the "missing or unaccounted" count.

    Returns None. If directory_path is not an existing directory, a message is
    printed and nothing is written.
    """

    # Compiled once instead of once per file.
    # Navigation section: "...Telekom" up to the first two whitespace characters.
    navigation_re = re.compile(r"\.\.\.Telekom.*?\s{2}", re.DOTALL)
    # A paragraph is a run of non-empty lines that is followed by a blank line.
    paragraph_re = re.compile(r'([^\n]+(?:\n[^\n]+)*)(?:\n{2,})')

    # Per-category results collected while scanning the directory
    total_files = 0
    processed_files_data = []
    unprocessed_files = []
    inspecting_files = []

    def process_file(filename):
        """
        Reads one .txt file and files it under exactly one category (or none).

        Parameters:
        - filename (str): The name of the file to be processed.
        """
        file_path = os.path.join(directory_path, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as source:
                content = source.read()
        except (OSError, UnicodeError):
            # Unreadable or not valid UTF-8: report the file instead of aborting the run
            unprocessed_files.append(filename)
            return

        match = navigation_re.search(content)
        if match is None:
            unprocessed_files.append(filename)
            return

        # Turn the dotted breadcrumb ("...A...B") into one navigation entry per line
        cleaned_text = re.sub(r'\.\.\.+', '\n', match.group(0).strip(".").strip())

        # Only the text after the navigation section is searched for paragraphs
        paragraphs = paragraph_re.findall(content[match.end():])
        if len(paragraphs) < 2:
            unprocessed_files.append(filename)
            return

        question = paragraphs[0].strip()
        answer = paragraphs[1].strip()

        if question.endswith('?'):
            processed_files_data.append((filename, cleaned_text, question, answer))
        elif '?' in question:
            inspecting_files.append(f"File: {filename}\nNavigation:\n{cleaned_text}\n\nQuestion: {question}\n\nAnswer: {answer}\n")
        # A question without any '?' is deliberately left out of every list;
        # the summary reports such files as "missing or unaccounted".

    # Check if the directory exists
    if not os.path.isdir(directory_path):
        print(f"Directory {directory_path} does not exist.")
        return

    # Create the folder of each output file from the paths that were actually
    # passed in, rather than relying on a module-level directory name.
    for output_path in (web_data_file, filtered_file, inspecting_file):
        parent_dir = os.path.dirname(output_path)
        if parent_dir:  # empty when the file goes into the current directory
            os.makedirs(parent_dir, exist_ok=True)

    # Process each .txt file in the directory
    for filename in sorted(os.listdir(directory_path)):
        if filename.endswith(".txt"):
            total_files += 1
            process_file(filename)

    # Write valid question and answer data to web_data_file
    with open(web_data_file, "w", encoding="utf-8") as web_data_txt:
        for filename, navigation, question, answer in sorted(processed_files_data, key=lambda item: item[0]):
            web_data_txt.write(f"File: {filename}\n")
            web_data_txt.write(f"Navigation:\n{navigation}\n\n")
            web_data_txt.write(f"Question: {question}\n\n")
            web_data_txt.write(f"Answer: {answer}\n")
            web_data_txt.write("\n" + "="*40 + "\n")

    # Write filenames of unprocessed files to filtered_file
    with open(filtered_file, "w", encoding="utf-8") as filtered_txt:
        filtered_txt.writelines(f"{filename}\n" for filename in sorted(unprocessed_files))

    # Write questions with `?` but not ending with `?` to inspecting_file
    with open(inspecting_file, "w", encoding="utf-8") as inspecting_txt:
        inspecting_txt.writelines(inspecting_files)

    # Calculate numbers for print statements
    num_unprocessed_files = len(unprocessed_files)
    num_inspecting_files = len(inspecting_files)
    num_processed_files = len(processed_files_data)
    missing_files = total_files - (num_processed_files + num_unprocessed_files + num_inspecting_files)

    # Print summary of the processing
    print("Process completed.")
    print(f"Total number of .txt files in the folder: {total_files}")
    print(f"Number of .txt files processed and written to {web_data_file}: {num_processed_files}")
    print(f"Unprocessed files have been written to {filtered_file}. Number of unprocessed files: {num_unprocessed_files}")
    print(f"Files with questions containing '?' but not ending with '?' have been written to {inspecting_file}. Number of inspecting files: {num_inspecting_files}")
    print(f"Number of missing or unaccounted files: {missing_files}")

# Run the extraction using the module-level input and output paths
extract_and_write_data(
    directory_path=directory_path,
    web_data_file=web_data_file,
    filtered_file=filtered_file,
    inspecting_file=inspecting_file,
)


### HTML üzerinden metinleri arayan bir kod; çalışıyor, incelenecek...

In [None]:
import os
import re
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# 1. Fetch the page URLs from the sitemap
sitemap_url = 'https://www.telekom.de/ueber-das-unternehmen/robots/sitemap'
print(f"Sitemap URL'sine istek gönderiliyor: {sitemap_url}")
# Bound the wait: without a timeout, requests can block indefinitely on a stalled server
response = requests.get(sitemap_url, timeout=30)
# Fail fast on a 4xx/5xx answer instead of parsing an error page as if it were the sitemap
response.raise_for_status()

# Parse the sitemap with BeautifulSoup's "xml" feature (backed by lxml per the bs4 docs)
soup = BeautifulSoup(response.content, 'xml')
print("Sitemap XML içeriği işleniyor...")

# Collect the text of every <loc> tag; surrounding whitespace is not part of the URL
urls = [loc.get_text(strip=True) for loc in soup.find_all('loc')]
print(f"{len(urls)} URL bulundu.")

def extract_question_answer(soup):
    """
    Extract question/answer pairs from a parsed HTML page.

    Elements carrying one of the excluded classes are first removed from the
    tree. Every remaining <h1>/<h2>/<h3> whose text ends with "?" is then
    treated as a question; its answer is the <p> and <ul><li> text of the next
    <div class="outerRichtextDiv">, or, if that yields nothing, of the next
    <div> that has no excluded class.

    Parameters:
    - soup: Parsed page (BeautifulSoup object). It is modified in place:
      excluded elements are removed from it.

    Returns:
    - list[dict]: One {'question': str, 'answer': str} entry per question for
      which a non-empty answer was found, in document order.
    """
    question_answer_pairs = []

    # Classes (or full class strings) whose elements must not be searched
    excluded_classes = [
        "chf-navigation-bar",
        "direct-access-container",
        "direct-access-content",
        "collection-wrapper collection collection-standard",
        "collection-wrapper collection collection-standard l-outer l-outer--solutionPage"
    ]

    def has_excluded_class(tag):
        """Return True if the tag's class attribute contains any excluded class string."""
        classes = tag.get('class') or []
        if isinstance(classes, str):
            # Some parsers keep the class attribute as one string instead of a list
            classes = classes.split()
        joined = ' '.join(classes)
        return any(cls in joined for cls in excluded_classes)

    def get_text_from_element(element):
        """Return the text of all <p> tags, then all <ul><li> items as bullets, one per line."""
        lines = [p.get_text(strip=True) for p in element.find_all('p')]
        lines.extend(
            f"• {li.get_text(strip=True)}"
            for ul in element.find_all('ul')
            for li in ul.find_all('li')
        )
        return '\n'.join(lines).strip()

    def remove_excluded_elements(soup):
        """Remove every element with an excluded class, at any nesting depth."""
        # find_all(True) yields Tag objects, never BeautifulSoup instances, so
        # the former isinstance(element, BeautifulSoup) guard skipped every
        # element and the substring-based exclusion never ran.
        matches = [tag for tag in soup.find_all(True) if has_excluded_class(tag)]
        match_ids = {id(tag) for tag in matches}
        # Decide what to remove before mutating the tree: only the outermost
        # matches are decomposed, so no already-destroyed node is touched.
        outermost = [
            tag for tag in matches
            if not any(id(parent) in match_ids for parent in tag.parents)
        ]
        for tag in outermost:
            tag.decompose()

    remove_excluded_elements(soup)

    # Headings are the only question candidates
    for question in soup.find_all(['h1', 'h2', 'h3']):
        question_text = question.get_text(strip=True)
        if not question_text.endswith('?'):
            continue

        answer_text = ''
        # Preferred source: the next rich-text container after the heading
        next_div = question.find_next('div', class_='outerRichtextDiv')
        if next_div:
            answer_text = get_text_from_element(next_div)
        # Fallback: the very next <div>, unless it belongs to an excluded area
        if not answer_text:
            next_div = question.find_next('div')
            if next_div and not has_excluded_class(next_div):
                answer_text = get_text_from_element(next_div)
        if answer_text:
            question_answer_pairs.append({'question': question_text, 'answer': answer_text})

    return question_answer_pairs

# 3. Visit every URL, extract its question/answer pairs, and create a file
#    only for pages that actually contain such pairs
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)  # create the output folder

# Upper bound in seconds per HTTP request, so one stalled server cannot hang the crawl
REQUEST_TIMEOUT = 30
# Status codes that announce a new location through the Location header
REDIRECT_STATUSES = (301, 302, 303, 307, 308)

for idx, url in enumerate(urls, 1):
    print(f"{idx}/{len(urls)} URL işleniyor: {url}")

    try:
        # The first request does not follow redirects so that a redirect can be reported
        response = requests.get(url, allow_redirects=False, timeout=REQUEST_TIMEOUT)

        if response.status_code in REDIRECT_STATUSES:
            print("   Yönlendirme tespit edildi, URL kontrol ediliyor.")
            final_url = response.headers.get('Location')
            if final_url:
                # Location may be relative, so resolve it against the requested URL
                response = requests.get(urljoin(url, final_url), timeout=REQUEST_TIMEOUT)
        elif response.status_code != 200:
            print("   Hatalı URL veya erişim problemi.")
            continue
        # A 200 response already carries the page body, so it is not downloaded again

        soup = BeautifulSoup(response.content, 'html.parser')
        qa_pairs = extract_question_answer(soup)
        print(f"   {len(qa_pairs)} soru-cevap çifti bulundu.")

        if qa_pairs:
            # Derive a file name from the URL by collapsing non-word characters
            file_name = re.sub(r'\W+', '_', url) + ".txt"
            output_file = output_dir / file_name

            with open(output_file, "w", encoding="utf-8") as file:
                file.write(f"Kaynak URL: {url}\n\n")
                # Separate counter name: the outer idx tracks the URL position
                for qa_idx, qa in enumerate(qa_pairs, 1):
                    file.write(f"{qa_idx}. Soru: {qa['question']}\n   Cevap: {qa['answer']}\n\n")

            print(f"   Sonuçlar '{output_file}' dosyasına kaydedildi.")
        else:
            print("   Soru-cevap çifti bulunamadı, dosya oluşturulmayacak.")

    except requests.exceptions.RequestException as e:
        # Network problems (including timeouts) skip this URL but keep the crawl going
        print(f"   Hata oluştu: {e}")

print("İşlem tamamlandı! Sadece soru-cevap içeren sayfalar için sonuçlar 'data' klasörüne kaydedildi.")
