In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup

# Start the Safari WebDriver
driver = webdriver.Safari()

# Message URLs in first-seen order, plus a set for constant-time duplicate checks
message_urls = []
seen_urls = set()

# Maximum number of message URLs to collect
max_messages = 1000

# try/finally guarantees the browser is closed even when scraping fails midway
try:
    # Target URL
    url = "https://telekomhilft.telekom.de/?ms-sort=most_viewed&ms-messagestatusfilter=solved"
    driver.get(url)

    # Wait for the page to load
    time.sleep(5)

    # Keep loading more posts until enough URLs are collected or the
    # "Weitere Beiträge laden" pager link is no longer available
    while len(message_urls) < max_messages:
        # Scroll to the bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Short pause after scrolling

        # Parse the current page source
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # Collect the message links
        for div_tag in soup.find_all('div', class_='thc-message-item contenttype-forum'):
            a_tag = div_tag.find('a', class_='telekom-forum-message-link')
            link = a_tag.get('href') if a_tag is not None else None
            if not link:
                # .get('href') returns None for an anchor without href; skip it
                # instead of crashing on link.startswith
                continue

            # Site-relative links are turned into absolute URLs
            if link.startswith('/'):
                full_url = "https://telekomhilft.telekom.de" + link
            else:
                full_url = link

            # Skip URLs that were already collected
            if full_url not in seen_urls:
                seen_urls.add(full_url)
                message_urls.append(full_url)

            # Stop scanning this page once max_messages URLs are collected
            if len(message_urls) >= max_messages:
                break

        # Leave the outer loop as well once max_messages URLs are collected
        if len(message_urls) >= max_messages:
            break

        # Look for the "Weitere Beiträge laden" pager link
        try:
            load_more_button = driver.find_element(By.XPATH, "//a[contains(@class, 'telekom-forum-pager-next')]")

            if load_more_button.is_displayed():
                # Click through JavaScript, then wait for the new posts to load
                driver.execute_script("arguments[0].click();", load_more_button)
                time.sleep(5)
            else:
                break  # Pager link is hidden: nothing more to load

        except Exception as e:
            print("Buton bulunamadı veya başka bir hata meydana geldi. Hata mesajı:", str(e))
            break
finally:
    # Close the Selenium browser
    driver.quit()

# Persist the collected URLs, one per line
with open('message_urls.txt', 'w') as url_out:
    for message_url in message_urls:
        print(message_url, file=url_out)

print(f"{len(message_urls)} mesaj URL'si bulundu ve 'message_urls.txt' dosyasına yazıldı.")

933 mesaj URL'si bulundu ve 'message_urls.txt' dosyasına yazıldı.


In [2]:
import requests
from bs4 import BeautifulSoup
import os
import re

# File the message URLs are read from
url_file = 'message_urls.txt'
# Directory the per-URL text files are written to
output_dir = 'data'

# Create the output directory if needed; exist_ok avoids the check-then-create
# race and still raises if the path exists but is not a directory
os.makedirs(output_dir, exist_ok=True)

def create_filename(url):
    """Build a file-system-safe file name (without extension) from a URL.

    The 'http://' or 'https://' scheme and everything from the first '?' on
    are dropped. Path separators and other characters that are not allowed in
    file names on common file systems (\\ / : * ? " < > | and control
    characters) are each replaced by '_'.

    Args:
        url: The URL string to convert.

    Returns:
        The sanitized name as a string; an empty string for an empty URL.
    """
    # Remove 'http://' or 'https://', then cut off the query string
    stem = re.sub(r'https?://', '', url).split('?')[0]
    # Replacing only '/' would leave e.g. the ':' of a port number in the name
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', stem)

# Read the URLs, one per line; blank lines are skipped so they are not
# later requested as empty URLs
with open(url_file, 'r') as url_source:
    urls = [line.strip() for line in url_source if line.strip()]

# Phrases that are stripped from scraped text before it is written out
exclude_texts = [
    "Gelöst!",
    "Gehe zu Lösung.",
    "Lösung in ursprünglichem Beitrag anzeigen",
]


def filter_excluded_texts(text):
    """Return `text` with every phrase in `exclude_texts` removed.

    The phrases are removed one after another in list order, and the result
    has leading and trailing whitespace stripped.
    """
    cleaned = text
    for phrase in exclude_texts:
        # Splitting on the phrase and re-joining drops every occurrence of it
        cleaned = "".join(cleaned.split(phrase))
    return cleaned.strip()

# Seconds to wait for the server; without a timeout requests.get can block
# indefinitely on a stalled connection
REQUEST_TIMEOUT = 30

# Process each URL; a failure on one URL is reported and the loop moves on
for url in urls:
    try:
        # Get content from the URL
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Process HTML content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the topic title; fall back to a placeholder when either the
        # topic container or its heading is missing
        topic_title = 'Topic not found'
        topic_div = soup.find('div', class_='topic-message')
        if topic_div is not None:
            title_tag = topic_div.find('h1', class_='topic-subject')
            if title_tag is not None:
                topic_title = title_tag.get_text(strip=True)

        # All message bodies on the page, looked up once and reused below
        body_divs = soup.find_all('div', class_='lia-message-body-content')

        # The topic content is the first message body on the page
        topic_content = body_divs[0].get_text(strip=True) if body_divs else ''

        # Accepted solutions are the message bodies containing the checkmark div
        solutions = []
        for body_div in body_divs:
            if body_div.find('div', class_='lia-message-body-accepted-solution-checkmark'):
                filtered_solution = filter_excluded_texts(body_div.get_text(strip=True))
                if filtered_solution:  # Only add non-empty solutions
                    solutions.append(filtered_solution)

        # Create filename based on URL
        filename = create_filename(url)
        output_file_path = os.path.join(output_dir, f"{filename}.txt")

        # Write results to a file
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(f"Source URL: {url}\n")  # Source URL goes at the top
            output_file.write(f"Subject: {topic_title}\n")
            output_file.write(f"Content: {filter_excluded_texts(topic_content)}\n\n")
            output_file.write("Accepted Solutions:\n")
            for sol in solutions:
                output_file.write(f"- {sol}\n")

        print(f"{output_file_path} has been written.")

    except Exception as e:
        print(f"An error occurred ({url}): {e}")

data/telekomhilft.telekom.de_t5_Mobilfunk_Wann-kommt-5G-SA-gibt-es-da-neue-Infos_td-p_6945413_jump-to_first-unread-message.txt has been written.
data/telekomhilft.telekom.de_t5_Sonstiges_Intervall-Stoerung_td-p_6956428_jump-to_first-unread-message.txt has been written.
data/telekomhilft.telekom.de_t5_TV_Seit-Gestern-Lautstaerke-nur-von-RTL-Sendern-extrem-leise_td-p_6949692_jump-to_first-unread-message.txt has been written.
data/telekomhilft.telekom.de_t5_TV_Sky-Highlights-nur-in-englischer-Sprache_td-p_6959933_jump-to_first-unread-message.txt has been written.
data/telekomhilft.telekom.de_t5_Mobilfunk_iPhone-16-Pro-Max-Versand_td-p_6947273_jump-to_first-unread-message.txt has been written.
data/telekomhilft.telekom.de_t5_Mobilfunk_Lieferstatus-iPhone-16-Pro_td-p_6962729_jump-to_first-unread-message.txt has been written.
data/telekomhilft.telekom.de_t5_Festnetz-Internet_Kulanzhilfe-von-Telekom-Werbern_td-p_6941739_jump-to_first-unread-message.txt has been written.
data/telekomhilft.tele