# Imports

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd

# Initializing

In [2]:
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)  # Adjust if using Firefox or specify path to WebDriver

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache
Error sending stats to Plausible: error sending request for url (https://plausible.io/api/event)


In [4]:
main_url = "https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/xid1688233"
driver.get(main_url)

# Start

In [30]:
def get_all_sidebar_links(driver):
    # After expanding all items, collect the links
    sidebar_links = driver.find_elements(By.XPATH, "//li[@class='has-subItems']")
    print(sidebar_links)
    links = []
    for link in sidebar_links:
        a_tags = link.find_elements(By.TAG_NAME, "a")
        for a_tag in a_tags:
            link = a_tag.get_attribute("href")
            if link and link not in links:
                links.append(link)
    return links

In [31]:
# Get all links
all_links = get_all_sidebar_links(driver)

[<selenium.webdriver.remote.webelement.WebElement (session="d50124e3e004f8c2d36a3d160b241c93", element="f.A428F1A400AC310CD64D94FDC47CA513.d.8BD98866CBE7046D70A3409534D2E8A3.e.5008")>, <selenium.webdriver.remote.webelement.WebElement (session="d50124e3e004f8c2d36a3d160b241c93", element="f.A428F1A400AC310CD64D94FDC47CA513.d.8BD98866CBE7046D70A3409534D2E8A3.e.5009")>, <selenium.webdriver.remote.webelement.WebElement (session="d50124e3e004f8c2d36a3d160b241c93", element="f.A428F1A400AC310CD64D94FDC47CA513.d.8BD98866CBE7046D70A3409534D2E8A3.e.5010")>, <selenium.webdriver.remote.webelement.WebElement (session="d50124e3e004f8c2d36a3d160b241c93", element="f.A428F1A400AC310CD64D94FDC47CA513.d.8BD98866CBE7046D70A3409534D2E8A3.e.5011")>, <selenium.webdriver.remote.webelement.WebElement (session="d50124e3e004f8c2d36a3d160b241c93", element="f.A428F1A400AC310CD64D94FDC47CA513.d.8BD98866CBE7046D70A3409534D2E8A3.e.5584")>, <selenium.webdriver.remote.webelement.WebElement (session="d50124e3e004f8c2d36a

# Saving Links

In [32]:
def save_links_to_file(links, filename="Documentation_links.txt"):
    with open(filename, "w") as file:
        for link in links:
            file.write(link + "\n")
    print(f"Saved {len(links)} links to {filename}")

save_links_to_file(all_links)

Saved 638 links to Documentation_links.txt


# Going through links

In [3]:
# Open the file in read mode
with open('Documentation_links.txt', 'r') as file:
    # Read each line and strip any newline characters
    links = [line.strip() for line in file]

# Print the list of links to verify
print(links)

['https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/intro', 'https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/intro_term', 'https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/opt_avail_temp', 'https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/xid924223', 'https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/id947709', 'https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/id947716', 'https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/id1288936', 'https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/xid377064', 'https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/xid377157', 'https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/xid1926671', 'https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggenera

In [11]:
def extract_data_from_link(driver, link):
    driver.get(link)
    wait = WebDriverWait(driver, 10)
    iframe = wait.until(EC.presence_of_element_located((By.TAG_NAME, 'iframe')))

    # Switch to the iframe
    driver.switch_to.frame(iframe)
    
    # Extract document
    title_div = driver.find_elements(By.XPATH, "//div[contains(@class, 'col-12')]")[0]
    if title_div:
        document_title = title_div.text
    else:
        document_title = ""
    
    body_div = driver.find_elements(By.XPATH, "//div[contains(@class, 'col-12')]")[1]
    if body_div:
        document_body = body_div.text
    else:
        document_body = ""

    # Return the data as a dictionary
    data = {
        'link': link,
        'document_title': document_title,
        'document_body': document_body
    }
    return data

In [12]:
all_data = []

def collect_all_data(driver, links):
    for idx, link in enumerate(links):
        print(f"Processing link {idx+1}/{len(links)}: {link}")
        data = extract_data_from_link(driver, link)
        all_data.append(data)
    return all_data

In [13]:
def save_data_to_csv(data_list, filename="documentation_data.csv"):
    # Prepare data for DataFrame
    rows = []
    for data in data_list:
        # Comments may have multiple entries; join them into a single string
        rows.append({
            'Link': data['link'],
            'Document Title': data['document_title'],
            'Document Body': data['document_body'],
        })
    # Create DataFrame
    df = pd.DataFrame(rows)
    # Save to CSV
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"Saved data to {filename}")

In [14]:
all_data = []

In [15]:
test_data = collect_all_data(driver, links)

Processing link 1/637: https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/intro
Processing link 2/637: https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/intro_term
Processing link 3/637: https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/opt_avail_temp
Processing link 4/637: https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/xid924223
Processing link 5/637: https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/id947709
Processing link 6/637: https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/id947716
Processing link 7/637: https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/id1288936
Processing link 8/637: https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/xid377064
Processing link 9/637: https://docs.sw.siemens.com/en-US/doc/209349590/PL20231101866122454.mfggeneral/xi

In [16]:
print(test_data)



In [17]:
save_data_to_csv(all_data)

Saved data to documentation_data.csv
