In [15]:
def to_sql_set(strings: Set[str]) -> str:
    """
    Converts a set of strings to a SQL-compatible format for use in an IN clause.

    Every value is emitted as a single-quoted SQL string literal. Single quotes
    inside a value are doubled (standard SQL literal escaping) so that a value
    such as O'Brien cannot terminate the literal early. Values are emitted in
    sorted order so the generated query text is the same on every run.

    Prefer bound query parameters when the database client supports them; this
    helper is for cases where the list has to be inlined into the query text.

    Parameters:
        strings (Set[str]): A set of strings to be converted into SQL set notation.

    Returns:
        str: A multi-line, parenthesised, comma-separated list of quoted values,
             suitable for placing directly after IN. An empty input yields a
             pair of parentheses with no values, which most SQL dialects
             reject, so callers should guard against empty sets.

    Example:
        >>> print(to_sql_set({"GETESI20214-9", "GETESI20214-10"}))
        (
          'GETESI20214-10',
          'GETESI20214-9'
        )
    """
    # Double embedded quotes so each value stays inside its own literal.
    quoted_values = ("'" + s.replace("'", "''") + "'" for s in sorted(strings))
    sql_set = ",\n  ".join(quoted_values)
    return f"(\n  {sql_set}\n)"

In [17]:
# Example input: the client identifiers the query should match
string_set = {
    "GETESI20214-9",
    "GETESI20214-10",
    "GETESI20214-11",
    "GETEHBT20245-73",
}

# Render the identifiers as a parenthesised SQL list
sql_set = to_sql_set(string_set)

# Show the resulting Timestream query text
print("SELECT * FROM your_table WHERE client_id IN " + sql_set)

SELECT * FROM your_table WHERE client_id IN (
  
)


# Web extractor


In [1]:
!pip install requests beautifulsoup4



In [44]:
# Root URL of the site to inspect; the link-extraction and scraping cells read it.
base_url = 'https://www.erwinhymergroup.com/de'


## extract all links

In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def extract_subpages(base_url, timeout=10):
    """
    Collect the absolute URLs of every link found on a single page.

    Parameters:
        base_url (str): URL of the page to download and scan for <a href> tags.
        timeout (float | tuple): Seconds to wait for the server before giving
            up, passed straight to requests.get. Without a timeout a stalled
            server would block the notebook cell indefinitely.

    Returns:
        set[str]: Each href on the page resolved against base_url. An empty
        set is returned (and the error printed) when the request fails,
        including on timeout or a non-2xx status.
    """
    try:
        # Send a request to the base URL; raise on HTTP error statuses
        response = requests.get(base_url, timeout=timeout)
        response.raise_for_status()

        # Parse the content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Resolve every href (relative or absolute) against the page URL;
        # the set removes duplicates.
        return {
            urljoin(base_url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
        }

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return set()

#if __name__ == "__main__":
#    base_url = 'https://example.com'  # Replace with your base URL
#    subpages = extract_subpages(base_url)
#    print("Subpages found:")
#    for subpage in subpages:
#        print(subpage)

Subpages found:
https://www.iana.org/domains/example


In [6]:
# base_url is defined in an earlier cell.
subpages = extract_subpages(base_url)
print("\n".join(["Subpages found:", *subpages]))

Subpages found:
https://www.erwinhymergroup.com/de/karriere/stellenangebote/folierer-mitarbeiter-bereich-folierung~j~2969549
https://www.erwinhymergroup.com/de/karriere/stellenangebote/ausbildung-als-holzmechaniker-mwd~j~2951918
https://www.erwinhymergroup.com/de/presse/pressrelease/~p~3019340
https://www.erwinhymergroup.com/de
https://www.erwinhymergroup.com/de/karriere/stellenangebote/schuelerpraktikum-mwd~j~980096
https://www.erwinhymergroup.com/de/karriere/stellenangebote/teamleiter-vorfertigung-moebel-w-m-d~j~3035043
https://www.erwinhymergroup.com/de/karriere/stellenangebote/pflichtpraktikum-fuer-studierende-w-m-d-ab-dem-sommersemester-oder-wintersemester~j~1178209
https://www.erwinhymergroup.com/de/karriere/stellenangebote/mitarbeiter-wareneinlagerung-logistik-m-w-d~j~1612986
https://www.erwinhymergroup.com/de/karriere/stellenangebote/mitarbeiter-kd-werkstatt-m-w-d~j~1444469
https://www.erwinhymergroup.com/de/karriere/berufserfahrene
https://www.erwinhymergroup.com/de/karriere/s

## extract all text content

In [68]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import re

def fetch_page_content(url, timeout=10):
    """
    Download a page and return its raw body.

    Parameters:
        url (str): Address of the page to fetch.
        timeout (float | tuple): Seconds to wait for the server, passed to
            requests.get. Without it a single unresponsive page would stall
            the whole crawl.

    Returns:
        bytes | None: The response body, or None when the request fails
        (connection error, timeout, or non-2xx status). The error is printed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def extract_links(base_url, soup):
    """
    Return the links on a parsed page that live underneath base_url.

    Parameters:
        base_url (str): URL of the page that was parsed; relative hrefs are
            resolved against it and only URLs starting with it are kept.
        soup: Parsed page offering find_all('a', href=True), whose results
            can be indexed with 'href' (e.g. a BeautifulSoup object).

    Returns:
        set[str]: Absolute URLs that start with base_url and do not contain
        "karriere" (case-insensitive), i.e. career pages are skipped.
    """
    links = set()
    for anchor in soup.find_all('a', href=True):
        full_url = urljoin(base_url, anchor['href'])
        # Prefix match instead of substring match: a foreign URL that merely
        # embeds base_url (e.g. in a query string) must not be followed.
        if not full_url.startswith(base_url):
            continue
        # Skip career pages. The site spells the path segment "karriere".
        if "karriere" in full_url.lower():
            continue
        links.add(full_url)
    return links

def scrape_site(
    base_url, 
    visited=None,
    depth=0,
    exclude_url_strings=None
):
    """
    Recursively scrape a page and the pages it links to into one Markdown string.

    Each scraped page contributes a heading made of (depth + 1) '#' characters
    followed by its URL and then the page's plain text.

    Parameters:
        base_url (str): URL of the page to scrape at this recursion level.
        visited (set | None): URLs already seen. Leave as None for the root
            call; the set is shared and updated in place across the recursion.
        depth (int): Recursion depth, used only for the heading level.
        exclude_url_strings (iterable of str | None): A link containing any of
            these substrings is marked as visited but not scraped. None means
            nothing is excluded.

    Returns:
        tuple[set, str]: The set of seen URLs and the accumulated Markdown.
        If the page cannot be fetched, the Markdown part is an empty string.
    """
    if exclude_url_strings is None:
        exclude_url_strings = set()
    if visited is None:
        print(f"{base_url = }\n")
        visited = {base_url}
    print(base_url, '\n==============')

    # Fetch and parse the base page
    content = fetch_page_content(base_url)
    if content is None:
        return visited, ""

    soup = BeautifulSoup(content, 'html.parser')
    text_content = soup.get_text(separator="\n")

    # Format the content with Markdown heading based on depth
    markdown_content = f"{'#' * (depth + 1)} {base_url}\n\n{text_content}\n\n"

    # Extract links and scrape subpages recursively
    for link in extract_links(base_url, soup):
        # A fragment only addresses a position inside a page, so track and
        # fetch the fragment-free URL; otherwise page#a and page#b would each
        # trigger a separate download of the same page.
        page_url = link.split('#', 1)[0]
        if not page_url or page_url in visited:
            continue
        visited.add(page_url)
        if string_contains_one_from_set(link, exclude_url_strings):
            continue
        # The recursive call updates the shared `visited` set in place, so
        # only the text it produced needs to be merged here.
        _, sub_text = scrape_site(
            page_url,
            visited,
            depth + 1,
            exclude_url_strings
        )
        markdown_content += sub_text

    return visited, markdown_content

def scrape_webpage(
    base_url, 
    exclude_url_strings={}
):
    """
    Scrape a site starting at base_url and save the collected text as Markdown.

    Parameters:
        base_url (str): Start URL of the crawl; also used to derive the name
            of the output file.
        exclude_url_strings: Substrings forwarded to scrape_site; links
            containing any of them are not scraped.

    Returns:
        tuple: The visited URLs reported by scrape_site and the collected
        text after runs of newlines have been collapsed. The same text is
        written (UTF-8) to the '.md' file named by generate_file_name.
    """
    pages, raw_markdown = scrape_site(
        base_url, visited=None, exclude_url_strings=exclude_url_strings
    )
    collapsed_text = remove_multiple_newlines(raw_markdown)

    # Persist the scraped text under a file name derived from the start URL
    output_path = generate_file_name(base_url, '.md')
    with open(output_path, "w", encoding="utf-8") as handle:
        handle.write(collapsed_text)

    return pages, collapsed_text

def string_contains_one_from_set(
    string_to_check:str, 
    set_strings:set
):
    """
    Tell whether any of the given strings occurs inside string_to_check.

    Parameters:
        string_to_check (str): Text to search in.
        set_strings (set): Candidate substrings.

    Returns:
        bool: True as soon as one candidate is found as a substring,
        False if none is (including when set_strings is empty).
    """
    return any(candidate in string_to_check for candidate in set_strings)

def generate_file_name(base_url, file_extention = '.txt'):
    """
    Build a file name from a URL.

    Every character of base_url that is not an ASCII letter or digit
    (including the scheme separator, dots and slashes) becomes an underscore;
    file_extention is then appended unchanged.
    """
    sanitized = re.sub(r'[^a-zA-Z0-9]', '_', base_url)
    return "".join((sanitized, file_extention))

def remove_multiple_newlines(text):
    """Collapse every run of consecutive newline characters in text to one newline."""
    return re.sub(r'\n+', '\n', text)

#if __name__ == "__main__":
#    base_url = 'https://example.com'  # Replace with your base URL
#    visited_pages, all_text_content = scrape_site(base_url)
#    print("Visited pages:")
#    for page in visited_pages:
#        print(page)
#    print("\nAll text content:")
#    print(all_text_content)

In [69]:
# Crawl the site rooted at base_url (defined in an earlier cell), skipping
# every link that contains 'karriere', then show the collected text.
visited_pages, all_text_content = scrape_webpage(base_url, exclude_url_strings={'karriere'})
print("\nAll text content:", all_text_content, sep="\n")

base_url = 'https://www.erwinhymergroup.com/de'

https://www.erwinhymergroup.com/de 
https://www.erwinhymergroup.com/de/presse/pressrelease/~p~3019340 
https://www.erwinhymergroup.com/de/unternehmen/erwin-hymer-museum 
https://www.erwinhymergroup.com/de/presse/pressrelease/~p~2200012 
https://www.erwinhymergroup.com/de/unternehmen/podcast 
https://www.erwinhymergroup.com/de/footer/kontakt 
https://www.erwinhymergroup.com/de/verantwortung 
https://www.erwinhymergroup.com/de/verantwortung/oekonomische-verantwortung 
https://www.erwinhymergroup.com/de/verantwortung/oekonomische-verantwortung/compliance/hinweisgeberrichtlinie 
https://www.erwinhymergroup.com/de/verantwortung/oekonomische-verantwortung/compliance 
https://www.erwinhymergroup.com/de/verantwortung/soziale-verantwortung 
https://www.erwinhymergroup.com/de/verantwortung/oekologische-verantwortung 
https://www.erwinhymergroup.com/de/marken/finanzierung 
https://www.erwinhymergroup.com/de/presse/pressrelease/~p~2306275 
https://w