In [6]:
from langchain_community.document_loaders import RecursiveUrlLoader
from bs4 import BeautifulSoup

# HTML -> plain-text helper passed to the loader as its `extractor`
def bs4_extractor(html: str) -> str:
    """Return the text content of an HTML string.

    The markup is parsed with BeautifulSoup's built-in "html.parser"
    backend and the result of ``get_text()`` is returned as-is (no
    whitespace normalisation is applied).
    """
    return BeautifulSoup(html, "html.parser").get_text()

# Build the loader for the site's home page.
# NOTE(review): with max_depth=1 the saved output of this cell lists only the
# start URL — raise max_depth if child pages are wanted; confirm intent.
loader = RecursiveUrlLoader(
    url="https://www.genetechsolutions.com/",
    extractor=bs4_extractor,
    max_depth=1,
    timeout=10,
    use_async=False,  # synchronous fetching is simpler to debug
)

# Fetch the page(s)
documents = loader.load()

# Summarise what came back: a count, then one numbered source URL per line
print(f"\n✅ Total documents fetched: {len(documents)}\n")
for position, page in enumerate(documents, 1):
    print(f"{position}. {page.metadata['source']}")


✅ Total documents fetched: 1

1. https://www.genetechsolutions.com/


In [None]:
from langchain_community.document_loaders import RecursiveUrlLoader
from bs4 import BeautifulSoup
import os

# Ensure the "research" output directory exists before the scraped text is
# written into it; exist_ok=True makes re-running this cell harmless when the
# directory is already present.
os.makedirs("research", exist_ok=True)

# Plain-text extractor used by the loader configured below.
# NOTE(review): identical to the bs4_extractor defined in the earlier cell;
# presumably repeated so this cell can be run on its own — confirm.
def bs4_extractor(html: str) -> str:
    """Strip the markup from ``html`` and return the remaining text.

    Parsing uses BeautifulSoup with the "html.parser" backend; the text
    is whatever ``get_text()`` yields, without further clean-up.
    """
    parsed_page = BeautifulSoup(html, "html.parser")
    page_text = parsed_page.get_text()
    return page_text

# Point the loader at the testimonials page
loader = RecursiveUrlLoader(
    url="https://www.genetechsolutions.com/testimonials",
    extractor=bs4_extractor,
    max_depth=1,
    timeout=10,
    use_async=False,  # synchronous fetching is simpler to debug
)

# Fetch the page(s)
documents = loader.load()

# Write every fetched document to research/text.txt as a numbered section:
# a header line, the source URL, then the whitespace-trimmed page text.
# The file is opened in "w" mode, so any previous contents are replaced.
output_path = "research/text.txt"
with open(output_path, "w", encoding="utf-8") as out_file:
    for number, page in enumerate(documents, 1):
        origin = page.metadata.get("source", "Unknown source")
        body = page.page_content.strip()
        out_file.write(
            f"--- Document {number} ---\n"
            f"Source: {origin}\n\n"
            + body
            + "\n\n"
        )

# Report how many documents were fetched and where they were saved
print(f"\n✅ Total documents fetched: {len(documents)}")
print(f"📄 Scraped data saved to: {output_path}")



✅ Total documents fetched: 1
📄 Scraped data saved to: research/text.txt


In [None]:
# Target URL for the Selenium scrape in the next cell: https://www.genetechsolutions.com/jobs

In [5]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import os

# Set up headless Chrome browser
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

try:
    # Visit the jobs page
    driver.get("https://www.genetechsolutions.com/jobs")
    time.sleep(5)  # fixed pause to give the page's JavaScript time to render

    # Capture the DOM as it stands after rendering
    html = driver.page_source
finally:
    # Always shut the browser down — even when navigation or rendering
    # raises — so no headless Chrome / chromedriver process is left behind.
    driver.quit()

# Parse and extract text
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text()

# Save the text to a file.
# NOTE: opened in "w" mode, so an existing research/text.txt is overwritten.
os.makedirs("research", exist_ok=True)
with open("research/text.txt", "w", encoding="utf-8") as f:
    f.write(text.strip())

print("✅ Content saved from rendered page.")


✅ Content saved from rendered page.
