In [4]:
# import aiohttp
# import asyncio
# import pandas as pd
# from bs4 import BeautifulSoup
# from tqdm.asyncio import tqdm
# import nest_asyncio  # Import nest_asyncio
# import time

# # Define the main sitemap URL
# MAIN_SITEMAP_URL = "https://www.utdallas.edu/wp-sitemap.xml"
# CRAWL_DELAY = 30  # UTDallas crawl delay is 30 seconds

# async def fetch(session, url):
#     """Fetch content from a URL asynchronously."""
#     try:
#         async with session.get(url, timeout=10) as response:
#             return await response.text()
#     except Exception as e:
#         print(f"Error fetching {url}: {e}")
#         return None

# async def get_sub_sitemaps():
#     """Get sub-sitemaps from the main sitemap."""
#     async with aiohttp.ClientSession() as session:
#         sitemap_xml = await fetch(session, MAIN_SITEMAP_URL)
#         if not sitemap_xml:
#             return []

#         soup = BeautifulSoup(sitemap_xml, "xml")
#         return [loc.text for loc in soup.find_all("loc")]

# async def get_page_urls(sub_sitemap_url):
#     """Get all page URLs from a sub-sitemap."""
#     async with aiohttp.ClientSession() as session:
#         sitemap_xml = await fetch(session, sub_sitemap_url)
#         if not sitemap_xml:
#             return []

#         soup = BeautifulSoup(sitemap_xml, "xml")
#         return [loc.text for loc in soup.find_all("loc")]

# async def scrape_page(session, url):
#     """Scrape a single page."""
#     html = await fetch(session, url)
#     if not html:
#         return None

#     soup = BeautifulSoup(html, "html.parser")
#     title = soup.title.string if soup.title else "No Title"
#     content = " ".join([p.get_text(strip=True) for p in soup.find_all("p")])

#     return {"URL": url, "Title": title, "Content": content}

# async def scrape_all():
#     """Scrape all pages with controlled concurrency."""
#     sub_sitemaps = await get_sub_sitemaps()

#     # Get all page URLs from sub-sitemaps
#     page_urls = []
#     for sub_sitemap in tqdm(sub_sitemaps, desc="Extracting page URLs"):
#         page_urls.extend(await get_page_urls(sub_sitemap))

#     print(f"Total URLs found: {len(page_urls)}")

#     # Scrape pages in batches with delay
#     scraped_data = []
#     async with aiohttp.ClientSession() as session:
#         for i in range(0, len(page_urls), 20):  # Scrape in batches of 10
#             batch_urls = page_urls[i : i + 20]

#             tasks = [scrape_page(session, url) for url in batch_urls]
#             results = await tqdm.gather(*tasks)

#             scraped_data.extend(filter(None, results))

#             # Save to CSV incrementally
#             df = pd.DataFrame(scraped_data)
#             df.to_csv("utdallas_scraped_data.csv", index=False)

#             print(f"✅ Saved {len(scraped_data)} records to CSV")
#             time.sleep(CRAWL_DELAY)  # Respect 30s delay

#     print("🎉 Scraping complete!")

# nest_asyncio.apply()  # Apply nest_asyncio patch
# # Run the scraping process
# asyncio.run(scrape_all())


In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Sitemap URL
sitemap_url = "https://jindal.utdallas.edu/sitemap/"

# Step 1: Fetch the Sitemap Page
response = requests.get(sitemap_url, headers={"User-Agent": "Mozilla/5.0"})
if response.status_code != 200:
    print("Failed to fetch the sitemap")
    exit()

soup = BeautifulSoup(response.text, "html.parser")

# Step 2: Extract URLs from <a> tags
urls = [a["href"] for a in soup.find_all("a", href=True) if a["href"].startswith("https")]

# Step 3: Visit Each URL and Extract Structured Data
data = []
for idx, url in enumerate(urls):  # Limit to first 10 URLs for testing
    try:
        print(f"Processing [{idx+1}/{len(urls)}]: {url}")

        # Fetch page content
        page_response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=5)
        if page_response.status_code != 200:
            print(f"Skipping {url} (Failed to fetch)")
            continue

        page_soup = BeautifulSoup(page_response.text, "html.parser")

        # Extract Page Title (Main topic)
        main_title = page_soup.title.string.strip() if page_soup.title else "No Title Found"

        # Extract Sections (Headings and Content)
        for heading in page_soup.find_all(["h1", "h2", "h3"]):  # Extract main & subheadings
            section_title = heading.get_text(strip=True)
            content = []

            # Get all text after this heading until the next heading
            for sibling in heading.find_next_siblings():
                if sibling.name and sibling.name.startswith("h"):  # Stop at next heading
                    break
                if sibling.name in ["p", "div", "li"]:
                    text = sibling.get_text(strip=True)
                    if text and len(text) > 50:  # Avoid short/noisy text
                        content.append(text)

            if content:
                data.append({"URL": url, "MainTitle": main_title, "SectionTitle": section_title, "Content": " ".join(content)})

        time.sleep(1)  # Avoid overwhelming the server

    except Exception as e:
        print(f"Error processing {url}: {e}")

# Step 4: Save Data to CSV
df = pd.DataFrame(data)
df.to_csv("jindal_sitemap_lex_all.csv", index=False, encoding="utf-8")

print("✅ Data extraction complete! Saved to 'jindal_sitemap_lex.csv'")


Processing [1/1120]: https://utdallas.edu
Processing [2/1120]: https://jindal.utdallas.edu/staff-list/
Processing [3/1120]: https://galaxy.utdallas.edu/
Processing [4/1120]: https://www.utdallas.edu/directory
Processing [5/1120]: https://elearning.utdallas.edu/
Processing [6/1120]: https://map.concept3d.com/?id=1772#!m/435245?ce/42138?bm/
Processing [7/1120]: https://www.utdallas.edu/academics/calendar/
Processing [8/1120]: https://coursebook.utdallas.edu/
Processing [9/1120]: https://jindal.utdallas.edu/admission-requirements/
Processing [10/1120]: https://jindal.utdallas.edu/request-information/
Processing [11/1120]: https://jindal.utdallas.edu/about-the-jindal-school-of-management/
Processing [12/1120]: https://jindal.utdallas.edu/about-the-jindal-school-of-management/
Processing [13/1120]: https://jindal.utdallas.edu/about-the-jindal-school-of-management/community-engagement/
Processing [14/1120]: https://jindal.utdallas.edu/conferences/
Processing [15/1120]: https://jindal.utdalla

In [None]:
# # Step 4: Save data to CSV
# df = pd.DataFrame(scraped_data)
# df.to_csv("utdallas_scraped_data.csv", index=False)

# print("Scraping completed! Data saved to utdallas_scraped_data.csv")

NameError: name 'scraped_data' is not defined

In [None]:
df.head(50)

NameError: name 'df' is not defined

In [15]:
from google.colab import files
# files.download("utdallas_scraped_data.csv")
files.download("jindal_sitemap_lex_all.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Base URL to track internal links
base_url = "https://jindal.utdallas.edu"

# Track visited URLs to avoid duplicates
visited_urls = set()

# Data storage
data = []

def extract_data(url):
    """Fetch page, extract titles, content, and find more links recursively."""
    if url in visited_urls or not url.startswith(base_url):  # Skip if already visited or external
        return

    print(f"Processing: {url}")
    visited_urls.add(url)

    try:
        # Fetch page content
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=5)
        if response.status_code != 200:
            print(f"Skipping {url} (Failed to fetch)")
            return

        soup = BeautifulSoup(response.text, "html.parser")

        # Extract Page Title (Main topic)
        main_title = soup.title.string.strip() if soup.title else "No Title Found"

        # Extract Sections (Headings and Content)
        for heading in soup.find_all(["h1", "h2", "h3"]):  # Extract main & subheadings
            section_title = heading.get_text(strip=True)
            content = []

            # Get all text after this heading until the next heading
            for sibling in heading.find_next_siblings():
                if sibling.name and sibling.name.startswith("h"):  # Stop at next heading
                    break
                if sibling.name in ["p", "div", "li"]:
                    text = sibling.get_text(strip=True)
                    if text and len(text) > 50:  # Avoid short/noisy text
                        content.append(text)

            if content:
                data.append({"URL": url, "MainTitle": main_title, "SectionTitle": section_title, "Content": " ".join(content)})

        # Find all sub-links on the page and process them recursively
        for link in soup.find_all("a", href=True):
            sub_url = link["href"]
            if sub_url.startswith("/"):  # Convert relative URLs to absolute
                sub_url = base_url + sub_url
            if sub_url.startswith(base_url) and sub_url not in visited_urls:
                extract_data(sub_url)  # Recursive call

        time.sleep(1)  # Avoid overwhelming the server

    except Exception as e:
        print(f"Error processing {url}: {e}")

# Step 1: Fetch the main sitemap page
sitemap_url = base_url + "/sitemap/"
response = requests.get(sitemap_url, headers={"User-Agent": "Mozilla/5.0"})
if response.status_code != 200:
    print("Failed to fetch the sitemap")
    exit()

soup = BeautifulSoup(response.text, "html.parser")

# Step 2: Extract main URLs from the sitemap
main_urls = [a["href"] for a in soup.find_all("a", href=True) if a["href"].startswith("https")]

# Step 3: Start the recursive extraction from the main URLs
for main_url in main_urls[:10]:  # Limit initial run for testing
    extract_data(main_url)

# Step 4: Save Data to CSV
df = pd.DataFrame(data)
df.to_csv("jindal_sitemap_recursive.csv", index=False, encoding="utf-8")

print("✅ Full recursive data extraction complete! Saved to 'jindal_sitemap_recursive.csv'")


Processing: https://jindal.utdallas.edu/staff-list/
Processing: https://jindal.utdallas.edu/admission-requirements/
Processing: https://jindal.utdallas.edu/request-information/
Processing: https://jindal.utdallas.edu/jindal/about-the-jindal-school-of-management/business-school-rankings
Processing: https://jindal.utdallas.edu/jindal/news
Processing: https://jindal.utdallas.edu/jindal/career-management-center/
Processing: https://jindal.utdallas.edu/jindal/calendar
Processing: https://jindal.utdallas.edu/jindal/external-relations/
Processing: https://jindal.utdallas.edu/jindal/about-the-jindal-school-of-management/community-engagement/
Processing: https://jindal.utdallas.edu/jindal/giving-jsom/
Processing: https://jindal.utdallas.edu/about-the-jindal-school-of-management/
Processing: https://jindal.utdallas.edu/about-the-jindal-school-of-management/community-engagement/
Processing: https://jindal.utdallas.edu/conferences/
Processing: https://jindal.utdallas.edu/about-the-jindal-school-of

KeyboardInterrupt: 