
# Jindal School Website Scraper
Comprehensive scraper for https://jindal.utdallas.edu with error handling and CSV export



In [2]:
!pip install jupyter_contrib_nbextensions
jupyter contrib nbextension install --user
jupyter nbextension enable --py widgetsnbextension


SyntaxError: invalid syntax (721925013.py, line 2)

In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
from tqdm.notebook import tqdm
import time
import logging

In [5]:
BASE_URL = "https://jindal.utdallas.edu"
SITEMAP_URL = f"{BASE_URL}/sitemap/"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Accept-Language": "en-US,en;q=0.5"
}
REQUEST_DELAY = 1  # Seconds between requests
TIMEOUT = 10  # Seconds

In [6]:
def get_soup(url, session):
    """Get BeautifulSoup object with error handling"""
    try:
        response = session.get(url, timeout=TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'lxml')
    except Exception as e:
        logging.error(f"Error fetching {url}: {str(e)}")
        return None

def extract_main_content(soup):
    """Extract cleaned main content from page"""
    for element in soup(['script', 'style', 'nav', 'footer', 'header']):
        element.decompose()
        
    main_content = soup.find('main') or soup.body
    return ' '.join(main_content.stripped_strings) if main_content else ''

In [7]:
class JindalScraper:
    def __init__(self):
        self.visited = set()
        self.session = requests.Session()
        self.session.headers.update(HEADERS)
        self.data = []
    
    def process_page(self, url):
        """Process individual page and extract data"""
        if url in self.visited:
            return
        self.visited.add(url)

        soup = get_soup(url, self.session)
        if not soup:
            return

        page_data = {
            "url": url,
            "title": soup.title.text.strip() if soup.title else "No Title",
            "content": extract_main_content(soup),
            "links": []
        }

        # Extract and store internal links
        for link in soup.find_all('a', href=True):
            full_url = urljoin(BASE_URL, link['href'])
            if BASE_URL in full_url and full_url not in self.visited:
                page_data["links"].append(full_url)

        self.data.append(page_data)
        time.sleep(REQUEST_DELAY)

    def crawl(self, start_urls):
        """Main crawling function with progress tracking"""
        queue = list(start_urls)
        
        with tqdm(total=len(queue), desc="Scraping Pages") as pbar:
            while queue:
                url = queue.pop(0)
                self.process_page(url)
                
                # Add new links to queue
                new_links = [link for link in self.data[-1]["links"] 
                            if link not in self.visited]
                queue.extend(new_links)
                pbar.total = len(queue)
                pbar.update(1)

In [8]:
if __name__ == "__main__":
    # Initialize scraper
    scraper = JindalScraper()
    
    # Get initial links from sitemap
    sitemap_soup = get_soup(SITEMAP_URL, scraper.session)
    start_urls = list({
        urljoin(BASE_URL, a['href']) 
        for a in sitemap_soup.find_all('a', href=True)
    })
    
    print(f"Found {len(start_urls)} initial URLs in sitemap")
    
    # Start crawling
    scraper.crawl(start_urls)
    
    # Save results
    df = pd.DataFrame(scraper.data)
    df.to_csv("jindal_comprehensive_data.csv", index=False)
    print(f"Scraping complete. Saved {len(df)} records.")

Found 939 initial URLs in sitemap


Scraping Pages:   0%|          | 0/939 [00:00<?, ?it/s]

ERROR:root:Error fetching https://jindal.utdallas.edu/./jsom-conference-archives/part-time-mba-archives/: 404 Client Error: Not Found for url: https://jindal.utdallas.edu/jsom-conference-archives/part-time-mba-archives/
ERROR:root:Error fetching https://jindal.utdallas.edu/./testimonials/research-scholar-testimonials/: 404 Client Error: Not Found for url: https://jindal.utdallas.edu/testimonials/research-scholar-testimonials/
ERROR:root:Error fetching https://jindal.utdallas.edu/./alumni-spotlights/ms-supply-chain-spotlights/: 404 Client Error: Not Found for url: https://jindal.utdallas.edu/alumni-spotlights/ms-supply-chain-spotlights/
ERROR:root:Error fetching https://jindal.utdallas.edu/./jsom-conference-archives/cgb-conference-archives/: 404 Client Error: Not Found for url: https://jindal.utdallas.edu/jsom-conference-archives/cgb-conference-archives/
ERROR:root:Error fetching https://jindal.utdallas.edu/./testimonials/nrf-testimonials/: 404 Client Error: Not Found for url: https://j

KeyboardInterrupt: 