In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as bs
import time

In [66]:
class WebScraper:
    """
    A web scraper designed to extract product links from e-commerce websites.
    
    Supports:
    - Ajio (Infinite scroll-based pagination)
    - Myntra (Button-based pagination)
    """
    
    def __init__(self, site_id, limit=10, wait_time=5, product_wait_time=2):
        """
        Initializes the web scraper with site-specific configurations.
        
        :param site_id: Identifier for the target website ("ajio" or "myntra")
        :param limit: Maximum number of pages or scrolls to perform (default: 10)
        :param wait_time: Time to wait for page load (default: 5 seconds)
        :param product_wait_time: Time to wait for product page elements to load (default: 2 seconds)
        """
        self.configs = {
            "ajio": {
                "base_url": "https://www.ajio.com",
                "categoryListClass": "menu-newlist",
                "isInfiniteScroll": True,
                "pageClass": "ReactVirtualized__Grid__innerScrollContainer",
                "linkClass": "a",
                "nextPageClass": None
            },
            "myntra": {
                "base_url": "https://www.myntra.com",
                "categoryListClass": "desktop-navContent",
                "isInfiniteScroll": False,
                "pageClass": "results-base",
                "linkClass": "a",
                "nextPageClass": "pagination-next"
            }
        }
        
        if site_id not in self.configs:
            raise ValueError("Unsupported site ID. Add its configuration.")
        
        self.config = self.configs[site_id]
        self.driver = self.init_driver()
        self.all_product_links = []
        self.limit = limit
        self.wait_time = wait_time
        self.product_wait_time = product_wait_time
    
    def init_driver(self):
        """
        Initializes a headless Selenium WebDriver.
        
        :return: A configured WebDriver instance
        """
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                                   "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36")
        return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    
    def fetch_page(self, url, class_name, wait_time=None):
        """
        Loads a webpage and waits for a specified class to appear.
        
        :param url: URL of the page to load
        :param class_name: CSS class to wait for
        :param wait_time: Time to wait for the class to appear (default: self.wait_time)
        :return: True if the page loads successfully, False otherwise
        """
        if wait_time is None:
            wait_time = self.wait_time
        print(f"\U0001F310 Loading page: {url}")
        self.driver.get(url)
        try:
            WebDriverWait(self.driver, wait_time).until(
                EC.presence_of_element_located((By.CLASS_NAME, class_name))
            )
            print("✅ Page loaded successfully!")
            return True
        except Exception:
            print(f"❌ Url: {url} does not contain class: {class_name}")
            return False
    
    def extract_links(self, class_name):
        """
        Extracts all links from a specified container class on the page.
        
        :param class_name: CSS class of the container holding links
        :return: List of extracted links
        """
        soup = bs(self.driver.page_source, "html.parser")
        container = soup.find(class_=class_name)
        links = []
        if container:
            for a_tag in container.find_all("a", href=True):
                href = a_tag["href"]
                if not href.startswith("/"):
                    href = "/" + href
                full_link = self.config["base_url"] + href
                links.append(full_link)
        else:
            print(f"⚠️ No elements found for class: {class_name}")
        return links
    
    def get_all_category_links(self):
        """
        Extracts all category links from the homepage.
        
        :return: List of category URLs
        """
        print("🔍 Extracting category links...")
        if self.fetch_page(self.config["base_url"], self.config["categoryListClass"]):
            return self.extract_links(self.config["categoryListClass"])
        return []
    
    def extract_product_links(self, category_url):
        """
        Extracts product links from a given category page.
        
        :param category_url: URL of the category page
        """
        print(f"🔍 Extracting product links from: {category_url}")
        if self.fetch_page(category_url, self.config["pageClass"], self.product_wait_time):
            if self.config["isInfiniteScroll"]:
                self.infinite_scroll()
            else:
                self.paginate()
    
    def infinite_scroll(self):
        """
        Scrolls down an infinite scroll page and extracts product links.
        """
        scroll_pause_time = 2
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        
        for scroll in range(self.limit):
            print(f"🔄 Scrolling {scroll + 1}/{self.limit}")
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            
            if new_height == last_height:
                print("🚫 No more content to load.")
                break  
            
            last_height = new_height
            self.all_product_links.extend(self.extract_links(self.config["pageClass"]))
    
    def paginate(self):
        """
        Iterates through paginated product pages and extracts links.
        """
        page_count = 0
        while self.config["nextPageClass"] and page_count < self.limit:
            self.all_product_links.extend(self.extract_links(self.config["pageClass"]))
            try:
                next_button = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.CLASS_NAME, self.config["nextPageClass"]))
                )
                next_button.click()
                time.sleep(2)
                page_count += 1
                print(f"📄 Page {page_count}/{self.limit} loaded")
            except Exception:
                print("🚫 No more pages.")
                break
    
    def scrape(self):
        """
        Initiates the scraping process for all categories and products.
        
        :return: List of extracted product links
        """
        category_links = self.get_all_category_links()
        print(f"Found following category links for domain: {self.config['base_url']}", category_links)
        for category_url in category_links:
            self.extract_product_links(category_url)
        
        self.driver.quit()
        return self.all_product_links


# Example 1: Paginated websites

In [67]:
# Example Usage 1
site_id = "myntra"
max_pages = 2  # Example configurable max pages
scraper = WebScraper(site_id, limit=max_pages)
scraper.scrape()

🔍 Extracting category links...
🌐 Loading page: https://www.myntra.com
✅ Page loaded successfully!
Found following category links for domain: https://www.myntra.com ['https://www.myntra.com/shop/men', 'https://www.myntra.com/men-topwear', 'https://www.myntra.com/men-tshirts', 'https://www.myntra.com/men-casual-shirts', 'https://www.myntra.com/men-formal-shirts', 'https://www.myntra.com/men-sweatshirts', 'https://www.myntra.com/men-sweaters', 'https://www.myntra.com/men-jackets', 'https://www.myntra.com/men-blazers', 'https://www.myntra.com/men-suits', 'https://www.myntra.com/rain-jacket', 'https://www.myntra.com/men-ethnic-wear', 'https://www.myntra.com/men-kurtas', 'https://www.myntra.com/sherwani', 'https://www.myntra.com/nehru-jackets', 'https://www.myntra.com/dhoti', 'https://www.myntra.com/men-bottomwear', 'https://www.myntra.com/men-jeans', 'https://www.myntra.com/men-casual-trousers', 'https://www.myntra.com/men-formal-trousers', 'https://www.myntra.com/mens-shorts', 'https://www

KeyboardInterrupt: 

In [68]:
len(scraper.all_product_links)

350

In [69]:
scraper.all_product_links

['https://www.myntra.com/blazers/arrow/arrow-zero-calorie-fit-notched-lapel-collar-single-breasted-formal-blazer/31821933/buy',
 'https://www.myntra.com/shirts/rare+rabbit/rare-rabbit-men-valve-slim-fit-opaque-velvet-shirt/25588024/buy',
 'https://www.myntra.com/shirts/marks+%26+spencer/marks--spencer-abstract-printed-cuban-collar-lounge-shirt/22510276/buy',
 'https://www.myntra.com/blazers/invictus/invictus-single-breasted-slim-fit-smart-casual-blazer/28984404/buy',
 'https://www.myntra.com/shirts/snitch/snitch-men-slim-fit-opaque-striped-formal-shirt/32403421/buy',
 'https://www.myntra.com/shirts/mast+%26+harbour/mast--harbour-men-blue--white-slim-fit-striped-casual-shirt/8717979/buy',
 'https://www.myntra.com/kurtas/sangria/sangria-men-geometric-embroidered-thread-work-pure-cotton-kurta/29551132/buy',
 'https://www.myntra.com/shirts/snitch/snitch-men-smart-fit--spread-collar-textured-casual-shirt/32424756/buy',
 'https://www.myntra.com/tshirts/rigo/rigo-unisex-printed-drop-shoulder-

# Example 2: Infinite Scroll Websites

In [70]:
# Example Usage 2
site_id = "ajio"
max_scrolls = 2  # Example configurable max scrolls
scraper = WebScraper(site_id, limit=max_scrolls)
scraper.scrape()

🔍 Extracting category links...
🌐 Loading page: https://www.ajio.com
✅ Page loaded successfully!
Found following category links for domain: https://www.ajio.com ['https://www.ajio.com/shop/men', 'https://www.ajio.com/', 'https://www.ajio.com/s/clothing-4461-74581', 'https://www.ajio.com/s/footwear-4792-56591', 'https://www.ajio.com/s/accessories-4792-56591', 'https://www.ajio.com/s/latest-collections-4461-74581', 'https://www.ajio.com/s/clothing-4461-74582', 'https://www.ajio.com/s/footwear-4461-74581', 'https://www.ajio.com/s/accessories-4461-74581', 'https://www.ajio.com/shop/international-brands', 'https://www.ajio.com/s/plus-size-5061-69371', 'https://www.ajio.com/men-night-lounge-wear/c/830212', 'https://www.ajio.com/s/grooming-4384-57431', 'https://www.ajio.com/s/ethnic-and-festive-collections', 'https://www.ajio.com/s/men-kurtas-and-shirts-4390-76941', 'https://www.ajio.com/s/nehru-jackets-5170-351', 'https://www.ajio.com/men-western-wear/c/830216', 'https://www.ajio.com/men-jack

KeyboardInterrupt: 

In [71]:
len(scraper.all_product_links)

966

In [72]:
scraper.all_product_links

['https://www.ajio.com/https://www.ajio.com/s/p35-min-65-percent-off-5635-61871?ref=banner',
 'https://www.ajio.com/netplay-men-structured-checked-regular-fit-polo-t-shirt/p/443050227_beige',
 'https://www.ajio.com/gap-men-light-wash-tapered-slim-fit-jeans/p/442295920_darkblue',
 'https://www.ajio.com/the-indian-garage-co-lightly-washed-slim-fit-jeans/p/460789301_blue',
 'https://www.ajio.com/gap-men-mid-rise-regular-fit-cargo-shorts/p/442600246_black',
 'https://www.ajio.com/gap-men-heavy-wash-slim-fit-jeans/p/442295926_grey',
 'https://www.ajio.com/spykar-men-lightly-washed-slim-fit-jeans/p/469671515_midblue',
 'https://www.ajio.com/netplay-men-flat-front-cropped-fit-chinos/p/443050946_jetblack',
 'https://www.ajio.com/teamspirit-men-regular-fit-crew-neck-t-shirt/p/443061825_jetblack',
 'https://www.ajio.com/the-indian-garage-co-shirt-with-patch-pocket/p/469157902_white',
 'https://www.ajio.com/https://www.ajio.com/s/min-50-percent-off-5664-70134?ref=banner',
 'https://www.ajio.com/l