<a href="https://colab.research.google.com/github/1998x-stack/Colab/blob/main/clcindex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip3 install fake-useragent



In [2]:
import requests
from bs4 import BeautifulSoup
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from fake_useragent import UserAgent

class CategoryScraper:
    """
    A class to scrape categories from the CLC Index website using multi-threading.

    Attributes:
        base_url (str): The base URL of the CLC Index website.
        data (dict): The dictionary to store scraped category data.
        visited (set): A set to keep track of already visited category numbers.
        save_interval (int): The number of pages to scrape before saving data to file.
        ua (UserAgent): Fake User-Agent instance to randomize headers for requests.
    """

    def __init__(self, base_url: str = 'https://www.clcindex.com/category/', save_interval: int = 20, max_retries: int = 5):
        """
        Initializes the CategoryScraper class with the base URL and save interval.

        Args:
            base_url (str): The base URL of the CLC Index website.
            save_interval (int): The number of pages to scrape before saving data.
            max_retries (int): The number of retry attempts in case of an error.
        """
        self.base_url = base_url
        self.data = {}
        self.visited = set()
        self.save_interval = save_interval
        self.page_counter = 0
        self.ua = UserAgent()  # Create a UserAgent instance
        self.max_retries = max_retries

    def save_data(self) -> None:
        """
        Saves the current scraped data to a local JSON file every save_interval pages.
        """
        filename = 'category_data.json'
        # Sort the dictionary by key before saving
        sorted_data = {key: self.data[key] for key in sorted(self.data)}
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(sorted_data, f, ensure_ascii=False, indent=4)
        print(f"Data saved to {filename} at page {self.page_counter}")

    def scrape_page(self, url: str) -> None:
        """
        Scrapes category data from a specific URL and recursively visits the next page.

        Args:
            url (str): The URL of the page to scrape.
        """
        retries = 0
        while retries < self.max_retries:
            try:
                # Prepare headers with a random User-Agent string
                headers = {
                    'User-Agent': self.ua.random
                }

                # Send GET request with the headers
                response = requests.get(url, headers=headers)
                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract category numbers and names
                rows = soup.select('table tbody tr')  # Get rows from table
                for row in rows:
                    category_number = row.select('td:nth-child(2)')  # Get category number
                    category_name = row.select('td:nth-child(3)')    # Get category name

                    # Ensure category number and name are extracted
                    if category_number and category_name:
                        cat_number = category_number[0].text.strip()
                        cat_name = category_name[0].text.strip()

                        # Ensure the category number is not visited before
                        if cat_number not in self.visited:
                            self.visited.add(cat_number)
                            # Add to the data dictionary
                            if cat_number not in self.data:
                                self.data[cat_number] = [cat_name]

                            # Update page counter
                            self.page_counter += 1
                            if self.page_counter % self.save_interval == 0:
                                self.save_data()  # Save data every 20 pages

                            # Recursively scrape the next page
                            next_page_link = f"{self.base_url}{cat_number}/"
                            self.scrape_page(next_page_link)

                break  # If the scraping succeeds, exit the retry loop
            except Exception as e:
                retries += 1
                print(f"Error while scraping {url}: {e}. Retry attempt {retries}/{self.max_retries}")
                time.sleep(2 ** retries)  # Exponential backoff before retrying

    def run(self, start_url: str = None) -> None:
        """
        Starts the scraping process from the given start URL.

        Args:
            start_url (str): The starting URL for scraping.
        """
        if start_url is None:
            start_url = self.base_url  # Default start URL if not provided
        print(f"Starting to scrape from {start_url}")
        self.scrape_page(start_url)


def main() -> None:
    """
    Main function to execute the category scraping process using multithreading.
    """
    scraper = CategoryScraper()

    # Use ThreadPoolExecutor to speed up the scraping process
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []

        # Start multiple threads for scraping pages
        for _ in range(10):  # Start 10 threads
            futures.append(executor.submit(scraper.run))

        # Wait for all threads to complete
        for future in as_completed(futures):
            future.result()

    scraper.save_data()
    print("Scraping completed!")


if __name__ == "__main__":
    main()

Starting to scrape from https://www.clcindex.com/category/Starting to scrape from https://www.clcindex.com/category/

Starting to scrape from https://www.clcindex.com/category/
Starting to scrape from https://www.clcindex.com/category/
Starting to scrape from https://www.clcindex.com/category/
Starting to scrape from https://www.clcindex.com/category/
Starting to scrape from https://www.clcindex.com/category/
Starting to scrape from https://www.clcindex.com/category/
Starting to scrape from https://www.clcindex.com/category/
Starting to scrape from https://www.clcindex.com/category/
Data saved to category_data.json at page 20
Data saved to category_data.json at page 40
Data saved to category_data.json at page 60
Data saved to category_data.json at page 80
Data saved to category_data.json at page 100
Data saved to category_data.json at page 120
Data saved to category_data.json at page 140
Data saved to category_data.json at page 161
Data saved to category_data.json at page 180
Data save