In [1]:
import requests
from bs4 import BeautifulSoup
import urllib.robotparser as urobot
import urllib.request
import pandas as pd
import numpy as np

## Scanning robots.txt using the Robot Parser

In [2]:
def getRobotParser(robo_url):
    """Build a robots.txt parser for ``robo_url`` and load its rules.

    Parameters
    ----------
    robo_url : str
        Full URL of the robots.txt file to read.

    Returns
    -------
    urllib.robotparser.RobotFileParser
        Parser whose rules have been populated by fetching ``robo_url``.
    """
    # The constructor registers the URL itself, so no separate set_url() call is needed.
    parser = urobot.RobotFileParser(robo_url)
    # read() fetches the file from the URL and feeds it to the parser.
    parser.read()
    return parser

## Scanning Sitemap

In [3]:
def get_urls_from_sitemap(sitemap_url:str, robot_parser:urobot.RobotFileParser = None, timeout:float = 30):
    """Download a sitemap and return the URLs listed in its ``<loc>`` elements.

    Parameters
    ----------
    sitemap_url : str
        URL of the sitemap (or sitemap index) XML document.
    robot_parser : urllib.robotparser.RobotFileParser, optional
        When given, only URLs that the parser allows for user agent ``"*"``
        are returned.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of str
        The ``<loc>`` values in document order, with surrounding whitespace
        removed and empty entries skipped.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when ``timeout`` is exceeded.
    """
    # Without an explicit timeout requests may wait indefinitely on a stalled server.
    resp = requests.get(sitemap_url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were an empty sitemap.
    resp.raise_for_status()
    sitemap = BeautifulSoup(resp.content, 'xml')
    # <loc> text can be padded with newlines/indentation; strip it so the
    # values are usable as URLs, and drop entries that are empty afterwards.
    stripped_locs = (loc.text.strip() for loc in sitemap.find_all('loc'))
    urls = [loc_text for loc_text in stripped_locs if loc_text]
    print(f"Number of URLs: {len(urls)}")
    if robot_parser is not None:
        # Filtering URLs using Robot Parser
        urls = [url for url in urls if robot_parser.can_fetch("*", url)]
        print(f"Number of URLs after parsing through robot parser: {len(urls)}")
    return urls

## Categorising Sitemap URLs
In this case, the category sitemap is scanned to collect all of its URLs, and each URL is assigned the type, category and sub-category it belongs to.

In [4]:
def _categorise_url(url:str) -> dict:
    """Turn one URL into a Type/Category/SubCategory/URL record based on its path."""
    # The first 3 items of the split are the protocol, the empty string between
    # the two slashes, and the host name. Empty segments (e.g. from a trailing
    # "/") are dropped so they are reported as missing rather than as "".
    url_parts = [part for part in url.split("/")[3:] if part]
    # A purely numeric third segment is not used as a sub-category.
    has_sub_category = len(url_parts) > 2 and not url_parts[2].isdigit()
    return {"Type": url_parts[0] if len(url_parts) > 0 else np.nan,
            "Category": url_parts[1] if len(url_parts) > 1 else np.nan,
            "SubCategory": url_parts[2] if has_sub_category else np.nan,
            "URL": url}


def get_categorised_urls_from_sitemap(sitemap_url:str, robo_parser=None):
    """Fetch the URLs of a sitemap and label each one from its path segments.

    Parameters
    ----------
    sitemap_url : str
        URL of the sitemap to scan; passed to ``get_urls_from_sitemap``.
    robo_parser : optional
        Robots.txt parser forwarded to ``get_urls_from_sitemap`` for filtering.

    Returns
    -------
    pandas.DataFrame
        One row per URL with columns ``Type``, ``Category``, ``SubCategory``
        and ``URL``. The first, second and third path segments fill the first
        three columns; a missing segment (or a purely numeric third segment)
        is stored as NaN. The frame is empty, with the same columns, when the
        sitemap yields no URLs.
    """
    urls = get_urls_from_sitemap(sitemap_url, robo_parser)
    # Build every record first and create the frame once: concatenating inside
    # the loop copies the whole frame for each URL (quadratic time).
    records = [_categorise_url(url) for url in urls]
    return pd.DataFrame(records, columns=["Type", "Category", "SubCategory", "URL"])

In [9]:
if __name__=="__main__":
    # Driver: read robots.txt, list the sitemaps referenced by the index
    # sitemap, then categorise the URLs of the category sitemap.
    print("*** Parsing Robot.txt ***")
    robo_url = "https://groceries.asda.com/robots.txt"
    robo_parser = getRobotParser(robo_url)
    # site_maps() is the documented accessor for the Sitemap entries; it
    # returns None when robots.txt declares none, so normalise that to [].
    index_sitemaps = robo_parser.site_maps() or []
    print(f"Host: {robo_parser.host} | Sitemaps: {index_sitemaps}")
    if not index_sitemaps:
        # Give a descriptive error instead of an IndexError on the lookup below.
        raise RuntimeError(f"No Sitemap entries found in {robo_url}")
    
    # Get URLs from index sitemap
    print("\n*** Scanning Main Sitemap ***")
    sitemaps = get_urls_from_sitemap(index_sitemaps[0])
    print(f"Number of sitemaps found: {len(sitemaps)}")
    print(sitemaps)
    
    # Get URLs from category sitemap
    print("\n*** Scanning Category Sitemap ***")
    sitemap_url= 'https://groceries.asda.com/sitemap-category.xml'
    urls_df = get_categorised_urls_from_sitemap(sitemap_url, robo_parser)

*** Parsing Robot.txt ***
Host: groceries.asda.com | Sitemaps: ['https://groceries.asda.com/sitemap-index.xml']

*** Scanning Main Sitemap ***
Number of URLs: 6
Number of sitemaps found: 6
['https://groceries.asda.com/sitemap-category.xml', 'https://groceries.asda.com/sitemap-event-pages.xml', 'https://groceries.asda.com/sitemap-products.xml', 'https://groceries.asda.com/sitemap-recipes-categories.xml', 'https://groceries.asda.com/sitemap-recipes.xml', 'https://groceries.asda.com/sitemap-special-offers.xml']

*** Scanning Category Sitemap ***
Number of URLs: 3705
Number of URLs after parsing through robot parser: 3705
