In [1]:
!pip install requests beautifulsoup4


Defaulting to user installation because normal site-packages is not writeable


In [2]:
import requests

# URLs of robots.txt files
urls = {
    "Amazon": "https://www.amazon.com/robots.txt",
    "ESPNCricInfo": "https://www.espncricinfo.com/robots.txt",
    "Instagram": "https://www.instagram.com/robots.txt"
}

# Function to fetch robots.txt content
def fetch_robots(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        return f"Failed to retrieve {url} (Status Code: {response.status_code})"

# Fetch and store robots.txt contents
robots_data = {site: fetch_robots(url) for site, url in urls.items()}

# Print results
for site, data in robots_data.items():
    print(f"--- {site} robots.txt ---\n")
    print(data[:500])  # Print only the first 500 characters for readability
    print("\n" + "="*50 + "\n")


--- Amazon robots.txt ---

User-agent: *
Disallow: /exec/obidos/account-access-login
Disallow: /exec/obidos/change-style
Disallow: /exec/obidos/flex-sign-in
Disallow: /exec/obidos/handle-buy-box
Disallow: /exec/obidos/tg/cm/member/
Disallow: /gp/aw/help/id=sss
Disallow: /gp/cart
Disallow: /gp/flex
Disallow: /gp/product/e-mail-friend
Disallow: /gp/product/product-availability
Disallow: /gp/product/rate-this-item
Disallow: /gp/sign-in
Disallow: /gp/reader
Disallow: /gp/sitbv3/reader
Disallow: /gp/richpub/syltguides/create
D


--- ESPNCricInfo robots.txt ---

Failed to retrieve https://www.espncricinfo.com/robots.txt (Status Code: 403)


--- Instagram robots.txt ---

# Notice: Collection of data on Instagram through automated means is
# prohibited unless you have express written permission from Instagram
# and may only be conducted for the limited purpose contained in said
# permission.
# All authorized user-agents listed on this page must comply with Meta’s
# Automated Data Collection Te

In [3]:
import re

# Function to parse robots.txt
def parse_robots(txt):
    rules = {"User-agent": [], "Disallow": [], "Allow": [], "Crawl-delay": None}
    
    for line in txt.split("\n"):
        line = line.strip()
        if line.startswith("User-agent:"):
            rules["User-agent"].append(line.split(":")[1].strip())
        elif line.startswith("Disallow:"):
            rules["Disallow"].append(line.split(":")[1].strip())
        elif line.startswith("Allow:"):
            rules["Allow"].append(line.split(":")[1].strip())
        elif line.startswith("Crawl-delay:"):
            rules["Crawl-delay"] = line.split(":")[1].strip()
    
    return rules

# Parse the fetched robots.txt files
parsed_robots = {site: parse_robots(data) for site, data in robots_data.items()}

# Print summary
for site, rules in parsed_robots.items():
    print(f"--- {site} Analysis ---")
    print(f"User-agents: {rules['User-agent'][:5]}")  # Show first 5 user-agents
    print(f"Disallowed Paths: {rules['Disallow'][:5]}")  # Show first 5 disallowed paths
    print(f"Allowed Paths: {rules['Allow'][:5]}")  # Show first 5 allowed paths
    print(f"Crawl-delay: {rules['Crawl-delay']}")
    print("\n" + "="*50 + "\n")


--- Amazon Analysis ---
User-agents: ['*', 'EtaoSpider', 'GPTBot', 'CCBot']
Disallowed Paths: ['/exec/obidos/account-access-login', '/exec/obidos/change-style', '/exec/obidos/flex-sign-in', '/exec/obidos/handle-buy-box', '/exec/obidos/tg/cm/member/']
Allowed Paths: ['/wishlist/universal*', '/wishlist/vendor-button*', '/wishlist/get-button*', '/gp/wishlist/universal*', '/gp/wishlist/vendor-button*']
Crawl-delay: None


--- ESPNCricInfo Analysis ---
User-agents: []
Disallowed Paths: []
Allowed Paths: []
Crawl-delay: None


--- Instagram Analysis ---
User-agents: ['Amazonbot', 'Applebot-Extended', 'ClaudeBot', 'Google-Extended', 'GPTBot']
Disallowed Paths: ['/', '/', '/', '/', '/']
Allowed Paths: []
Crawl-delay: None




In [4]:
from bs4 import BeautifulSoup

# Example function to fetch and parse Terms of Use
def fetch_terms(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.get_text()[:1000]  # Print first 1000 characters for readability
    else:
        return f"Failed to retrieve {url} (Status Code: {response.status_code})"

terms_urls = {
    "Amazon": "https://sell.amazon.in/bn/standards/terms-of-use",
    "ESPNCricInfo": "https://www.espncricinfo.com/ci/content/site/company/terms_use.html",
    "Instagram": "https://help.instagram.com/581066165581870"
}

terms_data = {site: fetch_terms(url) for site, url in terms_urls.items()}

# Print summary
for site, data in terms_data.items():
    print(f"--- {site} Terms of Use ---\n")
    print(data[:1000])  # Print first 1000 characters
    print("\n" + "="*50 + "\n")


--- Amazon Terms of Use ---







Terms of Use | Sell on Amazon

















Seller LoginBackStartBackSell onlineHow to register as a sellerHow to sell on Amazon.inHow to list a productOffers for new sellersGrowBackGrow your businessGrow your businessTools to grow your businessAmazon selling programsSell globallyService Provider NetworkLaunch your brandPricingBackPricingType of feesCompare fulfilment channelsResourcesBackResourcesLearnGet helpSell on Amazon blogsStart Selling✕Select your preferred languageEnglish - INहिंदी - INதமிழ் - INಕನ್ನಡ - INमराठी - INગુજરાતી - INবাংলা - INമലയാളം - INతెలుగు - INTerms of UseWelcome to the website sell.amazon.in ("Sell on Amazon"). The website Sell on Amazon is operated by Amazon Seller Services Private Limited ("Amazon" or "us" or "we" or "our"), having its registered office located 8th Floor, Brigade Gateway 26/1 Dr. Rajkumar Road Bangalore – 560055, Karnataka, India. Please read the Conditions of Use document carefully before using the Sell 

In [5]:
with open("robots_analysis.txt", "w") as file:
    for site, rules in parsed_robots.items():
        file.write(f"--- {site} Analysis ---\n")
        file.write(f"User-agents: {rules['User-agent'][:5]}\n")
        file.write(f"Disallowed Paths: {rules['Disallow'][:5]}\n")
        file.write(f"Allowed Paths: {rules['Allow'][:5]}\n")
        file.write(f"Crawl-delay: {rules['Crawl-delay']}\n")
        file.write("="*50 + "\n")
