In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os

def is_valid(url):
    """Return True only for absolute http(s) URLs that name a host.

    Args:
        url: The URL string to inspect.

    Returns:
        bool: True when the parsed URL has a non-empty network location and
        its scheme is ``http`` or ``https``; False otherwise.
    """
    parts = urlparse(url)
    if not parts.netloc:
        # No host component, e.g. a bare path or a ``mailto:`` address.
        return False
    return parts.scheme in ("http", "https")

def _collect_resource_urls(soup, base_url):
    """Return the set of absolute http(s) URLs referenced by the parsed page."""
    # (tag name, attribute holding the URL, extra find_all filters)
    targets = [
        ('a', 'href', {}),
        ('img', 'src', {}),
        ('script', 'src', {}),
        ('link', 'href', {'rel': 'stylesheet'}),
        # Matches every <source>, including those nested in <video>/<audio>.
        ('source', 'src', {}),
        ('iframe', 'src', {}),
        ('audio', 'src', {}),
        ('video', 'src', {}),
    ]
    found = set()
    for tag, attr, extra in targets:
        for element in soup.find_all(tag, **{attr: True}, **extra):
            try:
                absolute_url = urljoin(base_url, element[attr].strip())
                usable = is_valid(absolute_url)
            except ValueError:
                # urljoin/urlparse reject some malformed URLs (e.g. a broken
                # IPv6 host); skip that link instead of aborting the scan.
                continue
            if usable:
                found.add(absolute_url)
    return found


def _local_filename(resource_url, used_names):
    """Pick a safe file name for ``resource_url`` not already in ``used_names``."""
    name = os.path.basename(urlparse(resource_url).path)
    if name in ('', '.', '..'):
        # URLs ending in "/" (or in a dot segment) have no usable basename;
        # opening such a path would target a directory.
        name = 'index'
    stem, ext = os.path.splitext(name)
    candidate, counter = name, 1
    while candidate in used_names:
        # Different URLs can share a basename; number them so a later
        # download does not overwrite an earlier one from the same run.
        candidate = f"{stem}_{counter}{ext}"
        counter += 1
    used_names.add(candidate)
    return candidate


def get_all_resources(url, download_path="downloaded_resources", timeout=10):
    """
    Fetches the HTML of a URL, prints every resource link found in it and
    optionally downloads those resources.

    Links are taken from <a>, <img>, <script>, <link rel="stylesheet">,
    <source>, <iframe>, <audio> and <video> tags. Relative links are resolved
    against the final page URL (after redirects), or against the page's
    <base href> when one is present.

    Args:
        url: Address of the page to scan.
        download_path: Directory to download the resources into. Pass a falsy
            value (e.g. None) to only list the resources.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        None. Results and errors are reported with print(); request and
        processing errors are caught rather than raised.
    """
    try:
        # Without a timeout a stalled server would block this call forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes
        soup = BeautifulSoup(response.content, 'html.parser')

        # Relative links are relative to the document, not to the site root.
        page_url = response.url or url
        base_tag = soup.find('base', href=True)
        base_url = urljoin(page_url, base_tag['href']) if base_tag else page_url

        resources = _collect_resource_urls(soup, base_url)

        print(f"Found the following resources on {url}:")
        for resource in sorted(resources):
            print(resource)

        # Optional: Download the found resources
        if download_path:
            os.makedirs(download_path, exist_ok=True)
            print(f"\nAttempting to download some resources to: {download_path}")
            used_names = set()
            for resource_url in sorted(resources):
                try:
                    # The context manager releases the streamed connection
                    # even when the download fails part-way.
                    with requests.get(resource_url, stream=True, timeout=timeout) as resource_response:
                        resource_response.raise_for_status()

                        filename = os.path.join(
                            download_path, _local_filename(resource_url, used_names)
                        )

                        with open(filename, 'wb') as f:
                            for chunk in resource_response.iter_content(chunk_size=8192):
                                f.write(chunk)
                    print(f"Downloaded: {os.path.basename(filename)}")
                except requests.exceptions.RequestException as e:
                    print(f"Error downloading {resource_url}: {e}")
                except Exception as e:
                    print(f"An error occurred while processing {resource_url}: {e}")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    # Page to scan; the user chooses whether the found resources are also downloaded.
    target_url = 'https://thestudyvarta.blogspot.com/'
    # Strip surrounding whitespace and accept the common "y" shorthand so an
    # answer such as "yes " or "Y" is not silently treated as "no".
    download = input("Do you want to download some of the found resources? (yes/no): ").strip().lower()
    download_path = "downloaded_resources" if download in ("yes", "y") else None
    get_all_resources(target_url, download_path)

Found the following resources on https://thestudyvarta.blogspot.com/:
https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js
https://blogger.googleusercontent.com/img/a/AVvXsEgaty7CB8OkQW5FIjoNvEtePC4zwLc23C7d0koGKqRZjcjT_98aNsqr3WoUPpzGZUhBAMDla1c5YJjD9jSuoKpBw0069qQ9gd_xyM_2dxHJGErvi21m3BZmDUud9vqr1DDDdE8lO1A_p9XdShk21a2euIN5ltrMBALkiXDOGxejZW5XQRmPdHawZSEH=s793
https://blogger.googleusercontent.com/img/a/AVvXsEjpKtnXSRz-oxBIbeGTtfdi5Dpn_qXShEXRIVm1tN5E976Us57s7rJ5OO53zdaUdKecZP729qpWgZ0O4EQDJIt_JOmcGsV_ddwtAAVKJq9wRtWb5F1Tbe4evSLTT5rS3LLF018BzLKb7UMt1_t80HEXcOQtt7FG-Doo93hARplLW2lZ-AJeGLTX9c5k=s793
https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhwJSBqchgQikg5ytZNHl1PTUfEDWxiZYok4bQMwDKXWb_t71JSQgpB7WINK0wviKBM6o4IKrnhYiSoSwi-AViFAGp_Fp4VdMP4qjholRFold0G6w6Yi-aZfhoEeatrm89jun5QPhJHV5oqehfnkQTMDXBf_Gehb04k0OUurMJPVlBeDWTWRRVtdl3jJzM/w640/carlos-muza-hpjSkU2UYSU-unsplash.jpg
https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi1-fp1HQbM36ALo-HY7u7q9mPJnN5q

In [5]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def is_valid(url):
    """Tell whether ``url`` is an absolute HTTP or HTTPS URL.

    Args:
        url: The URL string to inspect.

    Returns:
        bool: True when the URL has a network location and an ``http`` or
        ``https`` scheme; False otherwise.
    """
    # The first two fields of the parse result are (scheme, netloc).
    scheme, netloc = urlparse(url)[:2]
    has_host = bool(netloc)
    return has_host and scheme in ("http", "https")

def get_unused_resources(url, timeout=10):
    """Prints resource URLs whose absolute form is absent from the page's HTML.

    Every href/src found on <a>, <img>, <script>, <link>, <source>, <iframe>,
    <audio> and <video> tags is resolved to an absolute URL, and each one that
    does not occur verbatim in the raw HTML text is printed as "unused".

    NOTE(review): every URL is extracted from the same HTML it is then
    searched for in, so this check appears to flag links written in relative,
    protocol-relative or entity-escaped form rather than resources the page
    does not use. Confirm the intended meaning of "unused" before relying on
    the output.

    Args:
        url: Address of the page to inspect.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        None. Results and errors are reported with print(); request and
        processing errors are caught rather than raised.
    """
    try:
        # Without a timeout a stalled server would block this call forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Relative links are relative to the document (after redirects, and
        # honouring <base href>), not to the site root.
        page_url = response.url or url
        base_tag = soup.find('base', href=True)
        base_url = urljoin(page_url, base_tag['href']) if base_tag else page_url
        resources = set()

        # Extract all resources
        for tag, attr in [('a', 'href'), ('img', 'src'), ('script', 'src'), ('link', 'href'), ('source', 'src'), ('iframe', 'src'), ('audio', 'src'), ('video', 'src')]:
            for element in soup.find_all(tag, **{attr: True}):
                try:
                    absolute_url = urljoin(base_url, element[attr])
                    usable = is_valid(absolute_url)
                except ValueError:
                    # Malformed URLs (e.g. a broken IPv6 host) are skipped
                    # instead of aborting the whole scan.
                    continue
                if usable:
                    resources.add(absolute_url)

        # Check if resources are used in the HTML
        html_content = response.text
        # Sorted so repeated runs print in a stable order (sets are unordered).
        unused_resources = sorted(
            resource for resource in resources if resource not in html_content
        )

        print(f"Unused resources on {url}:")
        for unused in unused_resources:
            print(unused)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    # Run the unused-resource report against the blog's front page.
    target_url = 'https://thestudyvarta.blogspot.com/'
    get_unused_resources(url=target_url)

Unused resources on https://thestudyvarta.blogspot.com/:
https://fonts.googleapis.com/css?family=Open+Sans:400,400i,700,700i
https://thestudyvarta.blogspot.com/.https:/www.facebook.com/Deeprajsrivastav9119
https://thestudyvarta.blogspot.com/search?updated-max=2023-05-27T02:46:00-07:00&max-results=7
https://netdna.bootstrapcdn.com/font-awesome/4.0.3/css/font-awesome.css
