In [1]:
#import required libraries

# full imports go first
import os # os is a built-in Python library for interacting with the operating system
import requests # requests is a library for managing requests

# partial imports go below
from bs4 import BeautifulSoup # add to requirements.txt: beautifulsoup4
from PIL import Image # add to requirements.txt: pillow

In [None]:
### GETTING ALL IMAGES FROM A PAGE

In [40]:
URL = "https://en.wikipedia.org/wiki/Narwhal"

# Identify this script explicitly: Wikimedia's User-Agent policy asks automated clients to
# send a descriptive User-Agent, and requests carrying a generic library default may be refused.
# TODO: add your own contact details to this string before running the notebook at scale.
response = requests.get(
    URL,
    headers={"User-Agent": "image-scraper-notebook/1.0 (educational example)"},
    timeout=30,  # without a timeout, requests can wait forever on a stalled connection
)

# stop here on a 4xx/5xx answer instead of parsing an error page as if it were the article
response.raise_for_status()

# parse the downloaded HTML with Python's built-in parser
bs_html = BeautifulSoup(response.text, "html.parser")

In [None]:
# preview the html contents
# only the first part is printed: dumping the whole prettified page would flood the cell
# output and bloat the saved notebook. Raise the limit if you need to see more.
PREVIEW_CHARS = 2000
print(bs_html.prettify()[:PREVIEW_CHARS])

In [5]:
# find all the <img> tags in the parsed page
image_tags = bs_html.find_all('img')

In [6]:
# file extensions (written without the leading dot) that we are willing to download
accepted_image_types = [
    'jpg',
    'jpeg',
    'png',
    'bmp',
    'webp',
    'svg',
]

In [42]:
file_types = []

# get the source 'src' of each image and keep only the text after the last '.',
# which is where the file type sits
for img_tag in image_tags:
    img_src = img_tag.get('src')

    # an <img> tag is not guaranteed to have a 'src' attribute: .get() then returns None,
    # and calling .split() on None would raise AttributeError, so skip those tags
    if not img_src:
        continue

    file_types.append(img_src.split('.')[-1])

# sets do not allow duplicates, meaning we will be left with one of each file type present
present_file_types = set(file_types)

In [None]:
# what image types can we find on this page? Note that not every entry has to be a real
# file type: whatever follows the last '.' of a 'src' was kept, so odd values may show up
present_file_types

In [None]:
# inspect the <img> tags returned by find_all
image_tags

In [None]:
# how many <img> tags were found?
len(image_tags)

In [11]:
# build the list of downloadable image URLs from the <img> tags
img_urls = []

for tag in image_tags:
    src = tag.get('src')

    # tags without a 'src' property have nothing to download
    if not src:
        continue

    # the image type is the last bit of text after the final '.'
    extension = src.split('.')[-1].lower()

    # skip the current image unless its type is accepted and it is served from '//upload...'
    if extension not in accepted_image_types:
        continue
    if not src.startswith('//upload'):
        continue

    # the 'src' is protocol-relative ('//...'), so prefix the scheme to get a full URL
    img_urls.append(f'https:{src}')

In [None]:
# spot-check a single tag: show the 'src' of the tag at index 5 of the list
image_tags[5].get('src')

In [None]:
# how many URLs did we get after filtering?
len(img_urls)

In [None]:
# if you preview the list, quite a lot of the URLs are duplicated
img_urls

In [None]:
# how many unique URLs? Converting the list to a set drops the duplicates
unique_urls = set(img_urls)
len(unique_urls)

In [None]:
# name of the folder where we want to save the images. CAPITALS suggest it's a constant
IMAGES_DIRECTORY = "scraped_images"

current_dirs = os.listdir() # called without a path, this lists the contents of the current folder (where the notebook is)
current_dirs

In [18]:
# create the folder where we want to save the images, in the same location as the notebook.
# exist_ok=True turns this into a no-op when the folder is already there, so the cell can be
# re-run safely and does not depend on a directory listing taken earlier, which may be stale
os.makedirs(IMAGES_DIRECTORY, exist_ok=True)

In [19]:
# URLs that could not be downloaded or saved end up here
errors = []

# Retries must be configured on a transport adapter. Assigning to
# requests.adapters.DEFAULT_RETRIES does not enable retries, because HTTPAdapter bound its
# default argument when the library was imported.
session = requests.Session()
retrying_adapter = requests.adapters.HTTPAdapter(max_retries=10)
session.mount('https://', retrying_adapter)
session.mount('http://', retrying_adapter)

# Wikimedia's User-Agent policy asks automated clients to identify themselves; requests sent
# with a generic library default may be answered with 403.
# TODO: add your own contact details to this string before running the notebook at scale.
session.headers.update({'User-Agent': 'image-scraper-notebook/1.0 (educational example)'})

# the "enumerate" function allows for iteration while also supplying an index for each item.
# sorting first gives every URL the same index (and so the same file name) on every run,
# which iterating over the set directly does not guarantee
for img_index, img_url in enumerate(sorted(unique_urls)):
    try:
        # the "with" block closes the streamed response, releasing the connection;
        # the timeout stops a stalled download from hanging the notebook
        with session.get(img_url, stream=True, timeout=30) as resp:

            # if the request is not completed, add the image url to the errors list
            if resp.status_code != 200:
                errors.append(img_url)
                continue

            # resp.raw is the undecoded byte stream; ask it to undo gzip/deflate
            # transfer compression so PIL receives the actual image bytes
            resp.raw.decode_content = True

            # create a PIL.Image object
            obj_img = Image.open(resp.raw)
            # get the file extension from the url
            img_type = img_url.split('.')[-1]
            # save the image in its original extension
            obj_img.save(os.path.join(IMAGES_DIRECTORY, f'img_{img_index}.{img_type}'))

    # network failures (RequestException), data PIL cannot read or write (OSError, e.g. an
    # unidentified image) and extensions PIL cannot save (ValueError) are recorded as
    # errors, so one bad URL no longer aborts the remaining downloads
    except (requests.RequestException, OSError, ValueError):
        errors.append(img_url)

In [None]:
# how many URLs ended up in the errors list?
len(errors)

In [None]:
# let's see what's happened here! Start with the first URL in the errors list
errors[0]

In [22]:
# repeat the request for the first failed URL so that its response can be inspected
resp = requests.get(errors[0], stream=True)

In [None]:
resp.status_code # a 403 here is the HTTP status code for "Forbidden": the server understood the request but refused to serve it