# Coral ID

Scrape one page of the Reef2Reef forums to get images of corals.

In [1]:
# imports
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urlparse
from urllib.request import urlretrieve
import re
import os

In [2]:
# forum thread page (page 15 of the sale thread) whose images will be parsed and downloaded
url = 'https://www.reef2reef.com/threads/world-wide-corals-timesplitter-live-sale-3-000-frags-our-largest-ever.719326/page-15'

In [3]:
# check that the given url is valid
def is_url_valid(url):
    """
    Tell whether the given url has a valid format.

        Parameters:
            url (string): The url to inspect

        Returns:
            boolean: True when the url has a scheme as well as a netloc, otherwise False
    """

    parts = urlparse(url)

    # a url without a host part can never be valid, whatever its scheme
    if not parts.netloc:
        return False

    return bool(parts.scheme)

In [4]:
# get all of the images from the given url
# currently hardcoded for posts by 'WWC-BOT' and images stored in worldwidecorals.sirv.com/TSLS_20
def get_all_images(url, *, author='WWC-BOT',
                   image_prefix='worldwidecorals.sirv.com/TSLS_20/',
                   name_step=6, timeout=30):
    """
    Returns the name and image url of every matching image on the given page.

    Only divs with the forum's message-content classes whose
    'data-lb-caption-desc' attribute starts with `author` are searched, and
    only images whose 'data-url' attribute contains `image_prefix` are kept.
    The default values are the ones this function previously hardcoded.

        Parameters:
            url (string): A string containing the url
            author (string): Text the 'data-lb-caption-desc' attribute of a post must start with
            image_prefix (string): Text an image's 'data-url' must contain to be kept
            name_step (int): Keep one bold tag out of every `name_step` as an image name
            timeout (number): Seconds to wait for the server before giving up

        Returns:
            image_links (list): A list of [names, image urls] from the page

        Raises:
            requests.HTTPError: If the page answers with an HTTP error status
            ValueError: If the number of names and the number of images differ
    """

    # a timeout keeps a stalled server from hanging the call forever, and
    # checking the status avoids parsing an error page as if it were the thread
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = bs(response.content, "html.parser")

    # one lookup serves both the names and the images
    posts = soup.find_all('div',
                          attrs={'class': 'message-userContent lbContainer js-lbContainer',
                                 'data-lb-caption-desc': re.compile('^' + re.escape(author))})

    names = []
    images = []
    for post in posts:
        # extract the text of every bold tag; the names are picked out below
        names.extend(bold.text for bold in post.find_all('b'))

        # extract links to each of the images
        for image in post.find_all('img'):
            # not every <img> carries a data-url attribute
            data_url = image.get('data-url') or ''
            if image_prefix in data_url:
                # drop the query string, if there is one
                images.append(data_url.split('?', 1)[0])

    # NOTE(review): presumably every listing holds `name_step` bold tags with
    # the name first - confirm against the page markup
    names_trimmed = names[::name_step]

    # pairing lists of different lengths would attach names to the wrong images
    if len(names_trimmed) != len(images):
        raise ValueError(
            'found {} names but {} images on {}'.format(
                len(names_trimmed), len(images), url))

    return [list(pair) for pair in zip(names_trimmed, images)]

In [5]:
# scrape the page at `url` and keep the resulting list of [name, image url] pairs
image_links = get_all_images(url)

In [14]:
def download_images(image_links, dest_dir='./scraped_images/'):
    """
    Downloads all images provided in the list of image names and urls.

    Every file is saved inside `dest_dir` under the last segment of its url
    path; the name stored in each entry is not used for the filename, so two
    urls ending in the same segment write to the same file.

        Parameters:
            image_links (list): A list of [names, image urls]
            dest_dir (string): Folder the images are saved in; created when missing
    """

    # urlretrieve does not create folders, so make sure the target exists first
    os.makedirs(dest_dir, exist_ok=True)

    for entry in image_links:
        link = entry[1]
        filename = os.path.join(dest_dir, urlparse(link).path.split('/')[-1])
        urlretrieve(link, filename)

In [15]:
# download every image listed in image_links
download_images(image_links)