# Coral ID - Web Scraping

Live sales take place on the Reef2Reef forums.  Vendors post pictures of coral frags (small pieces of coral), along with their names, for sale.  The goal is to capture the pictures and the names to create a training set for a coral identification machine learning algorithm.

The code below scrapes these pictures and names from one particular thread on Reef2Reef.  In the future, it can be generalized to scrape other World Wide Corals (WWC) threads on Reef2Reef as well as threads from other vendors.

In [None]:
# imports
import argparse
import csv
import os
import re
import time

from bs4 import BeautifulSoup as bs
import requests
from urllib.parse import urlparse
from urllib.request import urlretrieve

In [None]:
# Scrape settings for the Timesplitter live sale thread.
# page from which to parse and download images; main() splits this on
# '/page-' to get the base url and the page number to start from
url = 'https://www.reef2reef.com/threads/world-wide-corals-timesplitter-live-sale-3-000-frags-our-largest-ever.719326/page-15'
# author of the posts we are pulling; get_all_images() keeps only posts whose
# 'data-lb-caption-desc' attribute starts with this string
poster = 'WWC-BOT'
# location of the stored images; an <img> is kept only when this string
# appears in its 'data-url' attribute
image_loc = 'worldwidecorals.sirv.com/TSLS_20'

In [None]:
# second thread for testing (Tax Craze live sale)
# running this cell replaces any url, poster and image_loc assigned earlier
url = 'https://www.reef2reef.com/threads/world-wide-corals-tax-craze-live-sale-2300-frags-discounted-beyond-belief.552524/page-24'
poster = 'WWC'
image_loc = 'worldwidecorals.sirv.com/Tax_Craze_2019'

In [None]:
# third thread (Spring Lightning sale); running this cell replaces any url,
# poster and image_loc assigned earlier
url = 'https://www.reef2reef.com/threads/wwc-spring-lightning-sale-800-frags-up-to-75-off.712595/page-13'
# alternative starting page further into the same thread:
#url = 'https://www.reef2reef.com/threads/wwc-spring-lightning-sale-800-frags-up-to-75-off.712595/page-22'

poster = 'WWC'
image_loc = 'worldwidecorals.sirv.com/Spring_Lightning_Sale_2020'

In [None]:
# check that the given url is valid
def is_url_valid(url):
    """
    Check that the given url is well formed and reachable.
    Checks for scheme and netloc, then requests the page and checks for a
    200 status code.

        Parameters:
            url (string): A string containing the url

        Returns:
            is_valid (boolean): True if the url has both a scheme and a
                netloc and a GET request for it returns status code 200,
                else False (including when the request itself fails)
    """

    parsed = urlparse(url)

    # without both a scheme and a netloc there is nothing to request
    if not (parsed.scheme and parsed.netloc):
        return False

    try:
        # the timeout keeps an unresponsive server from hanging the scrape
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # connection errors, timeouts, unsupported schemes, ...: not valid
        return False

    return response.status_code == 200

In [None]:
# get all of the images from the given url
def get_all_images(url, poster, image_loc, names_per_post=6):
    """
    Returns all image urls, paired with their names, from the given url.

    Only posts whose 'data-lb-caption-desc' attribute starts with `poster`
    are considered.  Within such a post an image is kept when `image_loc`
    appears in its 'data-url' attribute.  If a post yielded at least one
    image, the text of every <b> element in that post is collected, and
    every `names_per_post`-th collected text (starting with the first) is
    used as a name.

        Parameters:
            url (string): A string containing the url
            poster (string): A string containing the Reef2Reef author name of
                the post, such as 'WWC-BOT'
            image_loc (string): A string containing the location of the images,
                such as 'worldwidecorals.sirv.com/TSLS_20' from the data-url
                for the image
            names_per_post (int): Stride used to pick names out of the
                collected <b> texts; defaults to 6

        Returns:
            image_links (list): A list of [names, image urls] from the page,
                or an empty list when the number of names and the number of
                images differ
    """

    # the timeout keeps an unresponsive server from hanging the scrape
    soup = bs(requests.get(url, timeout=30).content, "html.parser")
    # re.escape so that regex metacharacters in the author name are matched
    # literally instead of being interpreted as a pattern
    posts = soup.find_all('div',
        attrs={'class': 'message-userContent lbContainer js-lbContainer',
               'data-lb-caption-desc': re.compile('^' + re.escape(poster))})

    # extract links to each of the images
    # when find an image, also extract the name for labels in training
    images = []
    names = []
    for post in posts:
        img_found = False
        for image in post.find_all('img'):
            data_url = image.get('data-url')
            src = image.get('src')
            # an <img> without these attributes is skipped on its own,
            # instead of silently aborting the rest of the post
            if not data_url or not src:
                continue
            if image_loc in data_url:
                images.append('https://www.reef2reef.com' + src)
                img_found = True
        if img_found:
            names.extend(bold.text for bold in post.find_all('b'))
    names_trimmed = names[0::names_per_post]

    # if the lengths of the two lists are the same, zip together
    if len(names_trimmed) == len(images):
        image_links = [list(pair) for pair in zip(names_trimmed, images)]
    else:
        image_links = []
        print('WARNING: images and names are not the same length',
              '\n', 'no image_links this page')

    return image_links

In [None]:
def download_images(image_links, out_dir='./scraped_images/'):
    """
    Downloads all images provided in the list of image names and urls.

    Each file is named after the last '%2F'-separated piece of the image
    url's query string, cut at '.jpg' and given a '.jpg' extension.

        Parameters:
            image_links (list): A list of [names, image urls]
            out_dir (string): Directory the images are written to; created
                if it does not exist.  Defaults to './scraped_images/'
    """

    # nothing to download: do not touch the file system at all
    if not image_links:
        return

    # open() does not create missing directories
    os.makedirs(out_dir, exist_ok=True)

    for link in image_links:
        image_url = link[1]
        # the timeout keeps an unresponsive server from hanging the scrape
        r = requests.get(image_url, timeout=30)
        if r.status_code != 200:
            # do not store an error page under a .jpg name
            print('WARNING: could not download ', image_url)
            continue
        basename = urlparse(image_url).query.split('%2F')[-1].split('.jpg')[0] + '.jpg'
        filename = os.path.join(out_dir, basename)
        with open(filename, 'wb') as outfile:
            outfile.write(r.content)

In [None]:
# append the coral names and their image urls to a csv file for later use
def output_names_files(image_links):
    """
    Appends one row per entry of image_links to 'coral_names_files.csv'.

    The file is opened in append mode in the current working directory and
    the fields of each row are separated by '|'.  Nothing is returned.

        Parameters:
            image_links (list): A list of [names, image urls]
    """

    with open('coral_names_files.csv', mode='a', newline='') as csv_file:
        pipe_writer = csv.writer(csv_file, delimiter='|')
        for row in image_links:
            pipe_writer.writerow(row)

In [None]:
# ad-hoc cell: append the current image_links to coral_names_files.csv
# NOTE(review): image_links is not assigned at top level in the cells shown;
# presumably it is left over from an interactive get_all_images() call - confirm
output_names_files(image_links)

In [None]:
# determine the total number of pages in this forum thread
def get_num_pages(url):
    """
    Returns the number of pages in this thread of the forum.

    The count is the largest integer found among the link texts inside the
    page's <ul class="pageNav-main"> elements.  When no such integer is
    found, the thread is treated as having a single page.

        Parameters:
            url (string): A string containing the url

        Returns:
            num_pages (int): The number of pages (1 if the page shows no
                numbered page links)
    """

    # the timeout keeps an unresponsive server from hanging the scrape
    soup = bs(requests.get(url, timeout=30).content, "html.parser")
    nums = []
    for nav in soup.find_all('ul', attrs={'class': 'pageNav-main'}):
        for link in nav.find_all('a'):
            try:
                nums.append(int(link.text))
            except ValueError:
                # link text that is not an integer is not a page number
                continue

    # default=1 avoids the ValueError max() raises on an empty list
    num_pages = max(nums, default=1)
    return num_pages

In [None]:
# ad-hoc cell: display the page count of the thread currently assigned to url
get_num_pages(url)

In [None]:
# loop through all of the pages and get images from the entire forum thread
def main(url, poster, image_loc):
    """
    Loops through the pages in this thread of the forum to get all images.

    Starting at the page named in `url`, every page up to the last page of
    the thread is scraped: its images are downloaded and its names and image
    urls are appended to the csv file.  If the starting url is not valid a
    warning is printed and nothing is scraped.

        Parameters:
            url (string): A string containing the url with the starting page
                for scraping (a url without a '/page-N' part starts at
                page 1)
            poster (string): A string containing the Reef2Reef author name
                of the post, such as 'WWC-BOT'
            image_loc (string): A string containing the location of the
                images, such as 'worldwidecorals.sirv.com/TSLS_20' from the
                data-url for the image
    """

    # without a valid starting url there is no page range to loop over
    if not is_url_valid(url):
        print('WARNING: starting url is invalid, nothing scraped')
        return

    # find first page, last page, and base url
    last_page = get_num_pages(url)
    base_url, page_sep, page_part = url.partition('/page-')
    # only the leading digits count, so a trailing fragment is tolerated
    page_match = re.match(r'\d+', page_part)
    first_page = int(page_match.group()) if page_match else 1
    if not page_sep:
        # avoid a double slash when '/page-N' is appended below
        base_url = base_url.rstrip('/')

    # loop through pages from first to last, pausing periodically
    print('Beginning scrape at page ', first_page)
    for i in range(first_page, last_page + 1):
        # implement periodic pause
        if i % 20 == 0:
            print('pausing for 30 seconds at page ', i)
            time.sleep(30)
        # get url for the current page of the forum and scrape
        current_url = base_url + '/page-' + str(i)
        if is_url_valid(current_url):
            image_links = get_all_images(current_url, poster, image_loc)
            if not image_links:
                print('page ', i, ' has no image links')
            download_images(image_links)
            output_names_files(image_links)
        else:
            print('WARNING: page ', i, ' has invalid url')
    print('End of scrape.')