# 1: Get images and text descriptions from Mushroom.World

References
* https://hackersandslackers.com/scraping-urls-with-beautifulsoup/
* https://wodan.xyz/python-how-to-download-all-the-images-from-the-website/
* https://github.com/Msalmannasir/Google_image_scraper

In [1]:
import os
import sys
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [None]:
#----------------------------------------------------#
#                   SETTINGS
#----------------------------------------------------#
# Start URL: the Mushroom.World page that is fetched first and whose links
# are followed to the individual mushroom pages. Adjust to our needs.
url = "http://www.mushroom.world/mushrooms/namelist"
# Site root: prefixed to relative image paths and used to filter the links
# found on the start page.
tld = "http://www.mushroom.world/"
# Local folder that receives every downloaded image.
save_directory = "./mushie_image_data/"

# Spoofed HTTP headers to give us access to the page
# NOTE(review): the Access-Control-* entries are CORS *response* headers;
# presumably only the User-Agent matters for a request — confirm before pruning.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

In [None]:
#----------------------------------------------------#
#                   SCRAPE_SHROOM
#       Takes a Mushroom World mushroom URL
#       Scrapes text data, Saves to CSV
#       Scrapes images, saves locally
#----------------------------------------------------#
def scrape_shroom(url):
    """Scrape one Mushroom.World species page.

    Downloads the page at ``url``, extracts the Latin/English names and the
    edibility rating, saves every "swipebox" image into ``save_directory``
    and returns one metadata row per image.

    Args:
        url: Absolute URL of a Mushroom.World mushroom page.

    Returns:
        A ``pandas.DataFrame`` with the columns ``latin_name``,
        ``english_name``, ``edibility``, ``filename``, ``mushroomworld_url``
        and ``image_url``. The frame is empty when the page has no name
        caption (it is not a mushroom page) or lists no images.

    Raises:
        IndexError: If the page has a caption but fewer than four
            "textus" divs, so the edibility attribute cannot be located.
    """
    columns = ['latin_name',
               'english_name',
               'edibility',
               'filename',
               'mushroomworld_url',
               'image_url']

    # First, download page HTML for parsing.
    # ``headers`` must be passed by keyword: the second positional argument
    # of requests.get() is ``params`` (the query string), not the headers.
    req = requests.get(url, headers=headers)
    soup = bs(req.content, 'html.parser')

    #----------- TEXT DATA WRANGLING ----------#
    # Get the name, both english and latin
    caption = soup.find("div", class_="caption")
    if caption is None:
        # Not a mushroom page (e.g. a navigation link): nothing to scrape
        return pd.DataFrame(columns=columns)
    name = caption.get_text().lstrip()

    # The caption looks like "Latin name (English name)". Anything that does
    # not split into exactly two parts is kept whole as the latin name.
    name_parts = name.split("(")
    if len(name_parts) == 2:
        latin_name = name_parts[0].rstrip()
        english_name = name_parts[1].rstrip().rstrip(")")
    else:
        latin_name = name
        english_name = "N/A"

    # Get the edibility (it's always the fourth attribute on the information header)
    edibility = soup.find_all("div", class_="textus")[3].get_text()
    # Drop a single trailing parenthesised remark, if there is exactly one
    edibility_parts = edibility.split(" (")
    if len(edibility_parts) == 2:
        edibility = edibility_parts[0]

    #----------- IMAGE DATA WRANGLING ----------#
    rows = []
    # Every "swipebox" anchor references one full-size image
    for image in soup.find_all("a", class_="swipebox"):
        href = image.get("href")
        if not href:
            continue

        # The href is relative ("../data/fungi/<file>"); drop the leading
        # dots/slashes and prefix the site root to get the absolute URL
        img_url = tld + href.lstrip('./')

        # The filename is the last path component. str.lstrip() strips a
        # *set of characters*, not a prefix, so it must not be used here:
        # it would also eat leading letters of the filename itself.
        img_filename = href.rsplit('/', 1)[-1]
        if not img_filename:
            continue

        # Download once and reuse the response body. Failures are reported
        # but do not abort the scrape of the remaining images.
        try:
            response = requests.get(img_url)
            if response.status_code == 200:
                with open(os.path.join(save_directory, img_filename), 'wb') as f:
                    f.write(response.content)
            else:
                print("could not download {} (HTTP {})".format(
                    img_url, response.status_code))
        except (requests.RequestException, OSError) as err:
            print("could not download {}: {}".format(img_url, err))

        # The metadata row is recorded even if the download failed, so the
        # image URL stays available for a later retry
        rows.append({'latin_name': latin_name,
                     'english_name': english_name,
                     'edibility': edibility,
                     'filename': img_filename,
                     'mushroomworld_url': url,
                     'image_url': img_url})

    # Build the frame in one go (DataFrame.append was removed in pandas 2.0)
    return pd.DataFrame(rows, columns=columns)

### Driver

In [None]:
#----------------------------------------------------#
#                    MAIN DRIVER
#----------------------------------------------------#
# Create the image directory if it is missing (no error if it already exists)
os.makedirs(save_directory, exist_ok=True)

# Get the URL to each mushroom page.
# ``headers`` must be passed by keyword: the second positional argument of
# requests.get() is ``params`` (the query string), not the HTTP headers.
first_req = requests.get(url, headers=headers)
first_soup = bs(first_req.content, 'html.parser')
mushroom_list = first_soup.find_all("a")

# For each URL, scrape the desired information using our function
scraped_frames = []
for mushroom in mushroom_list:
    href = mushroom.get("href")
    # But make sure that the URL is one we want! Anchors without an href
    # return None and are skipped along with off-site links.
    if href and tld in href:
        df_out = scrape_shroom(href)
        if not df_out.empty:
            scraped_frames.append(df_out)

# Combine everything into the Dataframe that stores all the info.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# ignore_index gives the combined frame one unique running index.
if scraped_frames:
    df = pd.concat(scraped_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['latin_name',
                                'english_name',
                                'edibility',
                                'filename',
                                'mushroomworld_url',
                                'image_url'])

In [None]:
# Load the scraped metadata from the CSV stored alongside the images;
# this replaces whatever `df` currently holds in memory.
df = pd.read_csv("./mushie_image_data/_scraped_data.csv")

In [None]:
# Remove newline characters embedded in the scraped text. regex=True makes
# replace() match '\n' inside longer strings rather than only whole-cell values.
df = df.replace('\n','', regex=True)

In [None]:
# Preview the first 30 rows to sanity-check the cleaned data
df.head(30)

In [None]:
# Print the column dtypes and non-null counts of the dataframe
df.info()

---
659 images aren't nearly enough to train a computer vision model.

However, now that we have species names and edibility information, it's possible to simply search through Google Images and download more photos of each species.

---
# 2: Scrape more images from Google Images

In [None]:
# Build the species lookup: one row per distinct latin name, paired with the
# edibility rating taken from that name's first occurrence in the dataframe.
# This mapping drives the search for additional images of each species.
df_uniq = df[['latin_name', 'edibility']].drop_duplicates(subset='latin_name', keep='first')

In [None]:
# Render the species/edibility lookup table in the notebook output
display(df_uniq)

We'll use Selenium to create a Google Images scraper. If you want to run this on your device, you'll need to download the appropriate [ChromeDriver executable](http://chromedriver.chromium.org/downloads) for your OS and Chrome version, then place the executable in the "dataset" folder of this repo's file tree.

This will allow Selenium to programmatically navigate within the Chrome browser, so you don't have to do any manual clicking.

In [None]:
from selenium import webdriver
import shutil
import time

def get_from_gimages(name, edibility):
    """Download Google Images results for one mushroom species.

    Opens a Chrome window on the Google Images search for ``name``, clicks
    through the result thumbnails, saves each full-size image into
    ``save_directory`` and returns the metadata of the saved images.

    Args:
        name: Latin species name; used as the search query and, with the
            spaces removed, as the prefix of the saved filenames.
        edibility: Edibility rating copied into every returned row.

    Returns:
        A ``pandas.DataFrame`` with the columns ``latin_name``,
        ``english_name``, ``edibility``, ``filename``, ``mushroomworld_url``
        and ``image_url``. ``english_name`` and ``mushroomworld_url`` are
        left empty (NaN); only successfully downloaded images get a row.
    """
    columns = ['latin_name',
               'english_name',
               'edibility',
               'filename',
               'mushroomworld_url',
               'image_url']

    # Replace any spaces in the name with + to match the google image URL query format
    mushie_name_url = name.replace(" ", "+")
    url = 'https://www.google.com/search?q=' + mushie_name_url + '&source=lnms&tbm=isch&sa=X&ved=2ahUKEwie44_AnqLpAhUhBWMBHUFGD90Q_AUoAXoECBUQAw&biw=1920&bih=947'

    rows = []

    # Make selenium open up a chrome page with the URL we built
    # NOTE(review): passing the driver path positionally is the Selenium 3
    # call style — confirm against the installed Selenium version.
    driver = webdriver.Chrome('./chromedriver')
    try:
        # Give the results page time to load
        driver.get(url)
        time.sleep(1)

        # Walk result positions 1..119 of the search page
        for j in range(1, 120):
            print("{}: {}".format(name, j))
            # google puts "related searches" boxes every 25th element
            # we want to avoid those because they screw up the crawl
            if j % 25 == 0:
                continue

            # Click on the thumbnail and read the source of the full-size image.
            # find_element("xpath", ...) exists in both Selenium 3 and 4.
            try:
                thumbnail = driver.find_element(
                    "xpath",
                    '//div//div//div//div//div//div//div//div//div//div['+str(j)+']//a[1]//div[1]//img[1]')
                thumbnail.click()
                time.sleep(1)
                img = driver.find_element(
                    "xpath",
                    '//body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div[1]/div[1]/div/div[2]/a/img'
                ).get_attribute("src")
            except Exception:
                # Best effort: a missing/unclickable result must not stop the
                # crawl. ``Exception`` (not a bare except) keeps Ctrl-C working.
                print("something went wrong with clicking image")
                continue

            # Try to download the image from its source
            filename = '{}{}_gimages.jpg'.format(name.replace(" ", ""), j)
            image_save_path = os.path.join(save_directory, filename)
            try:
                with requests.get(img, stream=True) as response:
                    # Do not save HTTP error pages as .jpg files
                    response.raise_for_status()
                    # Have urllib3 undo gzip/deflate transfer encoding so the
                    # bytes written to disk are the actual image
                    response.raw.decode_content = True
                    with open(image_save_path, 'wb') as file:
                        shutil.copyfileobj(response.raw, file)
            except Exception:
                print("something went wrong with downloading image")
                continue

            # Record the metadata of the image we just saved
            rows.append({'latin_name': name,
                         'edibility': edibility,
                         'filename': filename,
                         'image_url': img})
    finally:
        # Always close the browser; otherwise every species leaves a Chrome
        # window and chromedriver process behind
        driver.quit()

    # Build the frame in one go (DataFrame.append was removed in pandas 2.0)
    return pd.DataFrame(rows, columns=columns)
#end

In [None]:
# Run the Google Images scraper once per species in the lookup table
for species in df_uniq.itertuples(index=False):
    new_img_df = get_from_gimages(species.latin_name, species.edibility)

    # Append this species' image rows to the main dataframe
    df = pd.concat([df, new_img_df], ignore_index=True)

    # Show the growing dataframe after each species
    display(df)

In [None]:
# Now save the data file
# NOTE(review): to_csv writes the row index by default, so reading this file
# back with a plain pd.read_csv() yields an extra "Unnamed: 0" column —
# confirm whether index=False is wanted here.
df.to_csv("./mushie_image_data/_scraped_data.csv")