In [None]:
from bs4 import BeautifulSoup
import requests
import random
import math

# Mushroom world scraper:
---

In [None]:
def mw_scraper(m_type, pics_list, page_range):
  '''
  Collects the picture <img> tags from mushroom.world search result pages.

  Nothing is returned; the matching tags are appended to pics_list in place.

  Arguments:
    m_type: str, category part of the search url (the notebook passes
      'edible' and 'poisonous')
    pics_list: list that receives the matching tags
    page_range: iterable of page numbers to request; page 0 is requested
      without a '?page=' parameter
  '''
  for page in page_range:
    url = 'http://www.mushroom.world/mushrooms/' + m_type
    if page != 0:
      # every page after the first is addressed through the page parameter
      url += '?page=' + str(page)
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    # keep only <img alt="Picture"> tags that carry a src attribute
    pics_list.extend(soup.find_all('img', {'alt':'Picture'}, src=True))

In [None]:
# one accumulator per category; mw_scraper fills them in place
mw_ed_image_details = []
mw_ined_image_details = []

# scrape result pages 0-7 for edible and 0-3 for poisonous mushrooms
mw_scraper('edible', mw_ed_image_details, range(8))
mw_scraper('poisonous', mw_ined_image_details, range(4))

In [None]:
# show the first scraped tag to see the markup the next cell has to handle
mw_ed_image_details[0]

<img alt="Picture" src="/../data/fungi/Agaricusarvensis1-thumb.JPG"/>

In [None]:
# build an absolute image url from each tag's src attribute.
# str.strip() treats its argument as a set of characters rather than a
# prefix/suffix, so stripping the tag markup off str(tag) can also remove
# characters from the ends of the path itself (for example the 'g' of a
# lower-case '.jpg'). Reading the parsed attribute avoids that; only the
# leading '/../' of the relative path is dropped before the site address
# is prepended.
mw_ed_links = ['http://www.mushroom.world/' + tag['src'].lstrip('/.') for tag in mw_ed_image_details]
mw_ined_links = ['http://www.mushroom.world/' + tag['src'].lstrip('/.') for tag in mw_ined_image_details]

# WildFood scraper:
---

In [None]:
# results page for edible mushrooms on wildfooduk.com: keep every
# <td class="mushroom-image"> cell (each one wraps the link to a details page)
url = 'https://www.wildfooduk.com/mushroom-guide/?mushroom_type=edible'
response = requests.get(url)
content = response.text
soup = BeautifulSoup(content, 'html.parser')
wf_edible_links = list(soup.find_all('td', attrs={'class':'mushroom-image'}))

# same again for the poisonous results page
url = 'https://www.wildfooduk.com/mushroom-guide/?mushroom_type=poisonous'
response = requests.get(url)
content = response.text
soup = BeautifulSoup(content, 'html.parser')
wf_inedible_links = list(soup.find_all('td', attrs={'class':'mushroom-image'}))

In [None]:
# show the first table cell to see where the details-page link sits
wf_inedible_links[0]

<td class="mushroom-image">
<a href="https://www.wildfooduk.com/mushroom-guide/inky-mushroom/">
<img alt="Inky Mushroom" class="attachment-custom_mushroom_thumb size-custom_mushroom_thumb wp-post-image" height="166" sizes="(max-width: 250px) 100vw, 250px" src="https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-1-250x166.jpg" srcset="https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-1-250x166.jpg 250w, https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-1-434x288.jpg 434w" title="Inky Mushroom" width="250"/> </a>
</td>

In [None]:
# get the link to each mushroom's details page from the <a> inside each cell.
# The parsed href attribute is read directly: splitting str(tag) on '"' only
# works while the attributes are serialized in one particular order.
# Cells without a linked <a> are skipped.
wf_ed_links = [cell.a['href'] for cell in wf_edible_links
               if cell.a is not None and cell.a.has_attr('href')]
wf_ined_links = [cell.a['href'] for cell in wf_inedible_links
                 if cell.a is not None and cell.a.has_attr('href')]
wf_ined_links[0]

'https://www.wildfooduk.com/mushroom-guide/inky-mushroom/'

In [None]:
def wildfood_scraper(link_list, list_name):
  '''
  Visits each wildfooduk.com details page and collects its thumbnail images.

  The matching <img> tags are appended to list_name in place; nothing is
  returned.

  Arguments:
    link_list: iterable of urls, one per mushroom details page
    list_name: list that receives the matching tags
  '''
  for page_url in link_list:
    page = BeautifulSoup(requests.get(page_url).text, 'html.parser')
    # thumbnails are selected as <img width="132"> tags that have a src
    list_name.extend(page.find_all('img', {'width':'132'}, src=True))

In [None]:
# accumulators for the thumbnail tags of each category
wf_eds = []
wf_ineds = []
# visit every details page; the scraper appends to the lists in place
wildfood_scraper(wf_ed_links, wf_eds)
wildfood_scraper(wf_ined_links, wf_ineds)
wf_ineds[0]

<img alt="" class="attachment-custom_gallery_thumb size-custom_gallery_thumb" height="99" sizes="(max-width: 132px) 100vw, 132px" src="https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-1-132x99.jpg" srcset="https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-1-132x99.jpg 132w, https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-1-547x410.jpg 547w, https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-1-720x540.jpg 720w, https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-1-300x225.jpg 300w, https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-1-768x576.jpg 768w, https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-1-1024x768.jpg 1024w, https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-1-200x150.jpg 200w, https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-1-552x414.jpg 552w, https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-1-60x45.jpg 60w, https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-1-533x400.jpg 53

In [None]:
# get the image link from each tag's src attribute.
# Splitting str(tag) on 'src' breaks as soon as that text shows up anywhere
# else in the tag (another attribute value, or the url itself), so the parsed
# attribute is used instead. Entries that are already plain strings are kept
# as they are, which makes it safe to run this cell more than once.
wf_eds = [link if isinstance(link, str) else link['src'] for link in wf_eds]
wf_ineds = [link if isinstance(link, str) else link['src'] for link in wf_ineds]
wf_ineds[:5]

['https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-1-132x99.jpg',
 'https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-2-132x99.jpg',
 'https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-3-132x99.jpg',
 'https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-6-132x99.jpg',
 'https://www.wildfooduk.com/wp-content/uploads/2018/01/Inky-4-132x99.jpg']

# LuontoPortti Scraper:
---

In [None]:
# results pages (values of the ?list= parameter) that hold edible mushrooms
ed_page_nums = [1,2,3,5,6]
# results pages (values of the ?list= parameter) that hold poisonous mushrooms
ined_page_nums = [8,9,10]

# accumulators for the <a rel="bookmark"> tags found on those pages
lp_edible_links = []
lp_inedible_links = []

# walk the edible pages first, then the poisonous ones, and file each page's
# bookmark anchors (the links to the details pages) under the matching list
for page_nums, bucket in ((ed_page_nums, lp_edible_links), (ined_page_nums, lp_inedible_links)):
  for i in page_nums:
    url = 'https://www.luontoportti.com/suomi/fi/sienet/?list=' + str(i)
    response = requests.get(url)
    content = response.text
    soup = BeautifulSoup(content, 'html.parser')
    bucket.extend(soup.find_all('a', {'rel':'bookmark'}))

In [None]:
# show the first bookmark anchor to see where the details-page link sits
lp_edible_links[0]

<a href="https://www.luontoportti.com/suomi/fi/sienet/herkkutatti" rel="bookmark">
<img alt="" height="100" src="https://www.luontoportti.com/suomi/images/27669t.jpg" width="100"/>
</a>

In [None]:
# get the link to each mushroom's details page from the anchor's href.
# The parsed attribute is read directly, because splitting str(tag) on '"'
# only works while href is the first attribute in the serialized tag.
# Anchors without an href are skipped.
lp_ed_links = [link['href'] for link in lp_edible_links if link.has_attr('href')]
lp_ined_links = [link['href'] for link in lp_inedible_links if link.has_attr('href')]
lp_ined_links[0]

'https://www.luontoportti.com/suomi/fi/sienet/kangaskarpassieni'

In [None]:
def luotoportti_scraper(links, list_name):
  '''
  Visits each luontoportti.com details page and collects the anchors that
  link to its pictures.

  The matching <a> tags are appended to list_name in place; nothing is
  returned.

  Arguments:
    links: iterable of urls, one per mushroom details page
    list_name: list that receives the matching tags
  '''
  for page_url in links:
    page = BeautifulSoup(requests.get(page_url).text, 'html.parser')
    # picture anchors are selected by their rel="shadowbox[-]smd" attribute
    list_name.extend(page.find_all('a', {'rel':'shadowbox[-]smd'}))

In [None]:
# accumulators for the picture anchors of each category
lp_eds = []
lp_ineds = []
# visit every details page; the scraper appends to the lists in place
luotoportti_scraper(lp_ed_links, lp_eds)
luotoportti_scraper(lp_ined_links, lp_ineds)
lp_eds[0]

<a href="https://www.luontoportti.com/suomi/images/18969.jpg" rel="shadowbox[-]smd" title=""><img alt="" src="https://www.luontoportti.com/suomi/images/18969t.jpg"/></a>

In [None]:
# get the image link from each anchor's href attribute.
# The parsed attribute is used instead of splitting str(tag) on '"', which
# depends on href being serialized first. Anchors without an href are
# skipped, and entries that are already plain strings are kept as they are,
# so the cell can be run more than once.
lp_eds = [link if isinstance(link, str) else link['href']
          for link in lp_eds if isinstance(link, str) or link.has_attr('href')]
lp_ineds = [link if isinstance(link, str) else link['href']
            for link in lp_ineds if isinstance(link, str) or link.has_attr('href')]
lp_ineds[:5]

['https://www.luontoportti.com/suomi/images/24377.jpg',
 'https://www.luontoportti.com/suomi/images/19085.jpg',
 'https://www.luontoportti.com/suomi/images/19069.jpg',
 'https://www.luontoportti.com/suomi/images/26643.jpg',
 'https://www.luontoportti.com/suomi/images/26645.jpg']

# Foraging Guide Scraper:
---

In [None]:
# results page for edible mushrooms on foragingguide.com: keep every
# <div class="info"> block (each one holds the link to a details page)
url = 'http://www.foragingguide.com/mushrooms/edible_by_common_name'
response = requests.get(url)
content = response.text
soup = BeautifulSoup(content, 'html.parser')
fg_ed_details = list(soup.find_all('div', attrs={'class':'info'}))

# same again for the poisonous results page
url = 'http://www.foragingguide.com/mushrooms/poisonous_by_common_name'
response = requests.get(url)
content = response.text
soup = BeautifulSoup(content, 'html.parser')
fg_ined_details = list(soup.find_all('div', attrs={'class':'info'}))

In [None]:
# show the first info block to see where the details-page link sits
fg_ined_details[0]

<div class="info"> <div><a href="/mushrooms/sp/beechwood_sickener"><span class="name">Beechwood Sickener</span></a> </div> <div><span class="name">(Russula nobilis)</span></div> <div class="freq_edib">common, poisonous</div></div>

In [None]:
# build the absolute url of each details page from the first linked <a>
# inside every info block. The parsed href attribute is read directly rather
# than cut out of str(tag) with split(), which raises IndexError for a block
# that has no link; such blocks are skipped here.
fg_ed_links = ['http://www.foragingguide.com' + div.find('a', href=True)['href']
               for div in fg_ed_details if div.find('a', href=True) is not None]
fg_ined_links = ['http://www.foragingguide.com' + div.find('a', href=True)['href']
                 for div in fg_ined_details if div.find('a', href=True) is not None]
fg_ined_links[:5]

['http://www.foragingguide.com/mushrooms/sp/beechwood_sickener',
 'http://www.foragingguide.com/mushrooms/sp/brown_roll_rim',
 'http://www.foragingguide.com/mushrooms/sp/common_inkcap',
 'http://www.foragingguide.com/mushrooms/sp/deathcap',
 'http://www.foragingguide.com/mushrooms/sp/turban_fungus']

In [None]:
def foragingguide_scraper(links, list_name):
  '''
  Visits each foragingguide.com details page and collects its thumbnail
  images.

  The matching <img> tags are appended to list_name in place; nothing is
  returned.

  Arguments:
    links: iterable of urls, one per mushroom details page
    list_name: list that receives the matching tags
  '''
  for page_url in links:
    page = BeautifulSoup(requests.get(page_url).text, 'html.parser')
    # thumbnails are selected as <img height="100"> tags
    list_name.extend(page.find_all('img', {'height':'100'}))

In [None]:
# accumulators for the thumbnail tags of each category
fg_eds = []
fg_ineds = []

# visit every edible details page, then every poisonous one; the scraper
# appends to the lists in place
foragingguide_scraper(fg_ed_links, fg_eds)
foragingguide_scraper(fg_ined_links, fg_ineds)
fg_ineds[0]

<img class="img_thumb" height="100" src="http://static.foragingguide.com/photos/mushrooms/beechwood_sickener/thumb/130.jpg" title="Click to enlarge" width="100"/>

In [None]:
# get the image url from each tag's src attribute.
# Taking the sixth piece of str(tag) split on '"' only works while every tag
# is serialized as class, height, src in that order; the scraper selects
# tags by height alone, so the parsed attribute is read instead and tags
# without a src are skipped. Entries that are already plain strings are kept
# as they are, so the cell can be run more than once.
fg_eds = [link if isinstance(link, str) else link['src']
          for link in fg_eds if isinstance(link, str) or link.has_attr('src')]
fg_ineds = [link if isinstance(link, str) else link['src']
            for link in fg_ineds if isinstance(link, str) or link.has_attr('src')]
fg_ineds[0]

'http://static.foragingguide.com/photos/mushrooms/beechwood_sickener/thumb/130.jpg'

In [None]:
# number of image links gathered per category across the four sites
eds = sum(len(site_links) for site_links in (mw_ed_links, wf_eds, lp_eds, fg_eds))
ineds = sum(len(site_links) for site_links in (mw_ined_links, wf_ineds, lp_ineds, fg_ineds))
print(f'Image results contain: {eds} edible mushrooms and {ineds} poisonous mushrooms')

Image results contain: 4403 edible mushrooms and 1054 poisonous mushrooms


# Store images in Drive:
---

In [None]:
def write_files(train_val, m_type, links):
  '''  
  Requests each image and stores it in the designated google drive folder

  Files are named '<m_type>_<index>.jpg', where index is the position of the
  link in links. Links that cannot be fetched, or whose response is not ok,
  are skipped, so the numbering may have gaps.

  Arguments:
    train_val: str, sub-folder to store in (the notebook passes 'm_train'
      and 'm_val')
    m_type: str, category folder and file name prefix (the notebook passes
      'edible' and 'poisonous')
    links: iterable of image urls to be requested
  '''
  import os  # local import: the notebook's import cell does not bring in os

  PATH = '/content/drive/MyDrive/Data_Science_Course/Capstones/Capstone_4/new_data/mush/'+ train_val + '/' + m_type + '/'
  # open() does not create missing folders, so make sure the target exists
  os.makedirs(PATH, exist_ok=True)
  for i, link in enumerate(links):
    try:
      response = requests.get(link)
    except requests.RequestException:
      # an unreachable link is treated like a non-ok response: skip it
      # instead of aborting the remaining downloads
      continue
    if response.ok:
      img_data = response.content
      name = m_type + '_' + str(i) + '.jpg'
      with open(PATH + name, 'wb') as f:
        f.write(img_data)

In [None]:
def store_images(m_type, links):
  '''
  Divides links into training and validation sets and then runs each set
  through the write_files function

  Duplicate links are dropped first. A random 20% (rounded down) of the
  remaining links goes to validation and the rest to training.

  Arguments:
    m_type: str, category name handed on to write_files (the notebook passes
      'edible' and 'poisonous')
    links: list or set of links for images to be requested
  '''
  # random.sample needs a sequence (sets raise TypeError from Python 3.11 on)
  # and the notebook passes sets, so work on a sorted list of unique links
  unique_links = sorted(set(links))

  # find number of images to be used for validation
  num = math.floor(len(unique_links)*.2)

  # get random sample of 20% to be used as validation data
  val_links = random.sample(unique_links, num)

  # use remaining data for training; membership is tested against a set so
  # the split stays linear in the number of links
  val_lookup = set(val_links)
  tr_links = [link for link in unique_links if link not in val_lookup]

  write_files('m_train', m_type, tr_links)
  write_files('m_val', m_type, val_links)

In [None]:
# merge the per-site link lists into one set per category (drops duplicates)
eds = {*mw_ed_links, *wf_eds, *lp_eds, *fg_eds}
ineds = {*mw_ined_links, *wf_ineds, *lp_ineds, *fg_ineds}

In [None]:
# split and download the images of both categories, edible first
for m_type, links in (('edible', eds), ('poisonous', ineds)):
  store_images(m_type, links)

# Info:
---

In [None]:
def mw_info_scraper(m_type, info_list, page_range):
  '''
  Collects the description texts from mushroom.world search result pages.

  Nothing is returned; the text of every <div class="longtextus"> is
  appended to info_list in place.

  Arguments:
    m_type: str, category part of the search url (the notebook passes
      'edible' and 'poisonous')
    info_list: list that receives the description strings
    page_range: iterable of page numbers to request; page 0 is requested
      without a '?page=' parameter
  '''
  for page in page_range:
    url = 'http://www.mushroom.world/mushrooms/' + m_type
    if page != 0:
      # every page after the first is addressed through the page parameter
      url += '?page=' + str(page)
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    # only the text content of each description block is stored, not the tag
    info_list.extend(tag.get_text() for tag in soup.find_all('div', {'class':'longtextus'}))

In [None]:
# description texts from mushroom.world, one list per category
mw_ed_info = []
mw_ined_info = []
mw_info_scraper('edible', mw_ed_info, range(8))
mw_info_scraper('poisonous', mw_ined_info, range(4))

In [None]:
# write every edible description to its own numbered text file on drive
PATH = '/content/drive/MyDrive/Data_Science_Course/Capstones/Capstone_4/mushroom_info/ed_info/'
for i, j in enumerate(mw_ed_info):
  # the text is encoded by hand, so the file is opened in binary mode
  encoded = j.encode()
  with open(PATH + f'edible{i}.txt', 'wb') as f:
    f.write(encoded)

# same for the inedible descriptions, in their own folder
PATH = '/content/drive/MyDrive/Data_Science_Course/Capstones/Capstone_4/mushroom_info/ined_info/'
for i, j in enumerate(mw_ined_info):
  encoded = j.encode()
  with open(PATH + f'inedible{i}.txt', 'wb') as f:
    f.write(encoded)

# Names:
---

In [None]:
def mw_name_scraper(m_type, name_list, page_range):
  '''
  Collects the caption blocks (they contain the mushroom names) from
  mushroom.world search result pages.

  Nothing is returned; every <div class="caption"> tag is appended to
  name_list in place.

  Arguments:
    m_type: str, category part of the search url (the notebook passes
      'edible' and 'poisonous')
    name_list: list that receives the caption tags
    page_range: iterable of page numbers to request; page 0 is requested
      without a '?page=' parameter
  '''
  for page in page_range:
    url = 'http://www.mushroom.world/mushrooms/' + m_type
    if page != 0:
      # every page after the first is addressed through the page parameter
      url += '?page=' + str(page)
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    # the whole caption tag is stored; callers extract the name afterwards
    name_list.extend(soup.find_all('div', {'class':'caption'}))

In [None]:
# collect the caption tags for poisonous mushrooms from mushroom.world
mw_ined_names = []
mw_name_scraper('poisonous', mw_ined_names, range(0,4))

# reduce each caption tag to a lower-cased name: take the text between the
# second and third '>' of the serialized tag, keep the part after its first
# '\r\n' line break and trim surrounding spaces
# NOTE(review): this relies on the exact caption markup, which is not shown
# in this notebook - confirm against the site if names come out wrong
mw_ined_names = [str(i).split('>')[2].split('\r\n')[1].strip(' ').lower() for i in mw_ined_names]

In [None]:
# poisonous results page on wildfooduk.com: keep every
# <td class="spotlight-text"> cell
url = 'https://www.wildfooduk.com/mushroom-guide/?mushroom_type=poisonous'
response = requests.get(url)
content = response.text
soup = BeautifulSoup(content, 'html.parser')
wf_ined_list = list(soup.find_all('td', attrs={'class':'spotlight-text'}))

In [None]:
# A later cell reads wf_ined_names, so the definition has to run; while it
# was commented out, a fresh kernel stopped there with a NameError.
# Each name is taken from the third line of the serialized cell, up to the
# next '<', then trimmed and lower-cased.
# NOTE(review): relies on the exact cell markup, which is not shown in this
# notebook - check the preview below
wf_ined_names = [str(name).split('\n')[2].split('<')[0].strip().lower() for name in wf_ined_list]
wf_ined_names[:5]

In [None]:
# collect the caption tags for edible mushrooms from mushroom.world.
# The edible listing is scraped with range(0,8) in the picture and
# description cells of this notebook; range(0,4) here only covered the
# first half of those result pages.
mw_ed_names = []
mw_name_scraper('edible', mw_ed_names, range(0,8))

# reduce each caption tag to a lower-cased name (same parsing as for the
# poisonous names)
mw_ed_names = [str(i).split('>')[2].split('\r\n')[1].strip(' ').lower() for i in mw_ed_names]
mw_ed_names[:10]

['agaricus arvensis',
 'agaricus augustus',
 'albatrellus ovinus',
 'armillaria mellea',
 'boletus badius',
 'boletus edulis',
 'boletus pinophilus',
 'boletus subtomentosus',
 'bondarzewia berkeleyi',
 'bovista plumbea']

In [None]:
# mw_ed_names

In [None]:
# wildfood names that the mushroom.world list does not already contain
new = [name for name in wf_ined_names if name not in mw_ined_names]

In [None]:
# combined list of poisonous mushroom names from both sites.
# The list was previously bound to the name all, which replaced the built-in
# all() for every cell run afterwards in the same kernel.
all_ined_names = new + mw_ined_names

In [None]:
# display the poisonous names scraped from mushroom.world
mw_ined_names

['amanita muscaria',
 'amanita pantherina',
 'amanita phalloides',
 'amanita porphyria',
 'amanita regalis',
 'amanita rubescens',
 'amanita virosa',
 'boletus satanas',
 'clathrus ruber',
 'coprinopsis atramentaria',
 'cortinarius orellanus',
 'cortinarius rubellus',
 'cortinarius semisanguineus',
 'galerina marginata',
 'gyromitra esculenta',
 'gyromitra infula',
 'hebeloma crustuliniforme',
 'hebeloma mesophaeum',
 'hypholoma fasciculare',
 'inocybe lacera',
 'lactarius helvus',
 'macrolepiota  rachodes',
 'mycena pura',
 'paxillus involutus',
 'russula emetica',
 'stropharia hornemannii']