<a href="https://colab.research.google.com/github/17jmumford/scrape-canyons/blob/main/scrape_canyons.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Welcome!
This is a web scraper designed to collect data from the popular canyoneering website [bluugnome.com](https://www.bluugnome.com). It is currently a work in progress. This was built with assistance from ChatGPT.
### How to use this file
1. Select the 'open in colab' button to open the file in Google Colab. 
1. Hit the play buttons on all the boxes, in order.
1. Click the folder button on the left side.
1. You should see 'canyon_data.csv' in the file list. (Click the folder icon with the refresh arrow if it does not appear right away.)
1. Hover over the file and three dots should appear. Click on the three dots and select 'download'. 

Warning: these instructions are for the completed web scraper. Data has errors and requires manual correction. Users should carefully research canyon beta on the source websites and take all precautions before going in a canyon. Canyoneering is dangerous!

### imports

In [52]:
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Site Scraper

### Scraping the parent page for child pages

In [2]:
# Download the index page and collect one absolute URL per link found inside
# its main content container.
# A timeout keeps the cell from hanging indefinitely if the site stops responding.
parentSoup = BeautifulSoup(requests.get('http://www.bluugnome.com/cyn_route/canyon-area-list.aspx', timeout=30).text, 'html.parser')
routes = parentSoup.find("div", attrs={"id": "div_content"})

regionStart = 'http://www.bluugnome.com/cyn_route/'

# href=True skips anchors that carry no href attribute; concatenating their
# missing (None) href onto regionStart would otherwise raise a TypeError.
regionLinks = [
  regionStart + link.get('href')
  for link in routes.find_all('a', href=True)
]


### Scraping an individual page

In [53]:
# Legacy link prefix. Canyon links used to be built by prepending this fixed
# 'dv' region path (with no URL scheme) to every href, whatever region was
# being scraped. addRegion now resolves each href against the URL of the page
# it came from; the name is kept only so existing references do not break.
hrefStart = 'bluugnome.com/cyn_route/dv/'

def _cleanText(text):
  """Trim `text`, turn newlines into spaces and drop carriage returns/tabs."""
  return text.strip().replace('\n', ' ').replace('\r', '').replace('\t', '')


def _parseRegionTitle(pageSoup):
  """Return (state, region) read from the page's main title.

  The title text is expected to look like '<label>: <region>, <state>.'.
  ('ERROR', 'ERROR') is returned when the heading is missing or does not
  have that shape.
  """
  heading = pageSoup.find('h1', {'class' : 'h_general_main_title'})
  if heading is None or ':' not in heading.text:
    return 'ERROR', 'ERROR'
  title = heading.text.split(':')[1].split(',')
  if len(title) > 1:
    return _cleanText(title[1]).replace('.', ''), _cleanText(title[0])
  return 'ERROR', 'ERROR'


def _parseRappels(trail):
  """Return (rappel count, longest rappel) for one route paragraph.

  The first number after the 'Rappels -' label is taken as the count and the
  second, if any, as the longest rappel. (0, 0) is returned when the label or
  its text is missing.
  """
  label = trail.find('strong', string='Rappels -')
  if not label:
    return 0, 0
  details = label.next_sibling
  # next_sibling may be another tag or None rather than text.
  if not isinstance(details, str):
    return 0, 0
  # Whole digit runs are extracted so a lone number such as "12" is never
  # split into "1" and "2", and numbers separated by a newline are still found.
  numbers = re.findall(r'\d+', details)
  if not numbers:
    return 0, 0
  longest = int(numbers[1]) if len(numbers) > 1 else 0
  return int(numbers[0]), longest


def addRegion(link, data):
  """Scrape one region page and append a row per canyon route to `data`.

  Parameters:
    link: URL of the region page; canyon hrefs are resolved against it.
    data: dict mapping column name -> list, extended in place.

  appendRating and appendSibling are called here without a column dict of
  their own, so they write to the module-level `data`; pass that same dict
  as `data` or the columns will get out of step.

  Every canyon produces at least one row: if no route paragraph containing a
  <strong> label is found, the route-level columns receive placeholders
  ('ERROR' for text columns, 0 for rappel columns) so all columns keep the
  same length.
  """
  print('Current page: ', link)
  pageSoup = BeautifulSoup(requests.get(link, timeout=30).text, 'html.parser')

  state, region = _parseRegionTitle(pageSoup)
  # The area heading applies to every canyon listed after it until the next
  # heading, so the value carries over between iterations.
  area = '-'
  canyons = pageSoup.find_all("div", {"class": "div_tr-listing"})
  for canyon in canyons:
    anchor = canyon.find('a')

    data["State"].append(state)
    data["Region"].append(region)
    data['Canyon'].append(_cleanText(anchor.text))
    # Resolve the (possibly relative) href against the page it was found on.
    data['Link'].append(urljoin(link, anchor.get('href')))

    # Two siblings back is expected to be the <h2> area heading when the
    # canyon is the first one listed under a new area.
    heading = canyon.previous_sibling
    if heading is not None:
      heading = heading.previous_sibling
    if getattr(heading, 'name', None) == 'h2':
      area = _cleanText(canyon.previous_element.previous_element)
    data['Area'].append(area)

    rating = _cleanText(canyon.find("h3", class_="h_tr-listing").text)

    # Drop everything before the first digit found, trying '1'..'4' in order.
    for start in ['1', '2', '3', '4']:
      position = rating.find(start)
      if position != -1:
        rating = rating[position:]
        break

    appendRating('Difficulty', rating, ['4', '3', '2', '1'])
    appendRating('Wetness', rating, ['C', 'B', 'A'])
    appendRating('Time', rating, ['IV', 'VI', 'V', 'III', 'II', 'I'])
    if 'RK' in rating:
      data['Risk'].append('')
    elif 'R' in rating:
      data['Risk'].append('R')
    elif 'X' in rating:
      data['Risk'].append('X')
    else:
      data['Risk'].append('')

    trails = canyon.find_all("p", {"class": "p_tr-listing"})
    if len(trails) == 0:
      trails = canyon.find_all("p", {"class": "p_general"})

    counter = 1
    for trail in trails:
      # Only paragraphs that contain a <strong> label describe a route.
      if not trail.find('strong'):
        continue

      if len(trails) > 1:
        data['Route'].append(('Route #' + str(counter)))
      else:
        data['Route'].append('Main route')
      if counter > 1:
        # Additional routes of the same canyon repeat the canyon-level values.
        for column in ("State", "Region", "Area", "Canyon", "Link",
                       "Difficulty", "Wetness", "Time", "Risk"):
          data[column].append(data[column][-1])

      appendSibling('Distance', trail)
      appendSibling('Shuttle', trail)
      appendSibling('Vehicle', trail)
      appendSibling('Permit', trail)
      appendSibling('Elevation', trail)

      rappels, longest = _parseRappels(trail)
      data['Rappels'].append(rappels)
      data['Max Rappel'].append(longest)
      counter += 1

    if counter == 1:
      # No route paragraph was found: pad the route-level columns so this
      # canyon still forms a complete row.
      data['Route'].append('ERROR')
      for column in ('Distance', 'Shuttle', 'Vehicle', 'Permit', 'Elevation'):
        data[column].append('ERROR')
      data['Rappels'].append(0)
      data['Max Rappel'].append(0)
    #troubleshoot(data)


def appendSibling(cName, routeInfo, target=None):
  """Append the text that follows the '<cName>...' label to column `cName`.

  Parameters:
    cName: column name; also the prefix of the <strong> label searched for.
    routeInfo: element searched with .find('strong', string=<pattern>).
    target: dict of column lists to append to. Defaults to the module-level
      `data`, which is what calls that omit this argument write to.

  'ERROR' is appended when the label is not found or the text after it
  cannot be read; in the latter case the error, the column name and the
  element are printed to help with manual correction.
  """
  if target is None:
    target = data
  found = routeInfo.find('strong', string=re.compile(cName + "(.*)"))
  if not found:
    target[cName].append('ERROR')
    return
  try:
    value = found.next_sibling.strip().replace('\n', ' ').replace('\r', '').replace('\t', '')
  except Exception as e:
    # Deliberate best effort: one malformed entry should not stop the scrape.
    print(e)
    print(cName)
    print(routeInfo)
    value = 'ERROR'
  target[cName].append(value)

def appendRating(rName, rating, options, target=None):
  """Append the first entry of `options` that occurs in `rating` to column `rName`.

  Parameters:
    rName: column name to append to.
    rating: rating text to search.
    options: candidate substrings, tested in the given order; the first one
      contained in `rating` wins, so the order decides between overlapping
      candidates.
    target: dict of column lists to append to. Defaults to the module-level
      `data`, which is what calls that omit this argument write to.

  'ERROR' is appended when none of the options occurs in `rating`.
  """
  if target is None:
    target = data
  for option in options:
    if option in rating:
      target[rName].append(option)
      return
  target[rName].append('ERROR')

### Looping through all the child pages and exporting the data

In [54]:
# One empty list per output column; the order here is the column order of the CSV.
data = {
    column: []
    for column in (
        'State', 'Region', 'Area', 'Canyon', 'Route',
        'Difficulty', 'Wetness', 'Time', 'Risk',
        'Distance', 'Rappels', 'Max Rappel', 'Elevation',
        'Shuttle', 'Vehicle', 'Permit', 'Link',
    )
}

# Scrape every region page; each call extends the column lists in place.
for regionLink in regionLinks:
  addRegion(regionLink, data)

# Build the table once from the filled columns and write it without the index.
df = pd.DataFrame(data)
df.to_csv(r'canyon_data.csv', index=False)


Current page:  http://www.bluugnome.com/cyn_route/coconino-nat-frst/canyon-routes__coconino-nat-frst.aspx
Current page:  http://www.bluugnome.com/cyn_route/vr-gorge/canyon-routes__vr-gorge.aspx
Current page:  http://www.bluugnome.com/cyn_route/dv/canyon-routes__dv.aspx
Current page:  http://www.bluugnome.com/cyn_route/red-rock_nv/canyon-routes__red-rock_nv.aspx
Current page:  http://www.bluugnome.com/cyn_route/charleston_nv/canyon-routes__charleston_nv.aspx
Current page:  http://www.bluugnome.com/cyn_route/lake-mead/canyon-routes__lake-mead.aspx
Current page:  http://www.bluugnome.com/cyn_route/stateline-hills_nv/canyon-routes__stateline-hills_nv.aspx
Current page:  http://www.bluugnome.com/cyn_route/valley-of-fire/canyon-routes__valley-of-fire.aspx
Current page:  http://www.bluugnome.com/cyn_route/cap-reef/canyon-routes__cap-reef.aspx
Current page:  http://www.bluugnome.com/cyn_route/cedar-mesa/canyon-routes__cedar-mesa.aspx
Current page:  http://www.bluugnome.com/cyn_route/dirty-devi

In [159]:
#troubleshooting code
def troubleshoot(data):
  """Print '<column>: <number of entries>' for every column in `data`.

  All columns must have the same length before a DataFrame can be built, so
  a mismatch in this listing shows which column was under- or over-filled.
  Every key of `data` is reported (including 'Risk'), in the dict's own order.
  """
  for column, values in data.items():
    print(column + ': ' + str(len(values)))

# Experimentation block
(delete before final version)

In [133]:
# Scratch cell: tries out the title parsing on a region page heading.
# The string below is a sample of the <h1> markup being parsed; it is a bare
# expression and has no effect when the cell runs.
'''
<h1 class="h_general_main_title">
       Canyoneering Route Description List:
       <br/>
       Coconino National Forest, Arizona.
      </h1>
'''
# NOTE(review): `experimentSoup` is not assigned in any cell visible in this
# notebook (presumably it came from a since-deleted cell), so this cell will
# raise NameError on a fresh kernel.
# Take the text after the first ':' and split it on ',' into [region, state].
title = experimentSoup.find('h1', {'class' : 'h_general_main_title'}).text.split(':')[1].split(',')

# State: second part, with surrounding whitespace and the trailing '.' removed.
print(title[1].strip().replace('.', ''))
# Region: first part, with surrounding whitespace removed.
print(title[0].strip())

Arizona
Coconino National Forest


# Using Beautiful Soup

Basic info on Beautiful Soup for those wishing to build something similar. (Points by ChatGPT.)

* Use the find() and find_all() methods to locate the specific elements you want to extract. These methods accept a variety of arguments, such as the name of the tag (e.g. 'p' for paragraphs), the class name (if the element has one), and the id (if the element has one).

* Use the `.string` attribute to extract the text content of an element. It returns the text between the element's opening and closing tags, without the tags themselves. If the element contains more than one child, `.string` is `None`; use `.text` or `get_text()` in that case.

* Use the .contents and .children attributes to navigate the HTML tree and extract information from nested elements. The .contents attribute returns a list of the element's children, while the .children attribute returns an iterator over the element's children.

* Use the .parent attribute to access the parent of an element. This can be helpful when you want to extract information from the parent element as well as the element itself.