In [1]:
import json
import re
from collections import deque

import requests
from bs4 import BeautifulSoup

# scraping a route page

In [80]:
def get_route_data(route_url, timeout=30):
    """Scrape a single route page into a dictionary.

    Downloads ``route_url``, decodes the page's
    ``<script type="application/ld+json">`` block as JSON and replaces its
    ``description`` entry with the text found directly inside the
    ``<div class="fr-view">`` element. Only direct string children of that
    div are kept; nested tags are skipped.

    Args:
        route_url: URL of the route page to scrape.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        dict: The decoded JSON-LD metadata, with ``'description'`` and
        ``'route_url'`` keys added/overwritten.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no JSON-LD script, or its contents are
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    response = requests.get(route_url, timeout=timeout)
    # Fail on 4xx/5xx here instead of parsing an error page further down.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    metadata_script = soup.find("script", {"type": "application/ld+json"})
    if metadata_script is None:
        # Previously surfaced as an opaque AttributeError on None.
        raise ValueError(
            'no application/ld+json metadata found at {}'.format(route_url)
        )
    data = json.loads("".join(metadata_script.contents))

    description_div = soup.find('div', {'class': 'fr-view'})
    if description_div is not None:
        data['description'] = ''.join(
            node for node in description_div.contents if isinstance(node, str)
        )
    else:
        # No description block on the page: keep whatever the JSON-LD
        # metadata carried, or fall back to an empty string.
        data.setdefault('description', '')

    data['route_url'] = route_url
    return data

In [81]:
# Try the route scraper on one known route page.
route = "https://www.mountainproject.com/route/105933562/"
data = get_route_data(route)
# Bare last expression: display the scraped dict as the cell output.
data

{'@context': 'http://schema.org/',
 '@type': 'Place',
 'name': 'Exum Ridge',
 'description': "This is a fantastic route and is extremely popular, among guided and non-guided parties.  The route is significantly longer than the Owen-Spalding, the rock is generally excellent and the views spectacular.  This is also a good winter route as it melts off quickly.  One problem is that it is difficult to escape off this route in case of bad weather.    If you free solo this route it goes very quickly.  Not only was the first ascent by Glenn Exum done in this manner, but the second ascent was made by Paul Petzoldt on the same day, AFTER he had guided his clients to the summit via the Owen Spalding!  However most people rope up on this climb, and if you pitch out the whole thing it can go very slowly.  The climb looks long, and it is even longer than it looks.  Consider simulclimbing the easier sections.  The Grade II comes from the Ortenburger-Jackson guide, but in my opinion this must assume y

# scraping a user page

In [4]:
def get_user_history(user_url, timeout=30):
    """Collect the routes listed on a user's paginated ``/ticks`` pages.

    Fetches ``user_url + '/ticks'`` to discover how many pages exist, then
    walks ``/ticks?page=1`` .. ``/ticks?page=N`` and records every anchor
    that contains a ``<strong>`` element and has fewer than two children.

    Args:
        user_url: Base URL of the user's profile (no trailing ``/ticks``).
        timeout: Seconds passed to every ``requests.get`` call.

    Returns:
        list[dict]: One single-entry dict per matching anchor, mapping the
        first child of its ``<strong>`` element to the anchor's ``href``.

    Raises:
        requests.HTTPError: If any page answers with an error status.
    """
    def fetch_soup(url):
        # Shared download/parse step for the index page and every tick page.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    ticks_url = user_url + '/ticks'
    soup = fetch_soup(ticks_url)

    # The third ``a.no-click`` anchor is presumably the pager label
    # (something like "1 of 12") -- TODO confirm against the live markup.
    # Read the last whole number in it; taking only the last character
    # would turn 12 pages into 2.
    num_pages = 1  # assume a single page when no pager can be parsed
    pager_links = soup.find_all('a', {"class": "no-click"})
    if len(pager_links) > 2 and pager_links[2].contents:
        numbers = re.findall(r'\d+', str(pager_links[2].contents[0]))
        if numbers:
            num_pages = int(numbers[-1])

    links = []
    for page in range(1, num_pages + 1):
        # Build each page URL from the ``user_url`` argument, not from a
        # notebook-level global.
        soup = fetch_soup(ticks_url + '?page=' + str(page))
        for link in soup.find_all('a'):
            title = link.find('strong')
            # len(link) counts the anchor's direct children.
            if title is not None and len(link) < 2:
                links.append({title.contents[0]: link.get('href')})
    return links

In [5]:
# Try the tick-history scraper on one user profile.
user = 'https://www.mountainproject.com/user/10959/frances-fierst'
history = get_user_history(user)
# Bare last expression: display the list of {route name: route URL} dicts.
history

[{'Sleepy Hallow': 'https://www.mountainproject.com/route/106599007/sleepy-hallow'},
 {'Girls with Chim(n)ay': 'https://www.mountainproject.com/route/111170331/girls-with-chimnay'},
 {'Bushwhacker': 'https://www.mountainproject.com/route/106599016/bushwhacker'},
 {'Route 66': 'https://www.mountainproject.com/route/106517711/route-66'},
 {'Good Dobby': 'https://www.mountainproject.com/route/108416293/good-dobby'},
 {'You Break It You Buy It': 'https://www.mountainproject.com/route/109619465/you-break-it-you-buy-it'},
 {"Kibbles 'n Bits": 'https://www.mountainproject.com/route/118229581/kibbles-n-bits'},
 {'Wherever I May Roam': 'https://www.mountainproject.com/route/105821077/wherever-i-may-roam'},
 {'Unknown crack': 'https://www.mountainproject.com/route/119654556/unknown-crack'},
 {'Fingers to hands': 'https://www.mountainproject.com/route/114460429/fingers-to-hands'},
 {'Ringlock crack': 'https://www.mountainproject.com/route/114459436/ringlock-crack'},
 {'The Dihedral': 'https://www

# scraping data from an entire state

In [6]:
def find_all_routes_in_state(state_url, timeout=30):
    """Crawl an area page breadth-first and return every route URL below it.

    Starting at ``state_url``, reads the anchors inside the
    ``max-height max-height-md-0 max-height-xs-400`` div of each page.
    Links containing ``'/area/'`` are fetched and expanded in turn; every
    other link except the ``'#'`` placeholder is collected as a route.

    Each area URL is fetched at most once, so areas that link to each other
    (or are listed twice) cannot cause repeated downloads or an endless
    crawl. Anchors without an ``href`` are ignored.

    Args:
        state_url: URL of the top-level area page to start from.
        timeout: Seconds passed to every ``requests.get`` call.

    Returns:
        list[str]: Route links in the order they were discovered.
    """
    listing_class = 'max-height max-height-md-0 max-height-xs-400'

    def listed_links(page_url):
        # Return the hrefs found in the page's listing div (empty if absent).
        text = requests.get(page_url, timeout=timeout).text
        soup = BeautifulSoup(text, 'html.parser')
        div = soup.find('div', {'class': listing_class})
        if div is None:
            return []
        hrefs = [anchor.get('href') for anchor in div.find_all('a')]
        # Drop anchors with no href; `'/area/' in None` would raise.
        return [href for href in hrefs if href]

    visited_areas = {state_url}
    # deque gives O(1) pops from the front while keeping FIFO order.
    pending = deque(listed_links(state_url))
    route_links = []
    while pending:
        link = pending.popleft()
        if '/area/' in link:
            if link not in visited_areas:
                visited_areas.add(link)
                pending.extend(listed_links(link))
        elif link != '#':
            route_links.append(link)
    return route_links

In [7]:
# Despite the variable name, the start page used here is a single
# national-park area page; the crawler follows its sub-areas from there.
state = 'https://www.mountainproject.com/area/105833381/yosemite-national-park'
all_routes = find_all_routes_in_state(state)
# Bare last expression: display the collected route URLs.
all_routes

['https://www.mountainproject.com/route/111004304/the-jet',
 'https://www.mountainproject.com/route/113689801/cupcake',
 'https://www.mountainproject.com/route/107753089/face-ramp-crack',
 'https://www.mountainproject.com/route/110648065/finger-crack',
 'https://www.mountainproject.com/route/107753112/wandering-albatross',
 'https://www.mountainproject.com/route/119513779/resurrection',
 'https://www.mountainproject.com/route/107751556/camp-mather',
 'https://www.mountainproject.com/route/107751576/variation',
 'https://www.mountainproject.com/route/118812991/spliter-handcrack',
 'https://www.mountainproject.com/route/108167876/amys-53',
 'https://www.mountainproject.com/route/108165000/southeast-face',
 'https://www.mountainproject.com/route/108178106/west-face-of-mount-starr-king',
 'https://www.mountainproject.com/route/111866548/northwest-arete',
 'https://www.mountainproject.com/route/109191270/fuel-rod',
 'https://www.mountainproject.com/route/106482674/high-heels',
 'https://www

In [8]:
# How many route URLs the crawl collected.
len(all_routes)

2233

In [82]:
# Scrape every collected route page, one HTTP request per route, in order.
# Any exception raised by get_route_data aborts the loop; all_data then
# holds only the routes scraped so far.
all_data = []
for route in all_routes:
    all_data.append(get_route_data(route))

In [83]:
# Display the scraped route records (one dict per route).
all_data

[{'@context': 'http://schema.org/',
  '@type': 'Place',
  'name': 'The Jet',
  'description': 'A cool small route with some big, dynamic moves.',
  'image': 'https://cdn2.apstatic.com/photos/climb/111004329_medium_1494337531.jpg',
  'geo': {'@type': 'GeoCoordinates',
   'latitude': '37.98546707',
   'longitude': '-119.91680796'},
  'aggregateRating': {'@type': 'AggregateRating',
   'ratingValue': '3.0',
   'reviewCount': '3'},
  'route_url': 'https://www.mountainproject.com/route/111004304/the-jet'},
 {'@context': 'http://schema.org/',
  '@type': 'Place',
  'name': 'Cupcake',
  'description': "The most obvious route up the small crag at Glacier Point.    Lowest third involves a bit of good footwork, then an easy scramble for the middle third, then the top bit is probably the crux and trickier than it looks.    The base is on the disabled trail to Glacier Point so be very careful not to obstruct it - however the trail is very wide at that point - easily wide enough for buggies with room

In [84]:
# Persist the scraped records as a JSON array. The relative path resolves
# against the kernel's working directory, and 'w' overwrites an existing file.
with open('yosemite_national_park_all_data.json', 'w') as f:
    json.dump(all_data, f)