In [1]:
import json
from collections import deque

import pandas as pd
import requests
from bs4 import BeautifulSoup, Tag

# scraping a route page

In [74]:
def _direct_text(section):
    """Join the string nodes that are direct children of ``section``.

    Text nested inside child tags is not included.
    """
    return ''.join(child for child in section.contents if isinstance(child, str))


def get_route_data(route_url, timeout=30):
    """Scrape one route page into a flat dict, or return None if it is skipped.

    Parameters
    ----------
    route_url : str
        URL of the route page to fetch.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    dict or None
        The page's JSON-LD payload (an empty dict if the page has none)
        extended with the keys ``climb_type``, ``description``,
        ``protection``, ``difficulty_rating``, ``difficulty_rating_system``
        and ``route_url``. The scraped ``description`` replaces any
        ``description`` key coming from the JSON-LD.
        None is returned when the climb type mentions Aid, Ice or Mixed,
        when the rating system is anything other than ``'YDS'``, or when the
        page lacks the type / difficulty markup needed to decide.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    json.JSONDecodeError
        If the JSON-LD script is present but malformed.
    """
    response = requests.get(route_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The value cell is the <td> following the "Type:" label cell.
    type_label = soup.find('td', string='Type:')
    type_cell = type_label.find_next_sibling('td') if type_label is not None else None
    if type_cell is None or not type_cell.contents:
        return None
    climb_type = str(type_cell.contents[0]).strip()
    if any(kind in climb_type for kind in ('Aid', 'Ice', 'Mixed')):
        return None

    difficulty_section = soup.find('h2', {'class': 'inline-block mr-2'})
    if difficulty_section is None or not difficulty_section.contents:
        # No rating markup means no known rating system, hence not YDS.
        return None
    rating_node = difficulty_section.contents[0]
    if isinstance(rating_node, str):
        # A bare string carries no rating-system label, hence not YDS.
        return None
    try:
        # str() detaches the values from the parse tree; the rating is also
        # stripped because the markup leaves trailing whitespace on it.
        difficulty_rating = str(rating_node.contents[0]).strip()
        difficulty_rating_system = str(rating_node.contents[1].contents[0].contents[0])
    except (AttributeError, IndexError):
        # Unexpected nesting: the rating system cannot be determined.
        return None
    if difficulty_rating_system != 'YDS':
        return None

    ld_json = soup.find("script", {"type": "application/ld+json"})
    data = json.loads("".join(ld_json.contents)) if ld_json is not None else {}

    # The first 'fr-view' block is used as the description; the last one is
    # used as protection only when the page has exactly two or three blocks.
    sections = soup.find_all('div', {'class': 'fr-view'})
    description = _direct_text(sections[0]) if sections else ''
    if len(sections) in (2, 3):
        protection = _direct_text(sections[-1])
    else:
        protection = 'No protection data'

    data['climb_type'] = climb_type
    data['description'] = description
    data['protection'] = protection
    data['difficulty_rating'] = difficulty_rating
    data['difficulty_rating_system'] = difficulty_rating_system
    data['route_url'] = route_url
    return data

In [75]:
# Try get_route_data on a single route page and display the resulting dict.
route = "https://www.mountainproject.com/route/109933221"
data = get_route_data(route)
data

{'@context': 'http://schema.org/',
 '@type': 'LocalBusiness',
 'name': 'Yosemite Slab',
 'description': 'Huge, grand, stunning. Walk your way up the perfect slab.',
 'image': 'https://cdn2.apstatic.com/photos/climb/113579015_medium_1505241287.jpg',
 'geo': {'@type': 'GeoCoordinates',
  'latitude': '37.50631842',
  'longitude': '-88.68419115'},
 'aggregateRating': {'@type': 'AggregateRating',
  'ratingValue': '3.7',
  'reviewCount': '95'},
 'climb_type': 'Boulder, 40 ft (12 m)',
 'protection': 'No. Kind of a do not fall situation.',
 'difficulty_rating': 'V0 ',
 'difficulty_rating_system': 'YDS',
 'route_url': 'https://www.mountainproject.com/route/109933221'}

In [35]:
# Turn a route URL into its stats-page URL by inserting a "stats" path
# segment right before the route id, printing every intermediate step.
# The split pieces get their own name so `route` is always a string.
route = "https://www.mountainproject.com/route/105933562/"
print(route)
route_parts = route.split("/")
print(route_parts)
# Index 4 is the route id once the URL is split on "/".
route_parts.insert(4, "stats")
print(route_parts)
route = "/".join(route_parts)
print(route)

https://www.mountainproject.com/route/105933562/
['https:', '', 'www.mountainproject.com', 'route', '105933562', '']
['https:', '', 'www.mountainproject.com', 'route', 'stats', '105933562', '']
https://www.mountainproject.com/route/stats/105933562/


In [3]:
# Stats page used to try out the ratings scraper; this is the same URL the
# cell above assembles step by step.
route_stats_url = "https://www.mountainproject.com/route/stats/105933562/"

In [40]:
def get_route_ratings_data(route_url, timeout=30):
    """Collect per-user star counts from the first ``table``-classed table on a page.

    Parameters
    ----------
    route_url : str
        URL to fetch; the notebook passes a route "stats" page URL.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    dict
        Maps user id (the fifth ``/``-separated piece of the user link, as a
        string) to the number of ``<img>`` elements found in the row's second
        cell. Empty when the page has no matching table.

    Notes
    -----
    When a user id shows up more than once, the first value is kept and a
    message is printed. Rows that lack two ``<td>`` cells or a user link
    (for example header rows) are skipped.
    """
    text = requests.get(route_url, timeout=timeout).text
    soup = BeautifulSoup(text, 'html.parser')
    ratings_table = soup.find("table", attrs={"class": "table"})
    if ratings_table is None:
        return {}

    user_ratings = {}
    for row in ratings_table.find_all("tr"):
        cols = row.find_all("td")
        if len(cols) < 2:
            continue
        user_link = cols[0].find("a")
        user_url = user_link.get("href") if user_link is not None else None
        if not user_url:
            continue
        # Expects an absolute URL (scheme://host/user/<id>/...), which puts
        # the id at index 4; an unexpected shape raises IndexError rather
        # than silently dropping data.
        user_id = user_url.split("/")[4]
        stars = len(cols[1].find_all("img"))

        if user_id in user_ratings:
            print(user_id, "appears multiple times")
        else:
            user_ratings[user_id] = stars

    return user_ratings

In [41]:
# Run the ratings scraper on the stats page and display the {user_id: stars} dict.
get_route_ratings_data(route_stats_url)

{'107518795': 4,
 '107535054': 4,
 '107537378': 4,
 '106622079': 4,
 '106329936': 4,
 '106726952': 4,
 '105865750': 4,
 '107405626': 4,
 '106381795': 4,
 '107568281': 4,
 '105789553': 4,
 '10146': 4,
 '10141': 4,
 '14213': 4,
 '107598776': 4,
 '12988': 4,
 '107468926': 4,
 '105812440': 4,
 '105857129': 4,
 '11048': 4,
 '106588555': 4,
 '107659993': 4,
 '107383190': 4,
 '105818656': 4,
 '107583733': 4,
 '106958810': 4,
 '106945872': 4,
 '106209198': 4,
 '106907209': 4,
 '105805486': 4,
 '105083583': 4,
 '105802830': 4,
 '11877': 4,
 '105940930': 4,
 '106262777': 4,
 '105930639': 4,
 '105904878': 4,
 '106005105': 4,
 '7002067': 4,
 '105999112': 4,
 '106020445': 4,
 '107025542': 4,
 '107075506': 4,
 '106292269': 4,
 '107738284': 4,
 '12985': 4,
 '10351': 4,
 '106983776': 4,
 '107588101': 4,
 '106049619': 4,
 '11181': 4,
 '107352048': 4,
 '106039240': 4,
 '106055976': 4,
 '105870586': 4,
 '106088551': 4,
 '105832216': 4,
 '105846944': 4,
 '106006184': 4,
 '106365141': 4,
 '10996': 4,
 '106

# scraping a user page

In [5]:
def get_user_history(user_url, timeout=30):
    """Scrape every page of a user's ticks into a column-oriented dict.

    Parameters
    ----------
    user_url : str
        Base URL of the user's profile; ``/ticks`` is appended to it.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    dict
        ``DataFrame.to_dict()`` of a frame with the columns ``name``,
        ``url`` and ``user_rating``, i.e. ``{column: {row_index: value}}``.

    Raises
    ------
    IndexError
        If the pager element is missing, or a route link is found while no
        star rating is waiting to be paired with it.
    ValueError
        If the pager label does not end in digits.
    """
    star_values = {
        '/img/stars/starBlue.svg': 1,
        '/img/stars/starBlueHalf.svg': 0.5,
    }
    output = pd.DataFrame(columns=['name', 'url', 'user_rating'])
    text = requests.get(user_url + '/ticks', timeout=timeout).text
    soup = BeautifulSoup(text, 'html.parser')

    # The page count is the run of digits that ends the pager label
    # (presumably something like "1 of N" -- TODO confirm). The whole run is
    # taken, not just the last character, so counts of 10+ are not truncated.
    pager_label = soup.find_all('a', {"class": "no-click"})[2].contents[0].strip()
    trailing_digits = pager_label[len(pager_label.rstrip('0123456789')):]
    num_pages = int(trailing_digits)

    for page_number in range(1, num_pages + 1):
        # Use the function argument here, not a notebook-level global.
        page_url = user_url + '/ticks?page=' + str(page_number)
        text = requests.get(page_url, timeout=timeout).text
        soup = BeautifulSoup(text, 'html.parser')
        all_ratings = []  # star ratings seen on this page, not yet paired with a route
        for link in soup.find_all('a'):
            # Star ratings: sum the values of the star images in the first
            # 'scoreStars' span of the link; other images count as zero.
            ratings_list = link.find_all('span', {"class": "scoreStars"})
            if len(ratings_list) > 0:
                rating = sum(
                    star_values.get(element.get('src'), 0)
                    for element in ratings_list[0].contents
                    if isinstance(element, Tag)
                )
                all_ratings.append(rating)
            # Route rows: a link holding a <strong> and fewer than two child
            # nodes is taken as a route and paired with the oldest waiting rating.
            if len(link.find_all('strong')) > 0 and len(link) < 2:
                output.loc[len(output.index)] = [
                    link.find('strong').contents[0],
                    link.get('href'),
                    all_ratings.pop(0),
                ]
    return output.to_dict()

In [6]:
# Try get_user_history on one user profile and display the resulting dict.
user = 'https://www.mountainproject.com/user/10959/frances-fierst'
history = get_user_history(user)
history

{'name': {0: 'Sleepy Hallow',
  1: 'Girls with Chim(n)ay',
  2: 'Bushwhacker',
  3: 'Route 66',
  4: 'Good Dobby',
  5: 'You Break It You Buy It',
  6: "Kibbles 'n Bits",
  7: 'Wherever I May Roam',
  8: 'Unknown crack',
  9: 'Fingers to hands',
  10: 'Ringlock crack',
  11: 'The Dihedral',
  12: 'No name',
  13: 'Round There',
  14: 'Buffalo Power',
  15: 'Thin Air',
  16: 'Mossy Tits',
  17: "Yoder's Bounce test",
  18: 'The Safe Zone',
  19: 'Purple Moon',
  20: 'Phone Call From Satan',
  21: 'Phone Call From Satan',
  22: 'Moons of Pluto',
  23: 'Screaming Yellow Zonkers',
  24: 'Cosmos',
  25: 'Orgasmophoria',
  26: 'Phantasmagoria',
  27: 'Dances with Clams',
  28: 'Pop Art',
  29: 'Dirty Pinkos',
  30: 'Lets Face It',
  31: 'Honey Pot',
  32: "Chouinard's Crack (1st Half…",
  33: 'Tuff It Out',
  34: 'No Golf Shoes',
  35: 'Snuffy Smith',
  36: 'Leper Messiah',
  37: 'Orion',
  38: 'Bookworm',
  39: 'Ancylostoma',
  40: "Nightingale's on Vacation",
  41: 'Irreverence',
  42: 'Re

# scraping data from an entire state

In [76]:
def find_all_routes_in_state(state_url, timeout=30):
    """Walk an area page and its sub-areas breadth-first, collecting route links.

    Parameters
    ----------
    state_url : str
        URL of the top-level area page to start from.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    list of str
        Every non-area link found in the listing block of the visited pages,
        in discovery order. Links equal to ``'#'`` and anchors without an
        ``href`` are dropped.

    Notes
    -----
    Each area URL is fetched at most once, so pages that link back to an
    already-visited area cannot make the crawl loop forever.
    """
    def listing_links(page_url):
        """Return the hrefs inside the page's listing block ([] if absent)."""
        text = requests.get(page_url, timeout=timeout).text
        soup = BeautifulSoup(text, 'html.parser')
        div = soup.find('div', {'class': 'max-height max-height-md-0 max-height-xs-400'})
        if div is None:
            return []
        hrefs = (anchor.get('href') for anchor in div.find_all('a'))
        return [href for href in hrefs if href]

    seen_areas = {state_url}
    pending = deque(listing_links(state_url))
    route_links = []
    while pending:
        link = pending.popleft()
        if '/area/' in link:
            if link in seen_areas:
                continue
            seen_areas.add(link)
            # Sub-area: queue its listing behind what is already pending.
            pending.extend(listing_links(link))
        elif link != '#':
            route_links.append(link)
    return route_links

In [77]:
# Crawl one area (the Mount Rainier area page) and display the route URLs found.
state = 'https://www.mountainproject.com/area/105877031/mount-rainier'
all_routes = find_all_routes_in_state(state)
all_routes

['https://www.mountainproject.com/route/119283293/bread-loaf',
 'https://www.mountainproject.com/route/106703081/central-mowich-face',
 'https://www.mountainproject.com/route/110037673/curtis-ridge',
 'https://www.mountainproject.com/route/105982534/emmons-glacier',
 'https://www.mountainproject.com/route/106636996/fuhrer-finger',
 'https://www.mountainproject.com/route/107671307/fuhrer-thumb',
 'https://www.mountainproject.com/route/107467946/gibraltar-ledges',
 'https://www.mountainproject.com/route/106812783/ingraham-glacier-direct',
 'https://www.mountainproject.com/route/105877100/ingraham-glacier-disappointment-cleaver-route',
 'https://www.mountainproject.com/route/105906991/kautz-glacier',
 'https://www.mountainproject.com/route/106459197/liberty-ridge',
 'https://www.mountainproject.com/route/111999837/north-mowich-headwall',
 'https://www.mountainproject.com/route/106846325/ptarmigan-ridge',
 'https://www.mountainproject.com/route/112479245/sunset-ridge',
 'https://www.mounta

In [78]:
# Number of route URLs returned by the crawl.
len(all_routes)

17

In [79]:
# Scrape every collected route, keeping only the ones get_route_data returns
# data for (it returns None for routes it filters out). Each page is fetched
# once and the checked result itself is stored.
all_data = []
for route in all_routes:
    route_data = get_route_data(route)
    if route_data:
        all_data.append(route_data)

In [80]:
# Number of routes for which get_route_data returned data.
len(all_data)

2

In [81]:
# Display the scraped route records.
all_data

[{'@context': 'http://schema.org/',
  '@type': 'LocalBusiness',
  'name': 'Bread Loaf',
  'description': '',
  'image': 'https://cdn2.apstatic.com/photos/climb/119283325_medium_1595814614.jpg',
  'geo': {'@type': 'GeoCoordinates',
   'latitude': '46.85290262',
   'longitude': '-121.76047432'},
  'aggregateRating': {'@type': 'AggregateRating',
   'ratingValue': '2.0',
   'reviewCount': '1'},
  'climb_type': 'Boulder, Alpine, 8 ft (2 m)',
  'protection': 'Crash pads, Spotters',
  'difficulty_rating': 'V0 ',
  'difficulty_rating_system': 'YDS',
  'route_url': 'https://www.mountainproject.com/route/119283293/bread-loaf'},
 {'@context': 'http://schema.org/',
  '@type': 'LocalBusiness',
  'name': 'Upper Castle Toprope Wall',
  'description': "There is a 40ft crag about 1-200' elevation above the Castle, just below the Turtle Snowfield on the approach to the Kautz Ice Chute. The rock quality is actually decent. This offers a great pastime at high camp. The Castle camp is at 9,200' and the Upp

In [None]:
# Persist the scraped route records as JSON.
# NOTE(review): the filename says Yosemite, but the crawl cell earlier in this
# notebook targets the mount-rainier area URL -- confirm the intended name.
with open('yosemite_national_park_all_data.json', 'w') as f:
    json.dump(all_data, f)