<a href="https://colab.research.google.com/github/ColeDCrawford/mp-scrape/blob/master/Mountain_Project_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# HTML parsing (BeautifulSoup) and HTTP access (requests) used by the scraper.
from bs4 import BeautifulSoup as bs
import requests
import re

import pprint
pp = pprint.PrettyPrinter(indent=2)

# getId only accepts a URL segment made of 1 to 10 decimal digits.
idPattern = re.compile("^[0-9]{1,10}$")
# Shared accumulators that getSubAreas fills while it crawls:
uniqueRouteIds = set()  # route ids collected from route-listing pages
route_location_ids = []  # (routeId, id of the area page listing it) tuples

import pandas as pd
from google.colab import files
import time

# Mount Google Drive at /content/gdrive; the CSV files read and written by
# this notebook live under that path.
from google.colab import drive
drive.mount('/content/gdrive')

import csv

In [None]:
def getId(url):
  """Extract the numeric id from a Mountain Project URL.

  The id is the fifth "/"-separated segment, e.g. 106092653 in
  "https://www.mountainproject.com/area/106092653/iowa".

  Args:
    url: absolute URL string. Anything without a fifth segment (a relative
      href, "#", an empty string) or a missing href (None) is treated as
      having no id instead of raising.

  Returns:
    The id as an int, or False when no valid id could be extracted. An
    error line is printed in the False case.
  """
  try:
    candidate = url.split("/")[4]
  except (AttributeError, IndexError):
    # AttributeError: url is None (anchor without href).
    # IndexError: fewer than five segments in the URL.
    candidate = None

  if candidate is not None and idPattern.match(candidate) is not None:
    return int(candidate)

  print(f"Error: id {candidate} url {url}")
  return False

def getRoutes(soup):
  """Collect the route links listed in the page's left-nav route table.

  Args:
    soup: parsed page exposing ``select`` (a BeautifulSoup document).

  Returns:
    A ``(hrefs, ids)`` tuple of parallel lists: each anchor's ``href`` and
    the result of ``getId`` for that href.
  """
  anchors = soup.select("#left-nav-route-table a")
  hrefs = [anchor.get("href") for anchor in anchors]
  ids = [getId(href) for href in hrefs]
  return hrefs, ids

def getStates(soup):
    """Build one record per top-level area linked from the parsed page.

    Every ``div`` with class ``mb-half`` whose ``<strong>`` element wraps an
    anchor with an ``href`` yields a record. Divs without that structure are
    skipped instead of raising AttributeError.

    Args:
        soup: parsed page exposing ``find_all`` (a BeautifulSoup document).

    Returns:
        Dict keyed by the area URL. Each value holds ``url``, ``title``,
        ``id`` (from ``getId``) and empty/default ``subareas``, ``parent``,
        ``routes``, ``subAreaUrls`` and ``subAreaIds`` fields for later
        stages to fill in.
    """
    states_dict = {}
    for group in soup.find_all("div", class_="mb-half"):
        strong = group.find("strong")
        link = strong.find("a") if strong is not None else None
        url = link.get("href") if link is not None else None
        if url is None:
            # Not a state entry (no <strong><a href=...>): nothing to record.
            continue
        states_dict[url] = {
            "url": url,
            "title": link.get_text(),
            "id": getId(url),
            "subareas": [],
            "parent": None,
            "routes": False,
            "subAreaUrls": [],
            "subAreaIds": [],
        }
    return states_dict

def getAreas(url):
    """Download an area page and list the links in its left-nav rows.

    Args:
        url: address of the area page to fetch.

    Returns:
        A ``(areas, ids)`` tuple of parallel lists: the ``href`` of the first
        anchor in every ``div.lef-nav-row`` and the ``getId`` result for it.
    """
    response = requests.get(url)
    document = bs(response.content)

    areas, ids = [], []
    for row in document.find_all("div", class_="lef-nav-row"):
        href = row.find("a").get("href")
        areas.append(href)
        ids.append(getId(href))
    return areas, ids

def getSubAreas(url, level=0, parent=None, shouldGetRoutes=False):
  """Scrape one area page, record it, and recurse into its sub-areas.

  One dict per visited page is appended to the module-level ``subareas``
  list, which the caller must bind before the first call. The first
  ``.mp-sidebar h3`` heading decides how the page is treated:

  * "Routes in ..." - a leaf area; recorded with ``routes`` set to True.
  * "Areas in ..."  - recorded with its sub-area links, then every link is
    visited recursively one level deeper.
  * anything else, or no heading at all - recorded as a plain area.

  Only the missing-heading case is handled as "IndexError". Errors raised
  by nested calls propagate to the caller instead of being reported as a
  missing heading on the parent page.

  Args:
    url: absolute URL of the area page to fetch.
    level: depth of this page relative to the first call.
    parent: URL of the page that linked here, or None.
    shouldGetRoutes: when True, leaf areas also store ``routeIds`` and
      ``routeLinks``, and the module-level ``uniqueRouteIds`` and
      ``route_location_ids`` collections are extended.

  Returns:
    None. Results are delivered through the module-level collections.
  """
  page = requests.get(url)
  soup = bs(page.content)
  # Only the h1's own text node is wanted, not text of nested elements.
  title = soup.find("h1").find(text=True, recursive=False).strip()
  area_id = getId(url)
  print(title)

  # Fields shared by every kind of record; branches below add to it.
  record = {
      "url": url,
      "title": title,
      "level": level,
      "parent": parent,
      "id": area_id,
      "routes": False,
  }

  header_list = soup.select(".mp-sidebar h3")
  if not header_list:
    print(f"IndexError - {title}")
    subareas.append(record)
    return

  header_text = header_list[0].get_text()
  if "Routes in " in header_text:
    record["routes"] = True
    if shouldGetRoutes:
      hrefs, ids = getRoutes(soup)
      record["routeIds"] = ids
      record["routeLinks"] = hrefs
      uniqueRouteIds.update(ids)
      # Remember which area page each route was listed on.
      for routeId in ids:
        route_location_ids.append((routeId, area_id))
    subareas.append(record)
  elif "Areas in " in header_text:
    divs = soup.find_all("div", class_="lef-nav-row")
    print(f"Subareas: {len(divs)}")

    links = [div.find("a").get("href") for div in divs]
    record["subAreaUrls"] = links
    record["subAreaIds"] = [getId(link) for link in links]
    # Record the parent before descending so it precedes its children.
    subareas.append(record)
    for link in links:
      getSubAreas(link, level=level + 1, parent=url, shouldGetRoutes=shouldGetRoutes)
  else:
    # area without subarea or routes
    print(f"area without routes or subareas - {header_text}")
    subareas.append(record)

In [None]:
# Fetch the Mountain Project homepage and build the per-state records from it.
homepage = requests.get("https://www.mountainproject.com/")
soup = bs(homepage.content)

states = getStates(soup)
# Area URLs from Iowa through "in-progress". NOTE(review): this list is not
# referenced by the code visible in this notebook - confirm whether it is
# still needed.
states_keys = ['https://www.mountainproject.com/area/106092653/iowa', 'https://www.mountainproject.com/area/107235316/kansas', 'https://www.mountainproject.com/area/105868674/kentucky', 'https://www.mountainproject.com/area/116720343/louisiana', 'https://www.mountainproject.com/area/105948977/maine', 'https://www.mountainproject.com/area/106029417/maryland', 'https://www.mountainproject.com/area/105908062/massachusetts', 'https://www.mountainproject.com/area/106113246/michigan', 'https://www.mountainproject.com/area/105812481/minnesota', 'https://www.mountainproject.com/area/108307056/mississippi', 'https://www.mountainproject.com/area/105899020/missouri', 'https://www.mountainproject.com/area/105907492/montana', 'https://www.mountainproject.com/area/116096758/nebraska', 'https://www.mountainproject.com/area/105708961/nevada', 'https://www.mountainproject.com/area/105872225/new-hampshire', 'https://www.mountainproject.com/area/106374428/new-jersey', 'https://www.mountainproject.com/area/105708964/new-mexico', 'https://www.mountainproject.com/area/105800424/new-york', 'https://www.mountainproject.com/area/105873282/north-carolina', 'https://www.mountainproject.com/area/106598130/north-dakota', 'https://www.mountainproject.com/area/105994953/ohio', 'https://www.mountainproject.com/area/105854466/oklahoma', 'https://www.mountainproject.com/area/105708965/oregon', 'https://www.mountainproject.com/area/105913279/pennsylvania', 'https://www.mountainproject.com/area/106842810/rhode-island', 'https://www.mountainproject.com/area/107638915/south-carolina', 'https://www.mountainproject.com/area/105708963/south-dakota', 'https://www.mountainproject.com/area/105887760/tennessee', 'https://www.mountainproject.com/area/105835804/texas', 'https://www.mountainproject.com/area/105708957/utah', 'https://www.mountainproject.com/area/105891603/vermont', 'https://www.mountainproject.com/area/105852400/virginia', 'https://www.mountainproject.com/area/105708966/washington', 
'https://www.mountainproject.com/area/105855459/west-virginia', 'https://www.mountainproject.com/area/105708968/wisconsin', 'https://www.mountainproject.com/area/105708960/wyoming', 'https://www.mountainproject.com/area/105907743/international', 'https://www.mountainproject.com/area/105798164/in-progress']
# Area URLs (Alabama through Indiana) to leave out of this run; they are
# popped from `states` before the crawl, and the last entry names the
# output files.
skip = ['https://www.mountainproject.com/area/105905173/alabama', 'https://www.mountainproject.com/area/105909311/alaska','https://www.mountainproject.com/area/105708962/arizona', 'https://www.mountainproject.com/area/105901027/arkansas', 'https://www.mountainproject.com/area/105708959/california', 'https://www.mountainproject.com/area/105708956/colorado', 'https://www.mountainproject.com/area/105806977/connecticut', 'https://www.mountainproject.com/area/106861605/delaware', 'https://www.mountainproject.com/area/111721391/florida', 'https://www.mountainproject.com/area/105897947/georgia', 'https://www.mountainproject.com/area/106316122/hawaii', 'https://www.mountainproject.com/area/105708958/idaho', 'https://www.mountainproject.com/area/105911816/illinois', 'https://www.mountainproject.com/area/112389571/indiana']
# Drop the skipped states from this run. A URL that is not present in
# `states` raises KeyError here.
for key in skip:
  states.pop(key)
print(states)
# Slug of the last skipped state (e.g. "indiana"); used in output file names.
last = skip[-1].rsplit("/", 1)[-1]
print(last)

# NOTE: cleanDataForExport is defined in a later cell of this notebook and
# must already have been executed when this loop runs.
for key in states:
  areas, ids = getAreas(key)
  states[key].update(subAreaIds=ids, subAreaUrls=areas)
  # getSubAreas appends to this module-level list, so start fresh per state.
  subareas = []
  print(f"\n\n ------{key} ------")
  for area in areas:
    getSubAreas(area, parent=key, shouldGetRoutes=True)
  states[key]['subareas'] = subareas
  print(f"Subareas scraped in this state: {len(subareas)}")
  print(f"Total route IDs scraped: {len(uniqueRouteIds)}")

  # Rewrite all three CSV files after every state.
  data = cleanDataForExport(states)
  df_areas = pd.DataFrame(data)
  df_areas.to_csv(f"/content/gdrive/My Drive/Mountain Project Scrape/areas-after-{last}.csv", index=False)

  df_routes = pd.DataFrame(uniqueRouteIds)
  df_routes.to_csv(f"/content/gdrive/My Drive/Mountain Project Scrape/route_ids-after-{last}.csv", index=False)

  with open(f"/content/gdrive/My Drive/Mountain Project Scrape/route_loc_ids-after-{last}.csv", "w", newline='') as out:
    csv_out = csv.writer(out)
    csv_out.writerow(['routeId', 'locationId'])
    csv_out.writerows(route_location_ids)

In [None]:
def cleanDataForExport(dirty):
  """Flatten the per-state mapping into one list of area records.

  Args:
    dirty: dict whose values are state records containing at least
      'parent', 'title', 'url', 'subAreaIds', 'subAreaUrls', 'routes',
      'id' and 'subareas'.

  Returns:
    A list with, for each state in iteration order, a new dict holding the
    seven summary fields (everything but 'subareas') followed by that
    state's 'subareas' entries, which are not copied.
  """
  summary_fields = (
      'parent', 'title', 'url', 'subAreaIds', 'subAreaUrls', 'routes', 'id',
  )
  flattened = []
  for state in dirty.values():
    flattened.append({field: state[field] for field in summary_fields})
    flattened.extend(state['subareas'])
  return flattened

In [None]:
# # Export Routes
# df_routes = pd.DataFrame(uniqueRouteIds)
# df_routes.to_csv('route_ids.csv', index=False)

# # Export Areas
# data = cleanDataForExport(states)
# df_areas = pd.DataFrame(data)
# df_areas.to_csv('areas.csv', index=False)

In [None]:
# Accumulates the route records fetched from the API.
all_routes = []

import csv
# From here on route_location_ids is a dict mapping routeId -> locationId
# (both ints), loaded from the previously exported CSV.
route_location_ids = {}
with open("/content/gdrive/My Drive/Mountain Project Scrape/route_loc_ids.csv") as read_obj:
  route_location_ids.update(
      (int(record['routeId']), int(record['locationId']))
      for record in csv.DictReader(read_obj)
  )

def get_routes(ids, pause=5):
  """Fetch route records for ``ids`` from the Mountain Project data API.

  Sleeps for ``pause`` seconds first, then requests all ids in one call
  using the module-level ``apikey``.

  Args:
    ids: iterable of route ids; sent as one comma-separated parameter.
    pause: seconds to wait before issuing the request.

  Returns:
    The list of route dicts from the response, each extended with a
    'locationId' looked up in ``route_location_ids``; or False (after
    printing the response) when the API does not report success.
  """
  time.sleep(pause)
  joined_ids = ','.join(map(str, ids))
  query = {
      'key': apikey,
      'routeIds': joined_ids,
  }
  r = requests.get('https://www.mountainproject.com/data/get-routes', params=query)
  payload = r.json()

  if payload['success'] != 1:
    print("Error")
    print(payload)
    return False

  routes = payload['routes']
  # enhance with location ID
  for route in routes:
    route['locationId'] = route_location_ids[route['id']]
  return routes

In [None]:
# Get all routes
# The ids to fetch are the keys of route_location_ids, which at this point is
# expected to be the routeId -> locationId dict loaded from the CSV above.
routeIds = list(route_location_ids.keys())

def divide_chunks(l, n): 
  """Lazily yield consecutive slices of ``l``, each at most ``n`` items long.

  The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
  """
  offsets = range(0, len(l), n)
  yield from (l[start:start + n] for start in offsets)

# Request the routes in batches of 200 ids per API call.
chunked = list(divide_chunks(routeIds, 200))
i = 0  # not used by this loop; kept because it is a module-level name
for chunk in chunked:
  routes = get_routes(chunk)
  if routes is False:
    # get_routes signals an API error with False (it has already printed the
    # response); extending a list with False would raise TypeError.
    print(f"Skipping chunk of {len(chunk)} route ids after failed request")
    continue
  all_routes.extend(routes)