# Trail Recommendations

List everything that we need to do.

In [1]:
import Trailforks as tf
import TrailforksScraper as tfs
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

## Import all the files

In [2]:
# Load the three saved CSV exports in one go (paths are relative to the
# notebook's working directory): scraped check-ins, the Trailforks trail
# listing, and the WTA listing.
trails_checkins, trailforks_trails, wta_trails = (
    pd.read_csv(csv_name)
    for csv_name in ('trail-checkins.csv', 'trailforks-trails.csv', 'ResultsZip.csv')
)

## Getting data from Trailforks

- 'activitytype=6' for hiking
- Trailstyle = Popularity

Trailforks api documentation: https://www.trailforks.com/about/api/#!/trail

Trailforks popularity heatmap: https://www.trailforks.com/region/united-states/?activitytype=6&z=10.4&lat=48.30699&lon=-120.42421&trailstyle=popularity

Trailforks popularity scores (sorted in descending order): https://www.trailforks.com/region/united-states/trails/?sort=t.popularity_score&order=desc&difficulty=2,3,4,11,9,5,6,8&activitytype=6

In [3]:
# Scraper instance from the TrailforksScraper module (imported as `tfs`);
# the check-in scraping cell further down calls `fetchTrailStats` on it.
trailForksScrapper = tfs.trailforksScrapper()

In [38]:
# This code was used to scrape all the trails in north-carolina and washington
# from trailforks along with their popularity.

# north_carolina = trailForksScrapper.fetchTrailsByRegionAndPages('north-carolina',23)
# washington = trailForksScrapper.fetchTrailsByRegionAndPages('washington',72)

Using the saved files, select the Trailforks trails whose check-ins have not been scraped yet.

In [37]:
# Trails that already have rows in the saved check-ins file are skipped.
trails_scraped = trails_checkins['trail'].unique()
not_yet_scraped = ~trailforks_trails['title'].isin(trails_scraped)
trails = trailforks_trails.loc[not_yet_scraped, 'title']

# Of those, keep only titles containing '--' and rewrite each pair as a single '-'.
# - na=False: a missing title yields NaN from str.contains, and a mask holding
#   NaN cannot be used for boolean indexing; treat missing titles as "no match".
# - regex=False: '--' is a literal pattern. One pass replaces non-overlapping
#   pairs only, so a run of three dashes still leaves '--' behind.
trails = trails[trails.str.contains('--', regex=False, na=False)]
trails = trails.str.replace('--', '-', regex=False)
trails

22                               lounge-lower
122                              lounge-lower
220            hozomeen-trail-east-bank-trail
221        willow-creek-trail-east-lake-trail
223     lightning-creek-trail-east-lake-trail
                        ...                  
9455                canopy-trail--enduro-line
9476              black-mountain-trail-middle
9482                     spencer-branch-upper
9489                        spencer-gap-upper
9495               black-mountain-trail-lower
Name: title, Length: 890, dtype: object

Convert trail titles to lowercase words separated by `-`.
The resulting slug is used in the URL when scraping check-ins.

In [26]:
# north_carolina['title'] = north_carolina['title'].str.replace('\W', ' ').str.lower()
# north_carolina['title'] = north_carolina['title'].str.replace('[^a-z A-Z]', '').str.strip().str.replace(' ','-')
# washington['title'] = washington['title'].str.replace('\W', ' ').str.lower()
# washington['title'] = washington['title'].str.replace('[^a-z A-Z]', '').str.strip().str.replace(' ','-')

  north_carolina['title'] = north_carolina['title'].str.replace('\W', ' ').str.lower()
  north_carolina['title'] = north_carolina['title'].str.replace('[^a-z A-Z]', '').str.strip().str.replace(' ','-')
  washington['title'] = washington['title'].str.replace('\W', ' ').str.lower()
  washington['title'] = washington['title'].str.replace('[^a-z A-Z]', '').str.strip().str.replace(' ','-')


Getting trail stats for all the trails collected previously.

In [40]:
# Scrape the check-in statistics of every trail in `trails` and stack them
# into one dataframe, tagging each row with the trail it belongs to.

trail_frames = []
for trail in trails:
    df_trail = trailForksScrapper.fetchTrailStats(trail)
    # A None result means there is nothing to add for this trail.
    if df_trail is not None:
        df_trail['trail'] = trail
        trail_frames.append(df_trail)

# Concatenate once at the end: calling pd.concat inside the loop re-copies all
# previously collected rows on every iteration. With nothing scraped, fall
# back to an empty frame (pd.concat raises on an empty list).
df = pd.concat(trail_frames) if trail_frames else pd.DataFrame()

df

  checkins_per_date['Check-Ins'] = checkins_per_date['Check-Ins'].str.replace('\W', '')
  checkins_per_date['Check-Ins'] = checkins_per_date['Check-Ins'].str.replace('\W', '')
  checkins_per_date['Check-Ins'] = checkins_per_date['Check-Ins'].str.replace('\W', '')
  checkins_per_date['Check-Ins'] = checkins_per_date['Check-Ins'].str.replace('\W', '')
  checkins_per_date['Check-Ins'] = checkins_per_date['Check-Ins'].str.replace('\W', '')
  checkins_per_date['Check-Ins'] = checkins_per_date['Check-Ins'].str.replace('\W', '')
  checkins_per_date['Check-Ins'] = checkins_per_date['Check-Ins'].str.replace('\W', '')
  checkins_per_date['Check-Ins'] = checkins_per_date['Check-Ins'].str.replace('\W', '')
  checkins_per_date['Check-Ins'] = checkins_per_date['Check-Ins'].str.replace('\W', '')
  checkins_per_date['Check-Ins'] = checkins_per_date['Check-Ins'].str.replace('\W', '')
  checkins_per_date['Check-Ins'] = checkins_per_date['Check-Ins'].str.replace('\W', '')
  checkins_per_date['Check-Ins']

Unnamed: 0,Period,Check-Ins,trail
0,1am,31,monument-trail
1,2am,18,monument-trail
2,3am,37,monument-trail
3,4am,28,monument-trail
4,5am,6,monument-trail
...,...,...,...
2868,2022109,13,black-mountain-trail-lower
2869,20221010,5,black-mountain-trail-lower
2870,20221011,5,black-mountain-trail-lower
2871,20221012,39,black-mountain-trail-lower


## Merge all the tables to create one dataset

Cleaning up the datasets:
- Remove trails whose title is missing (NaN) or contains "unknown"
- Remove unnamed columns
- Keep only rows with more than 0 check-ins

In [7]:
# Columns carried forward from the Trailforks listing.
trail_columns = ['title','riding area','rating','distance','descent','climb','popularity_score']

# Drop rows without a title, then rows whose title contains 'unknown'.
titled = trailforks_trails.dropna(subset='title')
named = titled.loc[~titled['title'].str.contains('unknown')]

# Keep the selected columns and shorten '--' to '-' in the title
# (a single pass, so a run of three dashes still leaves '--').
trailforks_trails = named[trail_columns].assign(
    title=lambda frame: frame['title'].str.replace('--','-')
)
trailforks_trails

Unnamed: 0,title,riding area,rating,distance,descent,climb,popularity_score
3,iron-peak,Teanaway,,3 miles,-519 ft,"2,203 ft",0
4,frog-trail,Pilchuck Tree Farm,,485 ft,-9 ft,52 ft,0
5,haida-s-trail,Pilchuck Tree Farm,,"2,924 ft",-191 ft,146 ft,0
6,stephanie-s-sweet-spot,Pilchuck Tree Farm,,"1,277 ft",-208 ft,26 ft,0
7,sandy-s-trail,Pilchuck Tree Farm,,908 ft,-54 ft,9 ft,0
...,...,...,...,...,...,...,...
9495,black-mountain-trail-lower,Pisgah Ranger District,,1 mile,-563 ft,,100
9496,hickory-mountain-loop,Dupont State Recreational Forest,,1 mile,-257 ft,261 ft,100
9497,the-jam,Rocky Knob Park,,682 ft,-20 ft,18 ft,100
9498,panda,U.S. National Whitewater Center,,1 mile,-20 ft,19 ft,100


In [55]:
# Build a lowercase, dash-separated slug from the WTA hike name (TITLE) so it
# can be matched against the Trailforks titles.
#
# The regex flag is spelled out on every step: the two character-class
# patterns only work as regular expressions, and pandas has announced that the
# default of `regex` changes from True to False (see the FutureWarning this
# cell used to emit). Raw strings avoid the invalid '\W' escape.
wta_trails['title'] = (
    wta_trails['TITLE']
    .str.replace(r'\W', ' ', regex=True)          # punctuation -> space
    .str.lower()
    .str.replace(r'[^a-z A-Z]', '', regex=True)   # drop digits / underscores
    .str.strip()
    .str.replace(' ', '-', regex=False)
    .str.replace('--', '-', regex=False)
)
# Why some titles still contain '--': a name such as 'Ranger Hole - Interrorem'
# has ' - ' turned into three spaces, i.e. three dashes, and one pass of
# '--' -> '-' replaces a single non-overlapping pair, leaving '--'.
# Collapsing whole runs (regex '-+') would give single dashes, but the
# Trailforks titles in this notebook get the same single-pass treatment, so
# the change has to be made on every join key together or they stop matching.
#wta_trails = wta_trails.drop(columns=['Unnamed: 0'])
wta_trails

  wta_trails['title'] = wta_trails['title'].str.replace('\W', ' ').str.lower()
  wta_trails['title'] = wta_trails['title'].str.replace('[^a-z A-Z]', '').str.strip().str.replace(' ','-')


Unnamed: 0,TITLE,REGION,DISTANCE,DIST_TYPE,GAIN,HIGHEST,RATING,RATING_COUNT,LATITUDE,LONGITUDE,REPORT_DATE,REPORT_COUNT,URL,title
0,Raven Roost,Mount Rainier Area,,,,,2.75,4,47.008852,-121.115564,2018-10-07,6,https://www.wta.org/go-hiking/hikes/raven-roost,raven-roost
1,Ranger Hole - Interrorem Nature Trail,Olympic Peninsula,2.1,roundtrip,200.0,320.0,4.33,12,47.680685,-122.992312,2021-04-09,71,https://www.wta.org/go-hiking/hikes/ranger-hole,ranger-hole--interrorem-nature-trail
2,Rainbow Ridge,North Cascades,,,1700.0,5300.0,2.88,8,48.763431,-121.699677,2020-09-28,30,https://www.wta.org/go-hiking/hikes/rainbow-ridge,rainbow-ridge
3,Pyramid Mountain,Central Cascades,18.0,roundtrip,3000.0,8243.0,3.00,4,48.018056,-120.505278,2020-08-01,44,https://www.wta.org/go-hiking/hikes/pyramid-mo...,pyramid-mountain
4,Pilot Ridge,North Cascades,,,,,2.80,5,48.044842,-121.258825,2020-08-29,67,https://www.wta.org/go-hiking/hikes/pilot-ridge,pilot-ridge
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3085,Sammamish Valley Park,Puget Sound and Islands,,,,,1.00,1,47.704390,-122.153077,2020-04-25,1,https://www.wta.org/go-hiking/hikes/sammamish-...,sammamish-valley-park
3086,Number Two Canyon Trails,Central Washington,8.0,trails,1600.0,,0.00,0,47.393766,-120.430387,2020-06-02,2,https://www.wta.org/go-hiking/hikes/number-two...,number-two-canyon-trails
3087,Enchanted Forest Trail,Puget Sound and Islands,,,,1080.0,0.00,0,48.988488,-123.041787,2019-05-26,1,https://www.wta.org/go-hiking/hikes/enchanted-...,enchanted-forest-trail
3088,Badger Mountain - The Langdon Trail,Central Washington,6.0,roundtrip,1300.0,1570.0,0.00,0,46.242064,-119.342436,2017-04-06,2,https://www.wta.org/go-hiking/hikes/badger-mou...,badger-mountain--the-langdon-trail


In [66]:
# Keep the three check-in columns, and only the rows that recorded at least
# one check-in.
checkin_columns = ['Period','Check-Ins','trail']
has_checkins = trails_checkins['Check-Ins'].gt(0)
trails_checkins = trails_checkins.loc[has_checkins, checkin_columns]
trails_checkins

Unnamed: 0,Period,Check-Ins,trail
0,1am,1,highland
1,9am,1,highland
2,10am,1,highland
3,11am,2,highland
4,12pm,3,highland
...,...,...,...
10709412,2022109,13,black-mountain-lower
10709413,20221010,5,black-mountain-lower
10709414,20221011,5,black-mountain-lower
10709415,20221012,39,black-mountain-lower


In [67]:
def _join_key(titles):
    """Normalise a Series of trail slugs into the key used to join the tables.

    Removes every literal '-trail' substring and then collapses any run of
    dashes into a single '-'. Collapsing runs matters because the earlier
    single-pass '--' -> '-' replacements leave '--' behind wherever a title
    had three or more consecutive dashes.
    """
    return (
        titles
        .str.replace('-trail', '', regex=False)
        .str.replace(r'-+', '-', regex=True)
    )


# NOTE: dataset_1 / dataset_2 are the same objects as trailforks_trails /
# wta_trails (no copy is made), so their 'title' columns are rewritten in
# place. Kept that way because later cells may rely on the normalised titles.
dataset_1 = trailforks_trails
dataset_2 = wta_trails
dataset_1['title'] = _join_key(dataset_1['title'])
dataset_2['title'] = _join_key(dataset_2['title'])

# .assign returns a new frame, so the column slice is never written into
# (avoids SettingWithCopyWarning) and trails_checkins itself stays untouched.
dataset_3 = trails_checkins[['Period','Check-Ins','trail']].assign(
    trail=lambda frame: _join_key(frame['trail'])
)

# Left join keeps every Trailforks trail, with WTA columns where a title matches.
combined_trails = pd.merge(dataset_1, dataset_2, on='title', how='left')

# Inner join keeps only trails that have check-in rows: one output row per
# (trail, Period). Titles are not guaranteed unique on the left side, so a
# repeated title repeats its check-in rows.
combined_trails = pd.merge(
    combined_trails,
    dataset_3.set_index('trail'),
    left_on='title',
    right_on='trail',
    how='inner',
)
combined_trails

Unnamed: 0,title,riding area,rating,distance,descent,climb,popularity_score,TITLE,REGION,DISTANCE,...,HIGHEST,RATING,RATING_COUNT,LATITUDE,LONGITUDE,REPORT_DATE,REPORT_COUNT,URL,Period,Check-Ins
0,frog,Pilchuck Tree Farm,,485 ft,-9 ft,52 ft,0,,,,...,,,,,,,,,1am,7
1,frog,Pilchuck Tree Farm,,485 ft,-9 ft,52 ft,0,,,,...,,,,,,,,,2am,1
2,frog,Pilchuck Tree Farm,,485 ft,-9 ft,52 ft,0,,,,...,,,,,,,,,4am,16
3,frog,Pilchuck Tree Farm,,485 ft,-9 ft,52 ft,0,,,,...,,,,,,,,,5am,4
4,frog,Pilchuck Tree Farm,,485 ft,-9 ft,52 ft,0,,,,...,,,,,,,,,6am,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2072246,panda,U.S. National Whitewater Center,,1 mile,-20 ft,19 ft,100,,,,...,,,,,,,,,2022103,10
2072247,panda,U.S. National Whitewater Center,,1 mile,-20 ft,19 ft,100,,,,...,,,,,,,,,2022104,9
2072248,panda,U.S. National Whitewater Center,,1 mile,-20 ft,19 ft,100,,,,...,,,,,,,,,2022105,6
2072249,panda,U.S. National Whitewater Center,,1 mile,-20 ft,19 ft,100,,,,...,,,,,,,,,2022109,2


In [69]:
# Number of distinct trail titles in the merged dataset (a missing title, if
# any, counts as one value, as it would with len(unique())).
combined_trails['title'].nunique(dropna=False)

2865