# Importing Dependencies

In [6]:
# import dependencies
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
import csv
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

## Exploring CSVs

In [7]:
# Load the popularity and biodiversity data
file_path = Path('Resources/parks_pop_bio.csv')
pop_bio_df = pd.read_csv(file_path)
pop_bio_df.head()

Unnamed: 0,park_code,park_name,state,acres,latitude,longitude,trail_count,avg_popularity,species_count
0,ACAD,Acadia National Park,ME,47390,44.35,-68.21,179,8.376,1709
1,ARCH,Arches National Park,UT,76519,38.68,-109.57,48,13.462,1048
2,BADL,Badlands National Park,SD,242756,43.75,-102.5,10,13.141,1389
3,BIBE,Big Bend National Park,TX,801163,29.25,-103.25,70,7.176,2269
4,BISC,Biscayne National Park,FL,172924,25.65,-80.08,4,4.055,1726


In [8]:
# Load the trail csv
file_path = Path('Resources/parks_trails_final.csv')
trail_df = pd.read_csv(file_path)
trail_df.head()

Unnamed: 0,park_code,park_name,state,acres,latitude,longitude,trail_count,avg_popularity
0,ACAD,Acadia National Park,ME,47390,44.35,-68.21,179.0,8.37592
1,ARCH,Arches National Park,UT,76519,38.68,-109.57,48.0,13.462423
2,BADL,Badlands National Park,SD,242756,43.75,-102.5,10.0,13.14104
3,BIBE,Big Bend National Park,TX,801163,29.25,-103.25,70.0,7.17638
4,BISC,Biscayne National Park,FL,172924,25.65,-80.08,4.0,4.0552


In [37]:
# Load the csv with trails, bio, and pop

file_path = "./Resources/trails_bio_combined.csv"
bio_trails_df = pd.read_csv(file_path, low_memory=False)
bio_trails_df.tail()


Unnamed: 0,trail_id,name,park_name,city_name,state_name,country_name,_geoloc,popularity,length,elevation_gain,difficulty_rating,route_type,visitor_usage,avg_rating,num_reviews,features,activities,units,species_count,bio_category_count
3091,10008302,Silversword Loop Via Halemau'u Trail,Haleakala National Park,Kula,Maui,Hawaii,"{'lat': 20.75275, 'lng': -156.22884}",9.3861,20116.75,1105.8144,5,loop,2.0,4.5,43,"['dogs-no', 'views', 'wild-flowers']","['birding', 'hiking', 'nature-trips']",m,2425,13
3092,10236001,Keonehe'ehe'e Trail,Haleakala National Park,Kula,Maui,Hawaii,"{'lat': 20.714480000000002, 'lng': -156.25072}",9.1555,28324.384,1171.956,5,out and back,2.0,5.0,22,"['dogs-no', 'views', 'wildlife']","['backpacking', 'camping', 'hiking']",m,2425,13
3093,10258707,Red Hill Overlook Summit Trail,Haleakala National Park,Kula,Maui,Hawaii,"{'lat': 20.71007, 'lng': -156.25357}",8.5066,321.868,3.9624,1,out and back,,4.5,31,"['dogs-no', 'kids', 'views']","['hiking', 'walking']",m,2425,13
3094,10014989,Kaupo Trail,Haleakala National Park,Kula,Maui,Hawaii,"{'lat': 20.64981, 'lng': -156.137}",8.324,19312.08,1670.9136,5,out and back,1.0,4.0,8,"['dogs-no', 'views', 'wildlife']",['hiking'],m,2425,13
3095,10259465,Ka Lu'u o ka O'o Cinder Cone via Crater and Sl...,Haleakala National Park,Kula,Maui,Hawaii,"{'lat': 20.71449, 'lng': -156.25085}",2.4176,8368.568,510.8448,3,loop,2.0,4.5,45,['views'],['hiking'],m,2425,13


In [38]:
# get the number of features and activities
bio_trails_df['features_list'] = bio_trails_df['features'].str.len()
bio_trails_df['activities_list'] = bio_trails_df['activities'].str.len()
bio_trails_df.head()

Unnamed: 0,trail_id,name,park_name,city_name,state_name,country_name,_geoloc,popularity,length,elevation_gain,...,visitor_usage,avg_rating,num_reviews,features,activities,units,species_count,bio_category_count,features_list,activities_list
0,10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18852, 'lng': -149.63156}",24.8931,15610.598,1161.8976,...,3.0,5.0,423,"['dogs-no', 'forest', 'river', 'views', 'water...","['birding', 'camping', 'hiking', 'nature-trips...",i,1015,9,80,65
1,10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18879, 'lng': -149.631}",17.7821,2896.812,81.9912,...,3.0,4.5,224,"['dogs-no', 'partially-paved', 'views', 'wildl...","['hiking', 'walking']",i,1015,9,51,21
2,10187810,Bright Angel Trail to Bright Angel Campground ...,Grand Canyon National Park,Grand Canyon,Arizona,United States,"{'lat': 36.05735, 'lng': -112.14381}",37.4791,28485.318,1525.8288,...,3.0,5.0,670,"['dogs-no', 'river', 'views', 'wild-flowers', ...","['backpacking', 'camping', 'hiking', 'nature-t...",i,2604,10,57,52
3,10016964,South Kaibab Trail to Cedar Ridge,Grand Canyon National Park,Grand Canyon,Arizona,United States,"{'lat': 36.05346, 'lng': -112.08361}",36.2709,4988.954,358.7496,...,3.0,5.0,489,"['cave', 'dogs-no', 'forest', 'partially-paved...","['birding', 'camping', 'hiking', 'nature-trips...",i,2604,10,85,79
4,10237812,Three-Mile Resthouse via Bright Angel Trail,Grand Canyon National Park,Grand Canyon,Arizona,United States,"{'lat': 36.05701, 'lng': -112.14414}",33.256,8690.436,635.8128,...,3.0,4.5,454,"['dogs-no', 'river', 'views', 'wild-flowers', ...","['backpacking', 'birding', 'hiking', 'nature-t...",i,2604,10,57,52


In [39]:
bio_trails_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3096 entries, 0 to 3095
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   trail_id            3096 non-null   int64  
 1   name                3096 non-null   object 
 2   park_name           3096 non-null   object 
 3   city_name           3096 non-null   object 
 4   state_name          3096 non-null   object 
 5   country_name        3096 non-null   object 
 6   _geoloc             3096 non-null   object 
 7   popularity          3096 non-null   float64
 8   length              3096 non-null   float64
 9   elevation_gain      3096 non-null   float64
 10  difficulty_rating   3096 non-null   int64  
 11  route_type          3096 non-null   object 
 12  visitor_usage       2861 non-null   float64
 13  avg_rating          3096 non-null   float64
 14  num_reviews         3096 non-null   int64  
 15  features            3096 non-null   object 
 16  activi

In [40]:
# drop nulls
bio_trails_df.dropna().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2861 entries, 0 to 3095
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   trail_id            2861 non-null   int64  
 1   name                2861 non-null   object 
 2   park_name           2861 non-null   object 
 3   city_name           2861 non-null   object 
 4   state_name          2861 non-null   object 
 5   country_name        2861 non-null   object 
 6   _geoloc             2861 non-null   object 
 7   popularity          2861 non-null   float64
 8   length              2861 non-null   float64
 9   elevation_gain      2861 non-null   float64
 10  difficulty_rating   2861 non-null   int64  
 11  route_type          2861 non-null   object 
 12  visitor_usage       2861 non-null   float64
 13  avg_rating          2861 non-null   float64
 14  num_reviews         2861 non-null   int64  
 15  features            2861 non-null   object 
 16  activi

In [41]:
# Find correlations
bio_trails_df.corr()

Unnamed: 0,trail_id,popularity,length,elevation_gain,difficulty_rating,visitor_usage,avg_rating,num_reviews,species_count,bio_category_count,features_list,activities_list
trail_id,1.0,-0.331849,0.058852,0.067561,0.061459,-0.143285,-0.059919,-0.191822,0.052723,0.04058,-0.233209,-0.137174
popularity,-0.331849,1.0,-0.156075,-0.117292,-0.059885,0.471927,0.288432,0.862041,-0.050053,-0.028675,0.362597,0.109853
length,0.058852,-0.156075,1.0,0.785841,0.339488,-0.070607,-0.077388,-0.117695,0.055403,0.00755,0.071711,0.137098
elevation_gain,0.067561,-0.117292,0.785841,1.0,0.548462,-0.042267,-0.013381,-0.094417,0.087568,-0.019047,0.076906,0.120493
difficulty_rating,0.061459,-0.059885,0.339488,0.548462,1.0,-0.105551,0.082256,-0.032098,0.084656,0.022073,-0.008679,0.062761
visitor_usage,-0.143285,0.471927,-0.070607,-0.042267,-0.105551,1.0,0.133708,0.407635,-0.056364,-0.060644,0.228743,0.032526
avg_rating,-0.059919,0.288432,-0.077388,-0.013381,0.082256,0.133708,1.0,0.149689,-0.052054,-0.015865,0.104556,0.03347
num_reviews,-0.191822,0.862041,-0.117695,-0.094417,-0.032098,0.407635,0.149689,1.0,-0.018194,-0.017118,0.269063,0.067774
species_count,0.052723,-0.050053,0.055403,0.087568,0.084656,-0.056364,-0.052054,-0.018194,1.0,0.656547,0.059259,0.102796
bio_category_count,0.04058,-0.028675,0.00755,-0.019047,0.022073,-0.060644,-0.015865,-0.017118,0.656547,1.0,0.073862,0.075488


In [45]:
#drop columns
updated_bt_df = bio_trails_df.drop(columns=['features','activities','_geoloc','units'], axis=1)
updated_bt_df.head()

Unnamed: 0,trail_id,name,park_name,city_name,state_name,country_name,popularity,length,elevation_gain,difficulty_rating,route_type,visitor_usage,avg_rating,num_reviews,species_count,bio_category_count,features_list,activities_list
0,10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,24.8931,15610.598,1161.8976,5,out and back,3.0,5.0,423,1015,9,80,65
1,10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,17.7821,2896.812,81.9912,1,out and back,3.0,4.5,224,1015,9,51,21
2,10187810,Bright Angel Trail to Bright Angel Campground ...,Grand Canyon National Park,Grand Canyon,Arizona,United States,37.4791,28485.318,1525.8288,5,out and back,3.0,5.0,670,2604,10,57,52
3,10016964,South Kaibab Trail to Cedar Ridge,Grand Canyon National Park,Grand Canyon,Arizona,United States,36.2709,4988.954,358.7496,3,out and back,3.0,5.0,489,2604,10,85,79
4,10237812,Three-Mile Resthouse via Bright Angel Trail,Grand Canyon National Park,Grand Canyon,Arizona,United States,33.256,8690.436,635.8128,5,out and back,3.0,4.5,454,2604,10,57,52


In [46]:
# get list of object columns
list(updated_bt_df.select_dtypes('object'))

['name', 'park_name', 'city_name', 'state_name', 'country_name', 'route_type']

In [54]:
# define list for object columns
str_columns = ['name', 'park_name', 'city_name', 'state_name', 'country_name', 'route_type']

In [57]:
# get dummies
bio_trails_encoded = pd.get_dummies(updated_bt_df, columns=str_columns)
bio_trails_encoded

Unnamed: 0,trail_id,popularity,length,elevation_gain,difficulty_rating,visitor_usage,avg_rating,num_reviews,species_count,bio_category_count,...,state_name_Texas,state_name_Utah,state_name_Virginia,state_name_Washington,state_name_Wyoming,country_name_Hawaii,country_name_United States,route_type_loop,route_type_out and back,route_type_point to point
0,10020048,24.8931,15610.598,1161.8976,5,3.0,5.0,423,1015,9,...,0,0,0,0,0,0,1,0,1,0
1,10267857,17.7821,2896.812,81.9912,1,3.0,4.5,224,1015,9,...,0,0,0,0,0,0,1,0,1,0
2,10187810,37.4791,28485.318,1525.8288,5,3.0,5.0,670,2604,10,...,0,0,0,0,0,0,1,0,1,0
3,10016964,36.2709,4988.954,358.7496,3,3.0,5.0,489,2604,10,...,0,0,0,0,0,0,1,0,1,0
4,10237812,33.2560,8690.436,635.8128,5,3.0,4.5,454,2604,10,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3091,10008302,9.3861,20116.750,1105.8144,5,2.0,4.5,43,2425,13,...,0,0,0,0,0,1,0,1,0,0
3092,10236001,9.1555,28324.384,1171.9560,5,2.0,5.0,22,2425,13,...,0,0,0,0,0,1,0,0,1,0
3093,10258707,8.5066,321.868,3.9624,1,,4.5,31,2425,13,...,0,0,0,0,0,1,0,0,1,0
3094,10014989,8.3240,19312.080,1670.9136,5,1.0,4.0,8,2425,13,...,0,0,0,0,0,1,0,0,1,0


In [60]:
# check for nulls
bio_trails_encoded.dropna().reset_index()

Unnamed: 0,index,trail_id,popularity,length,elevation_gain,difficulty_rating,visitor_usage,avg_rating,num_reviews,species_count,...,state_name_Texas,state_name_Utah,state_name_Virginia,state_name_Washington,state_name_Wyoming,country_name_Hawaii,country_name_United States,route_type_loop,route_type_out and back,route_type_point to point
0,0,10020048,24.8931,15610.598,1161.8976,5,3.0,5.0,423,1015,...,0,0,0,0,0,0,1,0,1,0
1,1,10267857,17.7821,2896.812,81.9912,1,3.0,4.5,224,1015,...,0,0,0,0,0,0,1,0,1,0
2,2,10187810,37.4791,28485.318,1525.8288,5,3.0,5.0,670,2604,...,0,0,0,0,0,0,1,0,1,0
3,3,10016964,36.2709,4988.954,358.7496,3,3.0,5.0,489,2604,...,0,0,0,0,0,0,1,0,1,0
4,4,10237812,33.2560,8690.436,635.8128,5,3.0,4.5,454,2604,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2856,3090,10236006,9.5362,804.670,32.9184,1,2.0,4.5,39,2425,...,0,0,0,0,0,1,0,0,1,0
2857,3091,10008302,9.3861,20116.750,1105.8144,5,2.0,4.5,43,2425,...,0,0,0,0,0,1,0,1,0,0
2858,3092,10236001,9.1555,28324.384,1171.9560,5,2.0,5.0,22,2425,...,0,0,0,0,0,1,0,0,1,0
2859,3094,10014989,8.3240,19312.080,1670.9136,5,1.0,4.0,8,2425,...,0,0,0,0,0,1,0,0,1,0


In [62]:
# Create our features
X = bio_trails_encoded.copy()


# Create our target
y = bio_trails_encoded['popularity']
X.head()

Unnamed: 0,trail_id,popularity,length,elevation_gain,difficulty_rating,visitor_usage,avg_rating,num_reviews,species_count,bio_category_count,...,state_name_Texas,state_name_Utah,state_name_Virginia,state_name_Washington,state_name_Wyoming,country_name_Hawaii,country_name_United States,route_type_loop,route_type_out and back,route_type_point to point
0,10020048,24.8931,15610.598,1161.8976,5,3.0,5.0,423,1015,9,...,0,0,0,0,0,0,1,0,1,0
1,10267857,17.7821,2896.812,81.9912,1,3.0,4.5,224,1015,9,...,0,0,0,0,0,0,1,0,1,0
2,10187810,37.4791,28485.318,1525.8288,5,3.0,5.0,670,2604,10,...,0,0,0,0,0,0,1,0,1,0
3,10016964,36.2709,4988.954,358.7496,3,3.0,5.0,489,2604,10,...,0,0,0,0,0,0,1,0,1,0
4,10237812,33.256,8690.436,635.8128,5,3.0,4.5,454,2604,10,...,0,0,0,0,0,0,1,0,1,0


In [63]:
X.describe()

Unnamed: 0,trail_id,popularity,length,elevation_gain,difficulty_rating,visitor_usage,avg_rating,num_reviews,species_count,bio_category_count,...,state_name_Texas,state_name_Utah,state_name_Virginia,state_name_Washington,state_name_Wyoming,country_name_Hawaii,country_name_United States,route_type_loop,route_type_out and back,route_type_point to point
count,3096.0,3096.0,3096.0,3096.0,3096.0,2861.0,3096.0,3096.0,3096.0,3096.0,...,3096.0,3096.0,3096.0,3096.0,3096.0,3096.0,3096.0,3096.0,3096.0,3096.0
mean,10185010.0,9.005475,17239.846825,611.826438,3.163437,1.882559,4.175388,72.209302,2793.709625,10.32655,...,0.030362,0.094315,0.060401,0.103359,0.100452,0.015827,0.984173,0.325904,0.575258,0.098837
std,149897.9,8.213758,24727.732029,834.983126,1.687864,0.697282,0.933796,188.897394,1469.109761,3.267446,...,0.171608,0.292314,0.238266,0.304477,0.300651,0.124826,0.124826,0.468787,0.494384,0.298492
min,10000010.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,779.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,10028290.0,3.823275,4506.152,118.6434,1.0,1.0,4.0,6.0,1712.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,10238920.0,6.60045,10621.644,359.3592,3.0,2.0,4.5,18.0,2258.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
75%,10291470.0,11.2803,20921.42,810.768,5.0,2.0,4.5,59.0,3900.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
max,10545420.0,84.6229,529794.728,14029.944,7.0,4.0,5.0,3903.0,6268.0,14.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [64]:
# Check the balance of our target values
y.value_counts()

0.0000    4
0.3724    2
3.9424    2
4.0631    2
3.8410    2
         ..
5.9372    1
5.7860    1
5.7852    1
5.7394    1
2.4176    1
Name: popularity, Length: 3075, dtype: int64