In [1]:
import pandas as pd
import numpy as np
import csv

# Parks & Trails Data

### Parks Data

In [2]:
# Read the raw national-parks table (UTF-8 CSV) from the Resources folder
# and preview its first rows.

parks_data_to_load = "./Resources/parks.csv"
parks_df = pd.read_csv(filepath_or_buffer=parks_data_to_load, encoding="utf-8")
parks_df.head(5)

Unnamed: 0,Park Code,Park Name,State,Acres,Latitude,Longitude
0,ACAD,Acadia National Park,ME,47390,44.35,-68.21
1,ARCH,Arches National Park,UT,76519,38.68,-109.57
2,BADL,Badlands National Park,SD,242756,43.75,-102.5
3,BIBE,Big Bend National Park,TX,801163,29.25,-103.25
4,BISC,Biscayne National Park,FL,172924,25.65,-80.08


In [3]:
# Give the parks columns snake_case names (lowercase, words joined by underscores).
# rename() returns a new frame, so parks_df keeps its original headers.

parks_column_names = {
    "Park Code": "park_code",
    "Park Name": "park_name",
    "State": "state",
    "Acres": "acres",
    "Latitude": "latitude",
    "Longitude": "longitude",
}
renamed_parks_df = parks_df.rename(columns=parks_column_names)
renamed_parks_df

Unnamed: 0,park_code,park_name,state,acres,latitude,longitude
0,ACAD,Acadia National Park,ME,47390,44.35,-68.21
1,ARCH,Arches National Park,UT,76519,38.68,-109.57
2,BADL,Badlands National Park,SD,242756,43.75,-102.5
3,BIBE,Big Bend National Park,TX,801163,29.25,-103.25
4,BISC,Biscayne National Park,FL,172924,25.65,-80.08
5,BLCA,Black Canyon of the Gunnison National Park,CO,32950,38.57,-107.72
6,BRCA,Bryce Canyon National Park,UT,35835,37.57,-112.18
7,CANY,Canyonlands National Park,UT,337598,38.2,-109.93
8,CARE,Capitol Reef National Park,UT,241904,38.2,-111.17
9,CAVE,Carlsbad Caverns National Park,NM,46766,32.17,-104.44


In [4]:
# Remove any words after the word "Park" in park_name
# (illustration: "X National Park and Preserve" would become "X National Park").

# Greedy capture: keeps everything up to and including the last "Park" in the name.
p = r'(.*Park)'
original_park_names = renamed_parks_df["park_name"]

# expand=False makes extract() return a Series instead of a one-column DataFrame,
# so the result lines up one-to-one with the park_name column.
# A name that does not contain "Park" yields NaN from extract(); fall back to the
# original name for those rows rather than silently blanking them.
park_name_parks = (
    original_park_names
    .str.extract(p, expand=False)
    .fillna(original_park_names)
)

# Check the regex result: every trimmed name and how often it occurs.
park_name_parks.value_counts()

Acadia National Park                          1
Arches National Park                          1
Isle Royale National Park                     1
Joshua Tree National Park                     1
Katmai National Park                          1
Kenai Fjords National Park                    1
Kobuk Valley National Park                    1
Lake Clark National Park                      1
Lassen Volcanic National Park                 1
Mammoth Cave National Park                    1
Mesa Verde National Park                      1
Mount Rainier National Park                   1
North Cascades National Park                  1
Olympic National Park                         1
Petrified Forest National Park                1
Pinnacles National Park                       1
Redwood National Park                         1
Rocky Mountain National Park                  1
Saguaro National Park                         1
Sequoia and Kings Canyon National Park        1
Shenandoah National Park                

In [5]:
# Finish cleaning the parks table: the headers are already standardized, so the
# only remaining step is to overwrite park_name with the trimmed names.
renamed_parks_df["park_name"] = park_name_parks.to_numpy()

# parks_final_df is a second name for the same (now updated) frame, not a copy.
parks_final_df = renamed_parks_df

# Display the finished frame to confirm everything is still there.
parks_final_df

Unnamed: 0,park_code,park_name,state,acres,latitude,longitude
0,ACAD,Acadia National Park,ME,47390,44.35,-68.21
1,ARCH,Arches National Park,UT,76519,38.68,-109.57
2,BADL,Badlands National Park,SD,242756,43.75,-102.5
3,BIBE,Big Bend National Park,TX,801163,29.25,-103.25
4,BISC,Biscayne National Park,FL,172924,25.65,-80.08
5,BLCA,Black Canyon of the Gunnison National Park,CO,32950,38.57,-107.72
6,BRCA,Bryce Canyon National Park,UT,35835,37.57,-112.18
7,CANY,Canyonlands National Park,UT,337598,38.2,-109.93
8,CARE,Capitol Reef National Park,UT,241904,38.2,-111.17
9,CAVE,Carlsbad Caverns National Park,NM,46766,32.17,-104.44


In [6]:
# Save the cleaned parks table as a CSV, leaving out the row index.

merged_data_to_load = "./Resources/parks_final.csv"  # destination path for the export
parks_final_df.to_csv(path_or_buf=merged_data_to_load, index=False)

#### NOTE: parks_final_df is clean and ready to merge with trails data

### Trails Data

In [7]:
# Read the raw trails table (UTF-8 CSV) from trail_data.csv and preview its first rows.
# low_memory=False has pandas infer each column's dtype from the whole file
# instead of chunk by chunk.

trails_data_to_load = "./Resources/trail_data.csv"
trails_df = pd.read_csv(
    filepath_or_buffer=trails_data_to_load,
    encoding="utf-8",
    low_memory=False,
)
trails_df.head()

Unnamed: 0,trail_id,name,area_name,city_name,state_name,country_name,_geoloc,popularity,length,elevation_gain,difficulty_rating,route_type,visitor_usage,avg_rating,num_reviews,features,activities,units
0,10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18852, 'lng': -149.63156}",24.8931,15610.598,1161.8976,5,out and back,3.0,5.0,423,"['dogs-no', 'forest', 'river', 'views', 'water...","['birding', 'camping', 'hiking', 'nature-trips...",i
1,10236086,Mount Healy Overlook Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73049, 'lng': -148.91968}",18.0311,6920.162,507.7968,3,out and back,1.0,4.5,260,"['dogs-no', 'forest', 'views', 'wild-flowers',...","['birding', 'camping', 'hiking', 'nature-trips...",i
2,10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18879, 'lng': -149.631}",17.7821,2896.812,81.9912,1,out and back,3.0,4.5,224,"['dogs-no', 'partially-paved', 'views', 'wildl...","['hiking', 'walking']",i
3,10236076,Horseshoe Lake Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73661, 'lng': -148.915}",16.2674,3379.614,119.7864,1,loop,2.0,4.5,237,"['dogs-no', 'forest', 'lake', 'kids', 'views',...","['birding', 'hiking', 'nature-trips', 'trail-r...",i
4,10236082,Triple Lakes Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73319, 'lng': -148.89682}",12.5935,29772.79,1124.712,5,out and back,1.0,4.5,110,"['dogs-no', 'lake', 'views', 'wild-flowers', '...","['birding', 'fishing', 'hiking', 'nature-trips...",i


In [8]:
# Rename the trails column "area_name" to "park_name" so it carries the same
# name as the park-name column of the parks table.

renamed_trails_df = trails_df.rename({"area_name": "park_name"}, axis="columns")
renamed_trails_df.head(5)

Unnamed: 0,trail_id,name,park_name,city_name,state_name,country_name,_geoloc,popularity,length,elevation_gain,difficulty_rating,route_type,visitor_usage,avg_rating,num_reviews,features,activities,units
0,10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18852, 'lng': -149.63156}",24.8931,15610.598,1161.8976,5,out and back,3.0,5.0,423,"['dogs-no', 'forest', 'river', 'views', 'water...","['birding', 'camping', 'hiking', 'nature-trips...",i
1,10236086,Mount Healy Overlook Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73049, 'lng': -148.91968}",18.0311,6920.162,507.7968,3,out and back,1.0,4.5,260,"['dogs-no', 'forest', 'views', 'wild-flowers',...","['birding', 'camping', 'hiking', 'nature-trips...",i
2,10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18879, 'lng': -149.631}",17.7821,2896.812,81.9912,1,out and back,3.0,4.5,224,"['dogs-no', 'partially-paved', 'views', 'wildl...","['hiking', 'walking']",i
3,10236076,Horseshoe Lake Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73661, 'lng': -148.915}",16.2674,3379.614,119.7864,1,loop,2.0,4.5,237,"['dogs-no', 'forest', 'lake', 'kids', 'views',...","['birding', 'hiking', 'nature-trips', 'trail-r...",i
4,10236082,Triple Lakes Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73319, 'lng': -148.89682}",12.5935,29772.79,1124.712,5,out and back,1.0,4.5,110,"['dogs-no', 'lake', 'views', 'wild-flowers', '...","['birding', 'fishing', 'hiking', 'nature-trips...",i


In [9]:
# Remove any words after the word "Park" in the trails' park_name column.

# Greedy capture: keeps everything up to and including the last "Park" in the name.
p = r'(.*Park)'
original_trail_park_names = renamed_trails_df["park_name"]

# expand=False makes extract() return a Series instead of a one-column DataFrame,
# so the result lines up one-to-one with the park_name column.
# A name that does not contain "Park" yields NaN from extract(); fall back to the
# original name for those rows rather than silently blanking them.
park_name_trails = (
    original_trail_park_names
    .str.extract(p, expand=False)
    .fillna(original_trail_park_names)
)

# Check the regex result: number of trails per trimmed park name.
park_name_trails.value_counts()

Great Smoky Mountains National Park           293
Yosemite National Park                        242
Yellowstone National Park                     228
Rocky Mountain National Park                  207
Shenandoah National Park                      187
Acadia National Park                          179
Olympic National Park                         170
Glacier National Park                         132
Mount Rainier National Park                   130
Joshua Tree National Park                     108
Grand Teton National Park                     102
Grand Canyon National Park                     99
Zion National Park                             94
Sequoia National Park                          90
Death Valley National Park                     79
Canyonlands National Park                      77
Big Bend National Park                         70
Kings Canyon National Park                     61
Saguaro National Park                          60
Lassen Volcanic National Park                  59


In [10]:
# Overwrite the park_name column of renamed_trails_df with the trimmed names
# (the frame is updated in place), then display the result.

renamed_trails_df["park_name"] = park_name_trails.to_numpy()
renamed_trails_df

Unnamed: 0,trail_id,name,park_name,city_name,state_name,country_name,_geoloc,popularity,length,elevation_gain,difficulty_rating,route_type,visitor_usage,avg_rating,num_reviews,features,activities,units
0,10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18852, 'lng': -149.63156}",24.8931,15610.598,1161.8976,5,out and back,3.0,5.0,423,"['dogs-no', 'forest', 'river', 'views', 'water...","['birding', 'camping', 'hiking', 'nature-trips...",i
1,10236086,Mount Healy Overlook Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73049, 'lng': -148.91968}",18.0311,6920.162,507.7968,3,out and back,1.0,4.5,260,"['dogs-no', 'forest', 'views', 'wild-flowers',...","['birding', 'camping', 'hiking', 'nature-trips...",i
2,10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18879, 'lng': -149.631}",17.7821,2896.812,81.9912,1,out and back,3.0,4.5,224,"['dogs-no', 'partially-paved', 'views', 'wildl...","['hiking', 'walking']",i
3,10236076,Horseshoe Lake Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73661, 'lng': -148.915}",16.2674,3379.614,119.7864,1,loop,2.0,4.5,237,"['dogs-no', 'forest', 'lake', 'kids', 'views',...","['birding', 'hiking', 'nature-trips', 'trail-r...",i
4,10236082,Triple Lakes Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73319, 'lng': -148.89682}",12.5935,29772.790,1124.7120,5,out and back,1.0,4.5,110,"['dogs-no', 'lake', 'views', 'wild-flowers', '...","['birding', 'fishing', 'hiking', 'nature-trips...",i
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3308,10008302,Silversword Loop Via Halemau'u Trail,Haleakala National Park,Kula,Maui,Hawaii,"{'lat': 20.75275, 'lng': -156.22884}",9.3861,20116.750,1105.8144,5,loop,2.0,4.5,43,"['dogs-no', 'views', 'wild-flowers']","['birding', 'hiking', 'nature-trips']",m
3309,10236001,Keonehe'ehe'e Trail,Haleakala National Park,Kula,Maui,Hawaii,"{'lat': 20.714480000000002, 'lng': -156.25072}",9.1555,28324.384,1171.9560,5,out and back,2.0,5.0,22,"['dogs-no', 'views', 'wildlife']","['backpacking', 'camping', 'hiking']",m
3310,10258707,Red Hill Overlook Summit Trail,Haleakala National Park,Kula,Maui,Hawaii,"{'lat': 20.71007, 'lng': -156.25357}",8.5066,321.868,3.9624,1,out and back,,4.5,31,"['dogs-no', 'kids', 'views']","['hiking', 'walking']",m
3311,10014989,Kaupo Trail,Haleakala National Park,Kula,Maui,Hawaii,"{'lat': 20.64981, 'lng': -156.137}",8.3240,19312.080,1670.9136,5,out and back,1.0,4.0,8,"['dogs-no', 'views', 'wildlife']",['hiking'],m


In [11]:
# Fold the two separate trail park names "Sequoia National Park" and
# "Kings Canyon National Park" into the single combined name
# "Sequoia and Kings Canyon National Park" using Series.replace.

combined_park_name = 'Sequoia and Kings Canyon National Park'
park_name_replacements = {
    'Kings Canyon National Park': combined_park_name,
    'Sequoia National Park': combined_park_name,
}
renamed_trails_df['park_name'] = renamed_trails_df['park_name'].replace(park_name_replacements)

# trails_final_df is a second name for the same (now updated) frame, not a copy.
trails_final_df = renamed_trails_df
trails_final_df

Unnamed: 0,trail_id,name,park_name,city_name,state_name,country_name,_geoloc,popularity,length,elevation_gain,difficulty_rating,route_type,visitor_usage,avg_rating,num_reviews,features,activities,units
0,10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18852, 'lng': -149.63156}",24.8931,15610.598,1161.8976,5,out and back,3.0,5.0,423,"['dogs-no', 'forest', 'river', 'views', 'water...","['birding', 'camping', 'hiking', 'nature-trips...",i
1,10236086,Mount Healy Overlook Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73049, 'lng': -148.91968}",18.0311,6920.162,507.7968,3,out and back,1.0,4.5,260,"['dogs-no', 'forest', 'views', 'wild-flowers',...","['birding', 'camping', 'hiking', 'nature-trips...",i
2,10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18879, 'lng': -149.631}",17.7821,2896.812,81.9912,1,out and back,3.0,4.5,224,"['dogs-no', 'partially-paved', 'views', 'wildl...","['hiking', 'walking']",i
3,10236076,Horseshoe Lake Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73661, 'lng': -148.915}",16.2674,3379.614,119.7864,1,loop,2.0,4.5,237,"['dogs-no', 'forest', 'lake', 'kids', 'views',...","['birding', 'hiking', 'nature-trips', 'trail-r...",i
4,10236082,Triple Lakes Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73319, 'lng': -148.89682}",12.5935,29772.790,1124.7120,5,out and back,1.0,4.5,110,"['dogs-no', 'lake', 'views', 'wild-flowers', '...","['birding', 'fishing', 'hiking', 'nature-trips...",i
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3308,10008302,Silversword Loop Via Halemau'u Trail,Haleakala National Park,Kula,Maui,Hawaii,"{'lat': 20.75275, 'lng': -156.22884}",9.3861,20116.750,1105.8144,5,loop,2.0,4.5,43,"['dogs-no', 'views', 'wild-flowers']","['birding', 'hiking', 'nature-trips']",m
3309,10236001,Keonehe'ehe'e Trail,Haleakala National Park,Kula,Maui,Hawaii,"{'lat': 20.714480000000002, 'lng': -156.25072}",9.1555,28324.384,1171.9560,5,out and back,2.0,5.0,22,"['dogs-no', 'views', 'wildlife']","['backpacking', 'camping', 'hiking']",m
3310,10258707,Red Hill Overlook Summit Trail,Haleakala National Park,Kula,Maui,Hawaii,"{'lat': 20.71007, 'lng': -156.25357}",8.5066,321.868,3.9624,1,out and back,,4.5,31,"['dogs-no', 'kids', 'views']","['hiking', 'walking']",m
3311,10014989,Kaupo Trail,Haleakala National Park,Kula,Maui,Hawaii,"{'lat': 20.64981, 'lng': -156.137}",8.3240,19312.080,1670.9136,5,out and back,1.0,4.0,8,"['dogs-no', 'views', 'wildlife']",['hiking'],m


In [12]:
# Save the cleaned trails table as a CSV, leaving out the row index.

merged_data_to_load = "./Resources/trails_final.csv"  # destination path for the export
trails_final_df.to_csv(path_or_buf=merged_data_to_load, index=False)

In [13]:
# Count the trails in each park (non-null trail_id values per park_name),
# largest first.
# Sanity check: Sequoia and Kings Canyon National Park should equal 90 + 61 = 151.

trails_by_park = trails_final_df.groupby("park_name")
trail_count = trails_by_park["trail_id"].count().sort_values(ascending=False)
trail_count

park_name
Great Smoky Mountains National Park           293
Yosemite National Park                        242
Yellowstone National Park                     228
Rocky Mountain National Park                  207
Shenandoah National Park                      187
Acadia National Park                          179
Olympic National Park                         170
Sequoia and Kings Canyon National Park        151
Glacier National Park                         132
Mount Rainier National Park                   130
Joshua Tree National Park                     108
Grand Teton National Park                     102
Grand Canyon National Park                     99
Zion National Park                             94
Death Valley National Park                     79
Canyonlands National Park                      77
Big Bend National Park                         70
Saguaro National Park                          60
Lassen Volcanic National Park                  59
Cuyahoga Valley National Park           

In [14]:
# Average popularity score of all trails within each park, highest first.
#
# The popularity column is selected BEFORE aggregating: calling .mean() on the
# whole grouped frame would also try to average the text columns (name,
# city_name, features, ...), which raises TypeError on pandas >= 2.0.

trail_pop_avg = (
    trails_final_df
    .groupby("park_name")["popularity"]
    .mean()
    .sort_values(ascending=False)
)
trail_pop_avg

park_name
Kenai Fjords National Park                    21.337600
Bryce Canyon National Park                    16.617903
Haleakala National Park                       13.723586
Arches National Park                          13.462423
Badlands National Park                        13.141040
Rocky Mountain National Park                  13.007846
Crater Lake National Park                     12.430750
Zion National Park                            12.027557
Mount Rainier National Park                   11.726444
Glacier National Park                         10.061925
Grand Canyon National Park                     9.716001
Pinnacles National Park                        9.421904
Olympic National Park                          9.389508
North Cascades National Park                   9.374660
Grand Teton National Park                      9.373725
Sequoia and Kings Canyon National Park         9.243440
Yosemite National Park                         9.235686
Capitol Reef National Park            

In [15]:
# Create a new df that contains the count of trails and the average popularity
# of each park. Both inputs are Series indexed by park name, so the DataFrame
# constructor aligns them on that index.

trail_count_pop_df = (
    pd.DataFrame({
        "trail_count": trail_count,
        "avg_popularity": trail_pop_avg,
    })
    # Name the index explicitly so that reset_index() always produces a
    # "park_name" column, whatever name the index carried before.
    .rename_axis("park_name")
    # Move the park name out of the index into a regular column.
    .reset_index()
)

trail_count_pop_df

Unnamed: 0,park_name,trail_count,avg_popularity
0,Acadia National Park,179,8.37592
1,Arches National Park,48,13.462423
2,Badlands National Park,10,13.14104
3,Big Bend National Park,70,7.17638
4,Biscayne National Park,4,4.0552
5,Black Canyon of the Gunnison National Park,24,7.690121
6,Bryce Canyon National Park,31,16.617903
7,Canyonlands National Park,77,8.143584
8,Capitol Reef National Park,42,9.006231
9,Carlsbad Caverns National Park,8,5.329462


In [16]:
# Save the per-park trail count / average popularity table as a CSV,
# leaving out the row index.

merged_data_to_load = "./Resources/trail_count_pop.csv"  # destination path for the export
trail_count_pop_df.to_csv(path_or_buf=merged_data_to_load, index=False)

#### NOTE: trail_count_pop_df is clean and ready to merge with parks_final_df

### Merging Parks and Trail Popularity DataFrames

In [17]:
# Attach trail count and average popularity to every park.
# how="left" keeps all rows of parks_final_df; a park whose name has no match
# in trail_count_pop_df gets NaN in the two trail columns.
merged_parks_df = parks_final_df.merge(trail_count_pop_df, how="left", on="park_name")

# Allow up to 100 rows to be rendered when a frame is displayed.
pd.set_option("display.max_rows", 100)
merged_parks_df

Unnamed: 0,park_code,park_name,state,acres,latitude,longitude,trail_count,avg_popularity
0,ACAD,Acadia National Park,ME,47390,44.35,-68.21,179.0,8.37592
1,ARCH,Arches National Park,UT,76519,38.68,-109.57,48.0,13.462423
2,BADL,Badlands National Park,SD,242756,43.75,-102.5,10.0,13.14104
3,BIBE,Big Bend National Park,TX,801163,29.25,-103.25,70.0,7.17638
4,BISC,Biscayne National Park,FL,172924,25.65,-80.08,4.0,4.0552
5,BLCA,Black Canyon of the Gunnison National Park,CO,32950,38.57,-107.72,24.0,7.690121
6,BRCA,Bryce Canyon National Park,UT,35835,37.57,-112.18,31.0,16.617903
7,CANY,Canyonlands National Park,UT,337598,38.2,-109.93,77.0,8.143584
8,CARE,Capitol Reef National Park,UT,241904,38.2,-111.17,42.0,9.006231
9,CAVE,Carlsbad Caverns National Park,NM,46766,32.17,-104.44,8.0,5.329462


In [18]:
# Count the missing (NaN) values in each column of the merged frame;
# NaNs in the trail columns mark parks that found no match in the trails data.

merged_parks_df.isnull().sum(axis=0)

park_code         0
park_name         0
state             0
acres             0
latitude          0
longitude         0
trail_count       4
avg_popularity    4
dtype: int64

In [19]:
# Drop the parks that have no trail data, i.e. rows with NaN in the trail columns.
# subset= restricts the check to those two columns so that a missing value in an
# unrelated column (acres, latitude, ...) cannot silently remove a park.

trail_columns = ["trail_count", "avg_popularity"]
parks_trails_final_df = merged_parks_df.dropna(axis=0, subset=trail_columns)
parks_trails_final_df

Unnamed: 0,park_code,park_name,state,acres,latitude,longitude,trail_count,avg_popularity
0,ACAD,Acadia National Park,ME,47390,44.35,-68.21,179.0,8.37592
1,ARCH,Arches National Park,UT,76519,38.68,-109.57,48.0,13.462423
2,BADL,Badlands National Park,SD,242756,43.75,-102.5,10.0,13.14104
3,BIBE,Big Bend National Park,TX,801163,29.25,-103.25,70.0,7.17638
4,BISC,Biscayne National Park,FL,172924,25.65,-80.08,4.0,4.0552
5,BLCA,Black Canyon of the Gunnison National Park,CO,32950,38.57,-107.72,24.0,7.690121
6,BRCA,Bryce Canyon National Park,UT,35835,37.57,-112.18,31.0,16.617903
7,CANY,Canyonlands National Park,UT,337598,38.2,-109.93,77.0,8.143584
8,CARE,Capitol Reef National Park,UT,241904,38.2,-111.17,42.0,9.006231
9,CAVE,Carlsbad Caverns National Park,NM,46766,32.17,-104.44,8.0,5.329462


In [20]:
# Save the final parks-with-trail-statistics table as a CSV,
# leaving out the row index.

merged_data_to_load = "./Resources/parks_trails_final.csv"  # destination path for the export
parks_trails_final_df.to_csv(path_or_buf=merged_data_to_load, index=False)