# AllTrails National Park Trail Analysis

# Table of Contents:

## 01. Import Libraries
## 02. Import Data
## 03. Data Cleaning and Consistency Checks 
- Convert the length column from meters to miles
- Convert the elevation column from meters to feet
- Split geolocation to two columns 1) Longitude 2) Latitude
- Create difficulty rating Definition 
- Create new difficulty rating column 
- Calculate number of features and activities and create new columns to display
- Remove combined geolocation column 
- Check the dataset for null values
- Check the dataset for duplicates
- Check the dataset for mixed data types
- Check details of cleaned dataframe

## 04. Export Cleaned Dataframe


# 01. Import Libraries 

In [157]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02. Import Data

In [158]:
# Set the project root folder.
# Defaults to the original local location; set the ALLTRAILS_PROJECT_DIR
# environment variable to run the notebook on another machine without
# editing this cell.
path = os.environ.get(
    'ALLTRAILS_PROJECT_DIR',
    r'/Users/aylaarreguin/10-04-2022 AllTrails National Park Trail Analysis/',
)

In [159]:
path

'/Users/aylaarreguin/10-04-2022 AllTrails National Park Trail Analysis/'

In [160]:
# Load the raw AllTrails export from <path>/Data/Original Data
raw_csv = os.path.join(path, 'Data', 'Original Data', 'AllTrails NP Data.csv')
df = pd.read_csv(raw_csv, index_col=False)

In [161]:
#Exploring the data- First 5 Rows
df.head()

Unnamed: 0,trail_id,name,area_name,city_name,state_name,country_name,_geoloc,popularity,length,elevation_gain,difficulty_rating,route_type,visitor_usage,avg_rating,num_reviews,features,activities,units
0,10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18852, 'lng': -149.63156}",24.8931,15610.598,1161.8976,5,out and back,3.0,5.0,423,"['dogs-no', 'forest', 'river', 'views', 'water...","['birding', 'camping', 'hiking', 'nature-trips...",i
1,10236086,Mount Healy Overlook Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73049, 'lng': -148.91968}",18.0311,6920.162,507.7968,3,out and back,1.0,4.5,260,"['dogs-no', 'forest', 'views', 'wild-flowers',...","['birding', 'camping', 'hiking', 'nature-trips...",i
2,10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18879, 'lng': -149.631}",17.7821,2896.812,81.9912,1,out and back,3.0,4.5,224,"['dogs-no', 'partially-paved', 'views', 'wildl...","['hiking', 'walking']",i
3,10236076,Horseshoe Lake Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73661, 'lng': -148.915}",16.2674,3379.614,119.7864,1,loop,2.0,4.5,237,"['dogs-no', 'forest', 'lake', 'kids', 'views',...","['birding', 'hiking', 'nature-trips', 'trail-r...",i
4,10236082,Triple Lakes Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73319, 'lng': -148.89682}",12.5935,29772.79,1124.712,5,out and back,1.0,4.5,110,"['dogs-no', 'lake', 'views', 'wild-flowers', '...","['birding', 'fishing', 'hiking', 'nature-trips...",i


In [162]:
df.shape

(3313, 18)

In [163]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3313 entries, 0 to 3312
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   trail_id           3313 non-null   int64  
 1   name               3313 non-null   object 
 2   area_name          3313 non-null   object 
 3   city_name          3313 non-null   object 
 4   state_name         3313 non-null   object 
 5   country_name       3313 non-null   object 
 6   _geoloc            3313 non-null   object 
 7   popularity         3313 non-null   float64
 8   length             3313 non-null   float64
 9   elevation_gain     3313 non-null   float64
 10  difficulty_rating  3313 non-null   int64  
 11  route_type         3313 non-null   object 
 12  visitor_usage      3060 non-null   float64
 13  avg_rating         3313 non-null   float64
 14  num_reviews        3313 non-null   int64  
 15  features           3313 non-null   object 
 16  activities         3313 

In [164]:
df.describe()

Unnamed: 0,trail_id,popularity,length,elevation_gain,difficulty_rating,visitor_usage,avg_rating,num_reviews
count,3313.0,3313.0,3313.0,3313.0,3313.0,3060.0,3313.0,3313.0
mean,10185060.0,8.953441,17676.848717,641.805943,3.167824,1.877124,4.173106,70.341986
std,150324.8,8.138323,25497.37664,901.506642,1.702752,0.693641,0.947039,184.11837
min,10000010.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
25%,10028530.0,3.7941,4506.152,116.7384,1.0,1.0,4.0,5.0
50%,10237810.0,6.5731,10621.644,359.9688,3.0,2.0,4.5,17.0
75%,10292340.0,11.2556,21404.222,833.9328,5.0,2.0,4.5,57.0
max,10545420.0,84.6229,529794.728,14029.944,7.0,4.0,5.0,3903.0


# 03. Data Cleaning and Consistency Checks

In [165]:
# Create new columns that convert length from meters to miles and elevation
# gain from meters to feet. The whole column is multiplied at once (vectorized)
# instead of calling a Python lambda for every row; the results are the same.
MILES_PER_METER = 0.000621371
FEET_PER_METER = 3.28084

df['length_miles'] = df['length'] * MILES_PER_METER
df['elevation_gain_feet'] = df['elevation_gain'] * FEET_PER_METER

In [166]:
# The converted columns replace the metric originals, so remove both of them
df.drop(columns=['length', 'elevation_gain'], inplace=True)

In [167]:
# Split _geoloc into two numeric columns: 1) latitude and 2) longitude.
# Each value is text such as "{'lat': 60.18852, 'lng': -149.63156}". The number
# is looked up by its key name rather than by its position in the text, so the
# result does not depend on key order or on trimming the closing brace, and the
# vectorized .str accessor avoids building a pandas Series for every row.
geoloc_text = df['_geoloc'].astype(str)
coordinate = r"\s*([-+]?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)"
df['lat'] = geoloc_text.str.extract(r"'lat'\s*:" + coordinate, expand=False).astype(float)
df['lng'] = geoloc_text.str.extract(r"'lng'\s*:" + coordinate, expand=False).astype(float)

# A value that does not match the pattern comes back as NaN; stop here rather
# than passing missing coordinates on to the rest of the notebook.
if df[['lat', 'lng']].isna().any().any():
    raise ValueError("Could not parse 'lat'/'lng' from every _geoloc value")

In [168]:
# Give the coordinate columns their full names so they are easier to read
df.rename(columns=dict(lat='latitude', lng='longitude'), inplace=True)

In [169]:
# Remove the combined _geoloc column
df.drop(columns=['_geoloc'], inplace=True)

In [170]:
# Map a numeric difficulty_rating code to its text label
def definition(difficulty_rate):
    """Return the difficulty label for a numeric rating.

    1 -> 'easy', 3 -> 'moderate'; 5 and every other value -> 'hard'.
    """
    if difficulty_rate == 1:
        return 'easy'
    if difficulty_rate == 3:
        return 'moderate'
    # 5 and all remaining values share the same label
    return 'hard'

In [171]:
# Create a new column of the difficulty rating definition.
# Only the difficulty_rating column is needed, so it is mapped directly instead
# of building a row object for every record with DataFrame.apply(axis=1).
df['difficulty'] = df['difficulty_rating'].map(definition)

In [172]:
# Calculate the number of features & activities listed for each trail.
# Both columns hold a list written out as text, e.g. "['hiking', 'walking']",
# so the entries are counted by splitting on commas. An empty list ("[]") has
# no entries and must count as 0; a bare split on ',' would report 1 for it.
def _count_listed(raw_list):
    """Return how many comma-separated entries the bracketed text contains."""
    entries = raw_list.strip().strip('[]').strip()
    return len(entries.split(',')) if entries else 0

## Create two new columns with calculation
df['features_count'] = df['features'].apply(_count_listed)
df['activities_count'] = df['activities'].apply(_count_listed)

In [173]:
# Check df
df.head()

Unnamed: 0,trail_id,name,area_name,city_name,state_name,country_name,popularity,difficulty_rating,route_type,visitor_usage,...,features,activities,units,length_miles,elevation_gain_feet,latitude,longitude,difficulty,features_count,activities_count
0,10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,24.8931,5,out and back,3.0,...,"['dogs-no', 'forest', 'river', 'views', 'water...","['birding', 'camping', 'hiking', 'nature-trips...",i,9.699973,3812.000122,60.18852,-149.63156,hard,7,5
1,10236086,Mount Healy Overlook Trail,Denali National Park,Denali National Park,Alaska,United States,18.0311,3,out and back,1.0,...,"['dogs-no', 'forest', 'views', 'wild-flowers',...","['birding', 'camping', 'hiking', 'nature-trips...",i,4.299988,1666.000053,63.73049,-148.91968,moderate,5,5
2,10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,17.7821,1,out and back,3.0,...,"['dogs-no', 'partially-paved', 'views', 'wildl...","['hiking', 'walking']",i,1.799995,269.000009,60.18879,-149.631,easy,4,2
3,10236076,Horseshoe Lake Trail,Denali National Park,Denali National Park,Alaska,United States,16.2674,1,loop,2.0,...,"['dogs-no', 'forest', 'lake', 'kids', 'views',...","['birding', 'hiking', 'nature-trips', 'trail-r...",i,2.099994,393.000013,63.73661,-148.915,easy,7,5
4,10236082,Triple Lakes Trail,Denali National Park,Denali National Park,Alaska,United States,12.5935,5,out and back,1.0,...,"['dogs-no', 'lake', 'views', 'wild-flowers', '...","['birding', 'fishing', 'hiking', 'nature-trips...",i,18.499948,3690.000118,63.73319,-148.89682,hard,5,5


With research, I was unable to find the metadata for the visitor usage column. I believe this was an old feature, since the dataset was extracted from the AllTrails API. My guess is that it is no longer updated, which is why many trails have a null value. This feature is not important for my analysis, so I will drop this column.

In [174]:
# Remove the visitor_usage column
df.drop(columns=['visitor_usage'], inplace=True)

In [175]:
# units has no usable data for this analysis, so the column is removed
df.drop(columns=['units'], inplace=True)

In [176]:
# Review the dataset: number of missing values in each column
df.isna().sum()

trail_id               0
name                   0
area_name              0
city_name              0
state_name             0
country_name           0
popularity             0
difficulty_rating      0
route_type             0
avg_rating             0
num_reviews            0
features               0
activities             0
length_miles           0
elevation_gain_feet    0
latitude               0
longitude              0
difficulty             0
features_count         0
activities_count       0
dtype: int64

In [177]:
# Collect every row that exactly repeats an earlier row
duplicate = df.loc[df.duplicated()]

In [178]:
duplicate

Unnamed: 0,trail_id,name,area_name,city_name,state_name,country_name,popularity,difficulty_rating,route_type,avg_rating,num_reviews,features,activities,length_miles,elevation_gain_feet,latitude,longitude,difficulty,features_count,activities_count


In [179]:
# Check for mixed data types: print the name of every column in which some
# value has a different Python type than the column's first value.
# Series.map is used because DataFrame.applymap is deprecated in recent pandas
# releases; the comparison itself is unchanged.
for col in df.columns.tolist():
    value_types = df[col].map(type)
    weird = value_types != value_types.iloc[0]
    if weird.any():
        print(col)

In [180]:
# Check details of cleaned dataframe 
df.shape

(3313, 20)

In [181]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3313 entries, 0 to 3312
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   trail_id             3313 non-null   int64  
 1   name                 3313 non-null   object 
 2   area_name            3313 non-null   object 
 3   city_name            3313 non-null   object 
 4   state_name           3313 non-null   object 
 5   country_name         3313 non-null   object 
 6   popularity           3313 non-null   float64
 7   difficulty_rating    3313 non-null   int64  
 8   route_type           3313 non-null   object 
 9   avg_rating           3313 non-null   float64
 10  num_reviews          3313 non-null   int64  
 11  features             3313 non-null   object 
 12  activities           3313 non-null   object 
 13  length_miles         3313 non-null   float64
 14  elevation_gain_feet  3313 non-null   float64
 15  latitude             3313 non-null   f

In [182]:
df.describe()

Unnamed: 0,trail_id,popularity,difficulty_rating,avg_rating,num_reviews,length_miles,elevation_gain_feet,latitude,longitude,features_count,activities_count
count,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0
mean,10185060.0,8.953441,3.167824,4.173106,70.341986,10.983881,2105.662609,39.67152,-106.38817,4.930878,4.012979
std,150324.8,8.138323,1.702752,0.947039,184.11837,15.84333,2957.699051,5.870789,17.75683,1.868662,1.687411
min,10000010.0,0.0,1.0,0.0,0.0,0.0,0.0,19.08609,-156.25357,1.0,1.0
25%,10028530.0,3.7941,1.0,4.0,5.0,2.799992,383.000012,36.33558,-118.77161,4.0,3.0
50%,10237810.0,6.5731,3.0,4.5,17.0,6.599982,1181.000038,38.46838,-110.79927,5.0,4.0
75%,10292340.0,11.2556,5.0,4.5,57.0,13.299963,2736.000088,44.32042,-101.92792,6.0,5.0
max,10545420.0,84.6229,7.0,5.0,3903.0,329.19908,46030.001473,63.88374,-68.03679,12.0,12.0


In [183]:
df.head()

Unnamed: 0,trail_id,name,area_name,city_name,state_name,country_name,popularity,difficulty_rating,route_type,avg_rating,num_reviews,features,activities,length_miles,elevation_gain_feet,latitude,longitude,difficulty,features_count,activities_count
0,10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,24.8931,5,out and back,5.0,423,"['dogs-no', 'forest', 'river', 'views', 'water...","['birding', 'camping', 'hiking', 'nature-trips...",9.699973,3812.000122,60.18852,-149.63156,hard,7,5
1,10236086,Mount Healy Overlook Trail,Denali National Park,Denali National Park,Alaska,United States,18.0311,3,out and back,4.5,260,"['dogs-no', 'forest', 'views', 'wild-flowers',...","['birding', 'camping', 'hiking', 'nature-trips...",4.299988,1666.000053,63.73049,-148.91968,moderate,5,5
2,10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,17.7821,1,out and back,4.5,224,"['dogs-no', 'partially-paved', 'views', 'wildl...","['hiking', 'walking']",1.799995,269.000009,60.18879,-149.631,easy,4,2
3,10236076,Horseshoe Lake Trail,Denali National Park,Denali National Park,Alaska,United States,16.2674,1,loop,4.5,237,"['dogs-no', 'forest', 'lake', 'kids', 'views',...","['birding', 'hiking', 'nature-trips', 'trail-r...",2.099994,393.000013,63.73661,-148.915,easy,7,5
4,10236082,Triple Lakes Trail,Denali National Park,Denali National Park,Alaska,United States,12.5935,5,out and back,4.5,110,"['dogs-no', 'lake', 'views', 'wild-flowers', '...","['birding', 'fishing', 'hiking', 'nature-trips...",18.499948,3690.000118,63.73319,-148.89682,hard,5,5


# 04. Export Cleaned Dataframe

In [185]:
# Write the cleaned dataframe to <path>/Data/Prepared Data/AllTrails_cleaned.csv
cleaned_csv = os.path.join(path, 'Data', 'Prepared Data', 'AllTrails_cleaned.csv')
df.to_csv(cleaned_csv)