# Importing Dependencies

In [216]:
# import dependencies
import csv
import re
import warnings
from collections import Counter
from pathlib import Path

warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [248]:
# Load the detailed trails table and key every row by its trail_id.
detailed_trails_df = (
    pd.read_csv('Resources/detailed_trails.csv')
    .set_index('trail_id')
)

# Preview the first rows of the loaded frame.
detailed_trails_df.head()


Unnamed: 0_level_0,name,park_name,city_name,state_name,country_name,popularity,length,elevation_gain,difficulty_rating,route_type,...,cross-country-skiing,fly-fishing,paddle-sports,skiing,bike-touring,whitewater-kayaking,rails-trails_y,ice-climbing,surfing,snowboarding
trail_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,24.8931,15610.598,1161.8976,5,out and back,...,False,False,False,False,False,False,False,False,False,False
10236086,Mount Healy Overlook Trail,Denali National Park,Denali National Park,Alaska,United States,18.0311,6920.162,507.7968,3,out and back,...,False,False,False,False,False,False,False,False,False,False
10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,17.7821,2896.812,81.9912,1,out and back,...,False,False,False,False,False,False,False,False,False,False
10236076,Horseshoe Lake Trail,Denali National Park,Denali National Park,Alaska,United States,16.2674,3379.614,119.7864,1,loop,...,False,False,False,False,False,False,False,False,False,False
10236082,Triple Lakes Trail,Denali National Park,Denali National Park,Alaska,United States,12.5935,29772.79,1124.712,5,out and back,...,False,False,False,False,False,False,False,False,False,False


## Prep data for machine learning

In [249]:
# Text columns that are removed before modelling.
bad_cols = ["name", "park_name", "city_name", "state_name", "country_name"]

# One-hot encode "route_type", then drop the unneeded text columns in one chain.
RF_trails_df = (
    pd.get_dummies(detailed_trails_df, columns=["route_type"])
    .drop(columns=bad_cols)
)

In [250]:
# Change Boolean values to numeric

# Labels of every column whose dtype is bool.
bool_cols = RF_trails_df.select_dtypes('bool').columns.tolist()

# Cast True/False to 1/0. A direct dtype cast is the idiomatic way to do this
# and does not depend on value-replacement/downcasting rules; 'int64' keeps the
# same integer dtype the replace-based version produced.
bool_trail_cols = RF_trails_df[bool_cols].astype('int64')
bool_trail_cols

Unnamed: 0_level_0,dogs-no,forest,river,views,waterfall,wild-flowers,wildlife,partially-paved,lake,kids,...,cross-country-skiing,fly-fishing,paddle-sports,skiing,bike-touring,whitewater-kayaking,rails-trails_y,ice-climbing,surfing,snowboarding
trail_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10020048,1,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10236086,1,1,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10267857,1,0,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10236076,1,1,0,1,0,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
10236082,1,0,0,1,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10008302,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10236001,1,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10258707,1,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
10014989,1,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [251]:
# Remove the original Boolean columns and re-attach their numeric versions.

# bool_trail_cols was sliced from RF_trails_df, so both frames carry the same
# trail_id index. Aligning column-wise on that index keeps exactly one output
# row per input row; a key-based merge would multiply rows if a trail_id ever
# appeared more than once.
RF_trails_df = pd.concat(
    [RF_trails_df.drop(columns=bool_cols), bool_trail_cols],
    axis=1,
)
RF_trails_df

Unnamed: 0_level_0,popularity,length,elevation_gain,difficulty_rating,avg_rating,num_reviews,route_type_loop,route_type_out and back,route_type_point to point,dogs-no,...,cross-country-skiing,fly-fishing,paddle-sports,skiing,bike-touring,whitewater-kayaking,rails-trails_y,ice-climbing,surfing,snowboarding
trail_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10020048,24.8931,15610.598,1161.8976,5,5.0,423,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
10236086,18.0311,6920.162,507.7968,3,4.5,260,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
10267857,17.7821,2896.812,81.9912,1,4.5,224,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
10236076,16.2674,3379.614,119.7864,1,4.5,237,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
10236082,12.5935,29772.790,1124.7120,5,4.5,110,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10008302,9.3861,20116.750,1105.8144,5,4.5,43,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
10236001,9.1555,28324.384,1171.9560,5,5.0,22,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
10258707,8.5066,321.868,3.9624,1,4.5,31,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
10014989,8.3240,19312.080,1670.9136,5,4.0,8,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [252]:
# Print the column names, non-null counts and dtypes of the modelling frame.
RF_trails_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3313 entries, 10020048 to 10259465
Data columns (total 57 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   popularity                 3313 non-null   float64
 1   length                     3313 non-null   float64
 2   elevation_gain             3313 non-null   float64
 3   difficulty_rating          3313 non-null   int64  
 4   avg_rating                 3313 non-null   float64
 5   num_reviews                3313 non-null   int64  
 6   route_type_loop            3313 non-null   uint8  
 7   route_type_out and back    3313 non-null   uint8  
 8   route_type_point to point  3313 non-null   uint8  
 9   dogs-no                    3313 non-null   int64  
 10  forest                     3313 non-null   int64  
 11  river                      3313 non-null   int64  
 12  views                      3313 non-null   int64  
 13  waterfall                  3313 non-n

In [235]:
# Open question: which fields should we exclude further from the Random Forest model below? Suggestion: drop popularity and num_reviews for this model.
# Keep avg_rating as the target variable (the Random Forest model will be used to find the features that most impact the rating).

### Random Forest Model

In [253]:
# Features: every column of the modelling frame except the target.
# drop() returns a new frame, so RF_trails_df itself is left untouched.
F = RF_trails_df.drop(columns='avg_rating')

# Target: the average rating, converted to string labels.
y = RF_trails_df['avg_rating'].astype(str)

F.head()

Unnamed: 0_level_0,popularity,length,elevation_gain,difficulty_rating,num_reviews,route_type_loop,route_type_out and back,route_type_point to point,dogs-no,forest,...,cross-country-skiing,fly-fishing,paddle-sports,skiing,bike-touring,whitewater-kayaking,rails-trails_y,ice-climbing,surfing,snowboarding
trail_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10020048,24.8931,15610.598,1161.8976,5,423,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
10236086,18.0311,6920.162,507.7968,3,260,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
10267857,17.7821,2896.812,81.9912,1,224,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10236076,16.2674,3379.614,119.7864,1,237,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
10236082,12.5935,29772.79,1124.712,5,110,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [254]:
# Describe F: per-column summary statistics (count, mean, std, min, quartiles, max)
F.describe()

Unnamed: 0,popularity,length,elevation_gain,difficulty_rating,num_reviews,route_type_loop,route_type_out and back,route_type_point to point,dogs-no,forest,...,cross-country-skiing,fly-fishing,paddle-sports,skiing,bike-touring,whitewater-kayaking,rails-trails_y,ice-climbing,surfing,snowboarding
count,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,...,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0
mean,8.953441,17676.848717,641.805943,3.167824,70.341986,0.328101,0.575309,0.096589,0.731361,0.606399,...,0.016299,0.025053,0.014187,0.022034,0.003018,0.007848,0.001509,0.001509,0.000906,0.00332
std,8.138323,25497.37664,901.506642,1.702752,184.11837,0.469593,0.494371,0.295442,0.443318,0.488622,...,0.126644,0.156309,0.118277,0.146818,0.054865,0.088253,0.038825,0.038825,0.030083,0.057535
min,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.7941,4506.152,116.7384,1.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,6.5731,10621.644,359.9688,3.0,17.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,11.2556,21404.222,833.9328,5.0,57.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,84.6229,529794.728,14029.944,7.0,3903.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [255]:
# Check the balance of our target values
# Change target variable to string to avoid "ValueError: Unknown label type: 'continuous'"
# NOTE(review): y is already cast to str in the cell where it is first defined,
# so this second cast looks redundant — confirm before removing it.
y = y.astype(str)
# Count how many trails fall under each rating label.
y.value_counts()

4.5    1522
4.0     805
5.0     539
3.5     212
0.0     116
3.0      80
2.5      18
2.0      12
1.0       6
1.5       3
Name: avg_rating, dtype: int64

In [256]:
# Display the target series (string rating labels indexed by trail_id).
y

trail_id
10020048    5.0
10236086    4.5
10267857    4.5
10236076    4.5
10236082    4.5
           ... 
10008302    4.5
10236001    5.0
10258707    4.5
10014989    4.0
10259465    4.5
Name: avg_rating, Length: 3313, dtype: object

In [257]:
# Drop every feature column that contains at least one missing value
# (axis=1 selects columns; dropna defaults to how='any').
F = F.dropna(axis=1)

In [258]:
# Print the column names, non-null counts and dtypes of the feature frame.
F.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3313 entries, 10020048 to 10259465
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   popularity                 3313 non-null   float64
 1   length                     3313 non-null   float64
 2   elevation_gain             3313 non-null   float64
 3   difficulty_rating          3313 non-null   int64  
 4   num_reviews                3313 non-null   int64  
 5   route_type_loop            3313 non-null   uint8  
 6   route_type_out and back    3313 non-null   uint8  
 7   route_type_point to point  3313 non-null   uint8  
 8   dogs-no                    3313 non-null   int64  
 9   forest                     3313 non-null   int64  
 10  river                      3313 non-null   int64  
 11  views                      3313 non-null   int64  
 12  waterfall                  3313 non-null   int64  
 13  wild-flowers               3313 non-n

In [259]:
# split into test and train
# (train_test_split is imported in the import cell at the top of the notebook)
# stratify=y keeps the proportion of each rating label the same in both splits;
# random_state pins the shuffle so the split is reproducible.
F_train, F_test, y_train, y_test = train_test_split(
    F,
    y,
    random_state=1,
    stratify=y,
)

In [260]:
# Standardise the features.
# (StandardScaler is imported in the import cell at the top of the notebook)
# The scaler is fitted on the training split only and then applied to both
# splits, so the test rows do not influence the fitted scaling parameters.
scaler = StandardScaler()
F_scaler = scaler.fit(F_train)  # fit() returns the fitted scaler itself

F_train_scaled = F_scaler.transform(F_train)
F_test_scaled = F_scaler.transform(F_test)

In [261]:
# Optional, left disabled: pin scikit-learn to version 1.0 in the kernel's environment.
# %pip install scikit-learn==1.0 -U

In [262]:
# Train the EasyEnsembleClassifier
# (EasyEnsembleClassifier is imported in the import cell at the top of the notebook)
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)

# fit the model
# NOTE(review): this fits on the unscaled F_train rather than F_train_scaled —
# confirm that is intended.
eec.fit(F_train, y_train)

In [263]:
# predict y on the held-out split (unscaled features, matching how eec was fitted)
y_pred = eec.predict(F_test)

# Calculate the balanced accuracy score
# (balanced_accuracy_score is already imported in the import cell at the top)
balanced_accuracy_score(y_test, y_pred)

0.3510535574656472

In [264]:
# Display the confusion matrix of true labels (y_test) against predictions (y_pred)
# (confusion_matrix is already imported in the import cell at the top)
confusion_matrix(y_test, y_pred)

array([[ 29,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   1,   0,   0,   0,   0,   0,   0],
       [  0,   0,   1,   0,   0,   0,   0,   0,   0,   0],
       [  0,   2,   1,   0,   0,   0,   0,   0,   0,   0],
       [  0,   1,   1,   0,   0,   1,   2,   0,   0,   0],
       [  0,   3,   1,   8,   0,   3,   3,   2,   0,   0],
       [  0,   2,   1,   2,   6,   5,  18,   9,  10,   0],
       [  0,   5,   2,  15,   1,   5,  41,  32,  92,   8],
       [  0,   2,   2,   7,   5,   4,  31,  61, 238,  31],
       [  0,   9,   6,  13,   1,   1,  13,  21,  39,  32]], dtype=int64)

In [265]:
# Print the imbalanced classification report
# (classification_report_imbalanced is already imported in the import cell at the top)
# The report is returned as text, so print() is used to render its layout.
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       1.00      1.00      1.00      1.00      1.00      1.00        29
        1.0       0.00      0.00      0.97      0.00      0.00      0.00         1
        1.5       0.07      1.00      0.98      0.12      0.99      0.98         1
        2.0       0.00      0.00      0.94      0.00      0.00      0.00         3
        2.5       0.00      0.00      0.98      0.00      0.00      0.00         5
        3.0       0.16      0.15      0.98      0.15      0.38      0.13        20
        3.5       0.17      0.34      0.88      0.22      0.55      0.28        53
        4.0       0.26      0.16      0.85      0.20      0.37      0.13       201
        4.5       0.63      0.62      0.69      0.63      0.65      0.43       381
        5.0       0.45      0.24      0.94      0.31      0.47      0.21       135

avg / total       0.47      0.43      0.80      0.44      0.55      0.32       829



### Random Forest

In [266]:
# create random forest model
# (RandomForestClassifier is imported in the import cell at the top of the notebook)
# 128 trees; random_state pins the randomness so repeated runs give the same forest.
rf_model = RandomForestClassifier(n_estimators=128, random_state=1)

In [267]:
# Fit the forest on the scaled training features; fit() returns the estimator
# itself, so rf_model refers to the fitted model afterwards.
rf_model.fit(F_train_scaled, y_train)

# Predict rating labels for the scaled test features.
predictions = rf_model.predict(F_test_scaled)

In [268]:
# Pair each importance score with its feature name, then order the pairs from
# most to least important (ties fall back to reverse name order).
importance_pairs = list(zip(rf_model.feature_importances_, F.columns))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.16209759424188222, 'popularity'),
 (0.1567557823461834, 'num_reviews'),
 (0.11809001667387133, 'elevation_gain'),
 (0.1065427764740739, 'length'),
 (0.03228904710632562, 'difficulty_rating'),
 (0.023216535667552315, 'forest'),
 (0.022093155975409215, 'wild-flowers'),
 (0.021342223299115046, 'trail-running'),
 (0.021025552577171702, 'wildlife'),
 (0.02051047326507865, 'river'),
 (0.020471341150392706, 'birding'),
 (0.018251392342259976, 'nature-trips'),
 (0.017748355893885816, 'route_type_out and back'),
 (0.016824314224268163, 'walking'),
 (0.01681725149412974, 'lake'),
 (0.016758933369321754, 'route_type_loop'),
 (0.014806019992423187, 'camping'),
 (0.014627053280390647, 'dogs-no'),
 (0.01401170775102364, 'kids'),
 (0.013451053820900172, 'backpacking'),
 (0.013416788041314662, 'waterfall'),
 (0.011876198994967505, 'views'),
 (0.011483978330802008, 'dogs'),
 (0.01099436820267897, 'dogs-leash'),
 (0.009963250433212927, 'horseback-riding'),
 (0.008726566423831341, 'route_type_point t