# Importing Dependencies

In [1]:
# import dependencies
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import re
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
import csv
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [2]:
# Load the detailed trails table and key it by trail id

detailed_trails_df = pd.read_csv('Resources/detailed_trails.csv').set_index('trail_id')
detailed_trails_df.head()


Unnamed: 0_level_0,name,park_name,city_name,state_name,country_name,popularity,length,elevation_gain,difficulty_rating,route_type,...,cross-country-skiing,fly-fishing,paddle-sports,skiing,bike-touring,whitewater-kayaking,rails-trails_y,ice-climbing,surfing,snowboarding
trail_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,24.8931,15610.598,1161.8976,5,out and back,...,False,False,False,False,False,False,False,False,False,False
10236086,Mount Healy Overlook Trail,Denali National Park,Denali National Park,Alaska,United States,18.0311,6920.162,507.7968,3,out and back,...,False,False,False,False,False,False,False,False,False,False
10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,17.7821,2896.812,81.9912,1,out and back,...,False,False,False,False,False,False,False,False,False,False
10236076,Horseshoe Lake Trail,Denali National Park,Denali National Park,Alaska,United States,16.2674,3379.614,119.7864,1,loop,...,False,False,False,False,False,False,False,False,False,False
10236082,Triple Lakes Trail,Denali National Park,Denali National Park,Alaska,United States,12.5935,29772.79,1124.712,5,out and back,...,False,False,False,False,False,False,False,False,False,False


## Prep data for machine learning

In [3]:
# One-hot encode "route_type".
# The dtype is pinned to uint8 because the get_dummies default differs between
# pandas releases (uint8 in 1.x, bool in 2.x). Left as bool, these indicator
# columns would also be picked up by the select_dtypes('bool') step that follows.
RF_trails_df = pd.get_dummies(detailed_trails_df, columns=["route_type"], dtype="uint8")

# remove columns that are not needed for machine learning:
bad_cols = ["name", "park_name", "city_name", "state_name", "country_name"]

# `columns=` already identifies the axis, so the redundant axis=1 is omitted
RF_trails_df = RF_trails_df.drop(columns=bad_cols)

In [4]:
# Change Boolean values to numeric

# get list of Boolean columns
bool_cols = list(RF_trails_df.select_dtypes('bool'))

# Convert the boolean flags to 1s and 0s with an explicit int64 cast.
# replace({True: 1, False: 0}) only yields integers because pandas silently
# downcasts the replaced result, which is deprecated in recent pandas releases.
bool_trail_cols = RF_trails_df[bool_cols].astype('int64')
bool_trail_cols

Unnamed: 0_level_0,dogs-no,forest,river,views,waterfall,wild-flowers,wildlife,partially-paved,lake,kids,...,cross-country-skiing,fly-fishing,paddle-sports,skiing,bike-touring,whitewater-kayaking,rails-trails_y,ice-climbing,surfing,snowboarding
trail_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10020048,1,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10236086,1,1,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10267857,1,0,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10236076,1,1,0,1,0,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
10236082,1,0,0,1,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10008302,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10236001,1,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10258707,1,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
10014989,1,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Swap the original boolean columns for their 0/1 versions (bool_trail_cols)
# and drop the popularity / num_reviews columns

RF_trails_df = (
    RF_trails_df
    .drop(columns=bool_cols)
    .drop(columns=['popularity', 'num_reviews'])
    .merge(bool_trail_cols, how='left', on='trail_id')
)
RF_trails_df

Unnamed: 0_level_0,length,elevation_gain,difficulty_rating,avg_rating,route_type_loop,route_type_out and back,route_type_point to point,dogs-no,forest,river,...,cross-country-skiing,fly-fishing,paddle-sports,skiing,bike-touring,whitewater-kayaking,rails-trails_y,ice-climbing,surfing,snowboarding
trail_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10020048,15610.598,1161.8976,5,5.0,0,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
10236086,6920.162,507.7968,3,4.5,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
10267857,2896.812,81.9912,1,4.5,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10236076,3379.614,119.7864,1,4.5,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
10236082,29772.790,1124.7120,5,4.5,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10008302,20116.750,1105.8144,5,4.5,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10236001,28324.384,1171.9560,5,5.0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10258707,321.868,3.9624,1,4.5,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10014989,19312.080,1670.9136,5,4.0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Print the index range, column dtypes and non-null counts of the prepared frame
RF_trails_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3313 entries, 10020048 to 10259465
Data columns (total 55 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   length                     3313 non-null   float64
 1   elevation_gain             3313 non-null   float64
 2   difficulty_rating          3313 non-null   int64  
 3   avg_rating                 3313 non-null   float64
 4   route_type_loop            3313 non-null   uint8  
 5   route_type_out and back    3313 non-null   uint8  
 6   route_type_point to point  3313 non-null   uint8  
 7   dogs-no                    3313 non-null   int64  
 8   forest                     3313 non-null   int64  
 9   river                      3313 non-null   int64  
 10  views                      3313 non-null   int64  
 11  waterfall                  3313 non-null   int64  
 12  wild-flowers               3313 non-null   int64  
 13  wildlife                   3313 non-n

In [7]:
# Open question: which fields should be further excluded from the following RF model? Popularity and number of reviews should be left out for this (both are dropped from RF_trails_df above).
# avg_rating should still be the target variable (the random forest model is used to find the features that most impact the rating).

### Random Forest Model

In [8]:
# Features: every column except the target
F = RF_trails_df.drop(columns='avg_rating')

# Target: avg_rating converted to string labels
y = RF_trails_df['avg_rating'].astype(str)
F.head()

Unnamed: 0_level_0,length,elevation_gain,difficulty_rating,route_type_loop,route_type_out and back,route_type_point to point,dogs-no,forest,river,views,...,cross-country-skiing,fly-fishing,paddle-sports,skiing,bike-touring,whitewater-kayaking,rails-trails_y,ice-climbing,surfing,snowboarding
trail_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10020048,15610.598,1161.8976,5,0,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
10236086,6920.162,507.7968,3,0,1,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
10267857,2896.812,81.9912,1,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
10236076,3379.614,119.7864,1,1,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
10236082,29772.79,1124.712,5,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for every feature column in F
F.describe()

Unnamed: 0,length,elevation_gain,difficulty_rating,route_type_loop,route_type_out and back,route_type_point to point,dogs-no,forest,river,views,...,cross-country-skiing,fly-fishing,paddle-sports,skiing,bike-touring,whitewater-kayaking,rails-trails_y,ice-climbing,surfing,snowboarding
count,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,...,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0
mean,17676.848717,641.805943,3.167824,0.328101,0.575309,0.096589,0.731361,0.606399,0.29188,0.897676,...,0.016299,0.025053,0.014187,0.022034,0.003018,0.007848,0.001509,0.001509,0.000906,0.00332
std,25497.37664,901.506642,1.702752,0.469593,0.494371,0.295442,0.443318,0.488622,0.454696,0.30312,...,0.126644,0.156309,0.118277,0.146818,0.054865,0.088253,0.038825,0.038825,0.030083,0.057535
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4506.152,116.7384,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,10621.644,359.9688,3.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,21404.222,833.9328,5.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,529794.728,14029.944,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Check the balance of our target values.
# The target is cast to string labels to avoid
# "ValueError: Unknown label type: 'continuous'".
# NOTE(review): y was already cast with astype(str) where it was first defined,
# so this cast looks redundant here - confirm before removing.
y = y.astype(str)
y.value_counts()

4.5    1522
4.0     805
5.0     539
3.5     212
0.0     116
3.0      80
2.5      18
2.0      12
1.0       6
1.5       3
Name: avg_rating, dtype: int64

In [11]:
# Display the target series (string rating labels indexed by trail_id)
y

trail_id
10020048    5.0
10236086    4.5
10267857    4.5
10236076    4.5
10236082    4.5
           ... 
10008302    4.5
10236001    5.0
10258707    4.5
10014989    4.0
10259465    4.5
Name: avg_rating, Length: 3313, dtype: object

In [12]:
# Discard any feature column that contains missing values
F = F.dropna(axis='columns')

In [13]:
# Print the index range, column dtypes and non-null counts of the feature frame
F.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3313 entries, 10020048 to 10259465
Data columns (total 54 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   length                     3313 non-null   float64
 1   elevation_gain             3313 non-null   float64
 2   difficulty_rating          3313 non-null   int64  
 3   route_type_loop            3313 non-null   uint8  
 4   route_type_out and back    3313 non-null   uint8  
 5   route_type_point to point  3313 non-null   uint8  
 6   dogs-no                    3313 non-null   int64  
 7   forest                     3313 non-null   int64  
 8   river                      3313 non-null   int64  
 9   views                      3313 non-null   int64  
 10  waterfall                  3313 non-null   int64  
 11  wild-flowers               3313 non-null   int64  
 12  wildlife                   3313 non-null   int64  
 13  partially-paved            3313 non-n

In [14]:
# Split features and target into train and test sets, stratified on the rating label
from sklearn.model_selection import train_test_split

F_train, F_test, y_train, y_test = train_test_split(
    F,
    y,
    stratify=y,
    random_state=1,
)

In [15]:
# Standardise the features; the scaler is fitted on the training split only
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
F_scaler = scaler.fit(F_train)

# Apply the fitted scaler to the training split first, then the test split
F_train_scaled, F_test_scaled = (
    F_scaler.transform(split) for split in (F_train, F_test)
)

In [16]:
# !pip install scikit-learn==1.0 -U

In [17]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

# 100 estimators with a fixed seed
eec = EasyEnsembleClassifier(random_state=1, n_estimators=100)

# fit on the unscaled training features (F_train_scaled is not used here)
eec.fit(F_train, y_train)

In [18]:
from sklearn.metrics import balanced_accuracy_score

# Predict rating labels for the test split
y_pred = eec.predict(F_test)

# Calculate the balanced accuracy score
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.08130306073581565

In [19]:
# Confusion matrix of the true test labels against the predicted labels
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 3,  0,  1,  2,  3,  0,  1,  0,  3, 16],
       [ 0,  0,  0,  0,  1,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  1,  0],
       [ 0,  0,  1,  0,  0,  0,  1,  0,  1,  0],
       [ 0,  1,  4,  0,  0,  0,  0,  0,  0,  0],
       [ 5,  0,  5,  2,  2,  0,  0,  0,  1,  5],
       [ 4,  1, 20, 11,  4,  1,  1,  1,  9,  1],
       [ 8,  5, 80, 37, 14,  3,  3,  1, 26, 24],
       [21, 12, 90, 67, 28,  4,  6, 10, 75, 68],
       [12,  2,  5, 14, 12,  1,  0,  5, 18, 66]], dtype=int64)

In [20]:
# Per-class report for imbalanced data
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.06      0.10      0.94      0.07      0.31      0.09        29
        1.0       0.00      0.00      0.97      0.00      0.00      0.00         1
        1.5       0.00      0.00      0.75      0.00      0.00      0.00         1
        2.0       0.00      0.00      0.84      0.00      0.00      0.00         3
        2.5       0.00      0.00      0.92      0.00      0.00      0.00         5
        3.0       0.00      0.00      0.99      0.00      0.00      0.00        20
        3.5       0.08      0.02      0.99      0.03      0.14      0.02        53
        4.0       0.06      0.00      0.97      0.01      0.07      0.00       201
        4.5       0.56      0.20      0.87      0.29      0.41      0.16       381
        5.0       0.37      0.49      0.84      0.42      0.64      0.39       135

avg / total       0.34      0.18      0.90      0.21      0.33      0.14       829



### Random Forest

In [21]:
# Build the random forest classifier: 128 trees, fixed seed
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=1, n_estimators=128)

In [22]:
# Fit on the scaled training features (fit returns the estimator itself,
# so rf_model keeps referring to the same, now fitted, object)
rf_model.fit(F_train_scaled, y_train)

# Predict labels for the scaled test features
predictions = rf_model.predict(F_test_scaled)

In [23]:
# Pair each importance score with its column name and list them highest first
sorted(
    zip(rf_model.feature_importances_, F.columns),
    reverse=True,
)

[(0.18521728978168356, 'elevation_gain'),
 (0.17597457247114823, 'length'),
 (0.04135260827566972, 'difficulty_rating'),
 (0.03490345999927995, 'forest'),
 (0.031949914298187644, 'wild-flowers'),
 (0.03128582735741272, 'trail-running'),
 (0.031017412227339874, 'wildlife'),
 (0.030817556293095363, 'river'),
 (0.029205568712195325, 'birding'),
 (0.027600093487355253, 'nature-trips'),
 (0.02569380398496491, 'lake'),
 (0.024038110756870377, 'route_type_out and back'),
 (0.02304569740926239, 'route_type_loop'),
 (0.021060331801123537, 'walking'),
 (0.020521770860742198, 'camping'),
 (0.019893232579628262, 'waterfall'),
 (0.017724583255013585, 'kids'),
 (0.017567238000615545, 'backpacking'),
 (0.017554387988897777, 'views'),
 (0.017111878177180618, 'dogs-no'),
 (0.015341633405919637, 'horseback-riding'),
 (0.014115657686430234, 'dogs'),
 (0.012184089340837718, 'dogs-leash'),
 (0.012139205137140858, 'route_type_point to point'),
 (0.010770655766678325, 'fishing'),
 (0.009180485950380218, 'hik

### 2nd attempt w/ fewer columns


In [24]:
# Keep only the first 15 columns of the feature importance list
cols = [
    'elevation_gain',
    'length',
    'difficulty_rating',
    'forest',
    'wild-flowers',
    'trail-running',
    'wildlife',
    'river',
    'birding',
    'nature-trips',
    'lake',
    'route_type_out and back',
    'route_type_loop',
    'walking',
    'camping',
]
F = F.loc[:, cols]
F

Unnamed: 0_level_0,elevation_gain,length,difficulty_rating,forest,wild-flowers,trail-running,wildlife,river,birding,nature-trips,lake,route_type_out and back,route_type_loop,walking,camping
trail_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
10020048,1161.8976,15610.598,5,1,1,1,1,1,1,1,0,1,0,0,1
10236086,507.7968,6920.162,3,1,1,0,1,0,1,1,0,1,0,1,1
10267857,81.9912,2896.812,1,0,0,0,1,0,0,0,0,1,0,1,0
10236076,119.7864,3379.614,1,1,1,1,1,0,1,1,1,0,1,1,0
10236082,1124.7120,29772.790,5,0,1,1,1,0,1,1,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10008302,1105.8144,20116.750,5,0,1,0,0,0,1,1,0,0,1,0,0
10236001,1171.9560,28324.384,5,0,0,0,1,0,0,0,0,1,0,0,1
10258707,3.9624,321.868,1,0,0,0,0,0,0,0,0,1,0,1,0
10014989,1670.9136,19312.080,5,0,0,0,1,0,0,0,0,1,0,0,0


In [25]:
# Re-split the reduced feature set into train and test sets, then re-fit the scaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

F_train, F_test, y_train, y_test = train_test_split(
    F,
    y,
    stratify=y,
    random_state=1,
)

# The scaler is fitted on the training split only
scaler = StandardScaler()
F_scaler = scaler.fit(F_train)

# Apply the fitted scaler to the training split first, then the test split
F_train_scaled, F_test_scaled = (
    F_scaler.transform(split) for split in (F_train, F_test)
)

In [26]:

# Train the EasyEnsembleClassifier on the reduced feature set
from imblearn.ensemble import EasyEnsembleClassifier

# 100 estimators with a fixed seed
eec = EasyEnsembleClassifier(random_state=1, n_estimators=100)

# fit on the unscaled training features (F_train_scaled is not used here)
eec.fit(F_train, y_train)

# predict rating labels for the test split
y_pred = eec.predict(F_test)

In [27]:
# Calculate the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.11161137925180864

In [28]:
# Confusion matrix of the true test labels against the predicted labels
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[17,  1,  1,  3,  0,  0,  0,  3,  3,  1],
       [ 0,  0,  0,  0,  0,  0,  1,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  1,  0,  0,  0],
       [ 0,  0,  1,  0,  0,  0,  0,  1,  0,  1],
       [ 0,  1,  3,  0,  0,  1,  0,  0,  0,  0],
       [ 7,  0,  4,  3,  0,  0,  0,  2,  1,  3],
       [ 7,  3, 21, 10,  2,  1,  0,  0,  8,  1],
       [14,  3, 77, 30,  5,  2,  8, 31, 16, 15],
       [49, 13, 77, 55, 16,  0, 12, 69, 50, 40],
       [47,  4,  4, 12,  4,  0,  0, 18, 13, 33]], dtype=int64)

In [29]:
# Per-class report for imbalanced data
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))


                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.12      0.59      0.84      0.20      0.70      0.48        29
        1.0       0.00      0.00      0.97      0.00      0.00      0.00         1
        1.5       0.00      0.00      0.77      0.00      0.00      0.00         1
        2.0       0.00      0.00      0.86      0.00      0.00      0.00         3
        2.5       0.00      0.00      0.97      0.00      0.00      0.00         5
        3.0       0.00      0.00      1.00      0.00      0.00      0.00        20
        3.5       0.00      0.00      0.97      0.00      0.00      0.00        53
        4.0       0.25      0.15      0.85      0.19      0.36      0.12       201
        4.5       0.55      0.13      0.91      0.21      0.35      0.11       381
        5.0       0.35      0.24      0.91      0.29      0.47      0.21       135

avg / total       0.37      0.16      0.90      0.20      0.35      0.13       829



In [30]:
### Random Forest (reduced feature set)

from sklearn.ensemble import RandomForestClassifier

# Build the random forest classifier: 128 trees, fixed seed
rf_model = RandomForestClassifier(random_state=1, n_estimators=128)

# Fit on the scaled training features (fit returns the estimator itself,
# so rf_model keeps referring to the same, now fitted, object)
rf_model.fit(F_train_scaled, y_train)

# Predict labels for the scaled test features
predictions = rf_model.predict(F_test_scaled)

# Pair each importance score with its column name and list them highest first
sorted(
    zip(rf_model.feature_importances_, F.columns),
    reverse=True,
)

[(0.28955681124513194, 'elevation_gain'),
 (0.2664465316967121, 'length'),
 (0.054487708763034264, 'difficulty_rating'),
 (0.040823236693372615, 'forest'),
 (0.03806620648668199, 'trail-running'),
 (0.03687929269638773, 'wild-flowers'),
 (0.036377964678524964, 'river'),
 (0.03519892995543863, 'wildlife'),
 (0.033800769084156315, 'birding'),
 (0.031809093117232096, 'nature-trips'),
 (0.029564309899595395, 'lake'),
 (0.028701562935453843, 'route_type_out and back'),
 (0.027343718797306148, 'camping'),
 (0.025963690545297395, 'route_type_loop'),
 (0.024980173405674613, 'walking')]

In [31]:
# Balanced accuracy of the random forest predictions on the test split
balanced_accuracy_score(y_true=y_test, y_pred=predictions)

0.12464456962123141