In [48]:
import pandas as pd
import numpy as np
import csv
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

## Biodiversity Data

### Biodiversity - Working Data Attempt 1

In [2]:
#Load the species data from the species.csv file:

biodiversity_data_to_load = "./Resources/species.csv"
bio_df = pd.read_csv(biodiversity_data_to_load, low_memory=False)
bio_df.head()

Unnamed: 0,Species ID,Park Name,Category,Order,Family,Scientific Name,Common Names,Record Status,Occurrence,Nativeness,Abundance,Seasonality,Conservation Status,Unnamed: 13
0,ACAD-1000,Acadia National Park,Mammal,Artiodactyla,Cervidae,Alces alces,Moose,Approved,Present,Native,Rare,Resident,,
1,ACAD-1001,Acadia National Park,Mammal,Artiodactyla,Cervidae,Odocoileus virginianus,"Northern White-Tailed Deer, Virginia Deer, Whi...",Approved,Present,Native,Abundant,,,
2,ACAD-1002,Acadia National Park,Mammal,Carnivora,Canidae,Canis latrans,"Coyote, Eastern Coyote",Approved,Present,Not Native,Common,,Species of Concern,
3,ACAD-1003,Acadia National Park,Mammal,Carnivora,Canidae,Canis lupus,"Eastern Timber Wolf, Gray Wolf, Timber Wolf",Approved,Not Confirmed,Native,,,Endangered,
4,ACAD-1004,Acadia National Park,Mammal,Carnivora,Canidae,Vulpes vulpes,"Black Fox, Cross Fox, Eastern Red Fox, Fox, Re...",Approved,Present,Unknown,Common,Breeder,,


In [3]:
# Get the information for the species_df:
bio_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119248 entries, 0 to 119247
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Species ID           119248 non-null  object
 1   Park Name            119248 non-null  object
 2   Category             119248 non-null  object
 3   Order                117776 non-null  object
 4   Family               117736 non-null  object
 5   Scientific Name      119248 non-null  object
 6   Common Names         119248 non-null  object
 7   Record Status        119248 non-null  object
 8   Occurrence           99106 non-null   object
 9   Nativeness           94203 non-null   object
 10  Abundance            76306 non-null   object
 11  Seasonality          20157 non-null   object
 12  Conservation Status  4718 non-null    object
 13  Unnamed: 13          5 non-null       object
dtypes: object(14)
memory usage: 12.7+ MB


In [4]:
# Creating new DataFrame with necessary data and leaving out unnecessary data
reduced_bio_df = bio_df[["Species ID", "Park Name", "Scientific Name",
                       "Occurrence", "Abundance", "Conservation Status"]].copy()
reduced_bio_df.head()

Unnamed: 0,Species ID,Park Name,Scientific Name,Occurrence,Abundance,Conservation Status
0,ACAD-1000,Acadia National Park,Alces alces,Present,Rare,
1,ACAD-1001,Acadia National Park,Odocoileus virginianus,Present,Abundant,
2,ACAD-1002,Acadia National Park,Canis latrans,Present,Common,Species of Concern
3,ACAD-1003,Acadia National Park,Canis lupus,Not Confirmed,,Endangered
4,ACAD-1004,Acadia National Park,Vulpes vulpes,Present,Common,


In [5]:
# Renaming Columns to standardize column names
renamed_bio_df = reduced_bio_df.rename(columns={"Species ID": "species_id",
                                        "Park Name": "park_name",
                                        "Scientific Name": "scientific_name",
                                        "Occurrence": "occurrence",
                                        "Abundance": "abundance",
                                        "Conservation Status": "conservation_status",
                                       })
renamed_bio_df.head()

Unnamed: 0,species_id,park_name,scientific_name,occurrence,abundance,conservation_status
0,ACAD-1000,Acadia National Park,Alces alces,Present,Rare,
1,ACAD-1001,Acadia National Park,Odocoileus virginianus,Present,Abundant,
2,ACAD-1002,Acadia National Park,Canis latrans,Present,Common,Species of Concern
3,ACAD-1003,Acadia National Park,Canis lupus,Not Confirmed,,Endangered
4,ACAD-1004,Acadia National Park,Vulpes vulpes,Present,Common,


In [6]:
# remove any rows with conservations status of extinct 
renamed_bio_df['conservation_status'].unique()

array([nan, 'Species of Concern', 'Endangered', 'In Recovery',
       'Threatened', 'Under Review', 'Proposed Threatened', 'Extinct',
       'Proposed Endangered', 'Resident', 'Breeder', 'Migratory'],
      dtype=object)

In [7]:
# Checking data count
renamed_bio_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119248 entries, 0 to 119247
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   species_id           119248 non-null  object
 1   park_name            119248 non-null  object
 2   scientific_name      119248 non-null  object
 3   occurrence           99106 non-null   object
 4   abundance            76306 non-null   object
 5   conservation_status  4718 non-null    object
dtypes: object(6)
memory usage: 5.5+ MB


In [8]:
# remove extinct from conservation status data (it is not relevant)
updated_bio_df = renamed_bio_df.loc[lambda renamed_bio_df: renamed_bio_df['conservation_status'] != 'Extinct']
print(updated_bio_df['conservation_status'].unique())
updated_bio_df.head()

[nan 'Species of Concern' 'Endangered' 'In Recovery' 'Threatened'
 'Under Review' 'Proposed Threatened' 'Proposed Endangered' 'Resident'
 'Breeder' 'Migratory']


Unnamed: 0,species_id,park_name,scientific_name,occurrence,abundance,conservation_status
0,ACAD-1000,Acadia National Park,Alces alces,Present,Rare,
1,ACAD-1001,Acadia National Park,Odocoileus virginianus,Present,Abundant,
2,ACAD-1002,Acadia National Park,Canis latrans,Present,Common,Species of Concern
3,ACAD-1003,Acadia National Park,Canis lupus,Not Confirmed,,Endangered
4,ACAD-1004,Acadia National Park,Vulpes vulpes,Present,Common,


In [9]:
# Checking data count
updated_bio_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119247 entries, 0 to 119247
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   species_id           119247 non-null  object
 1   park_name            119247 non-null  object
 2   scientific_name      119247 non-null  object
 3   occurrence           99105 non-null   object
 4   abundance            76306 non-null   object
 5   conservation_status  4717 non-null    object
dtypes: object(6)
memory usage: 6.4+ MB


In [10]:
# Remove any words after the work "Park" in park_names
p = '(.*Park)'
park_name_bio = updated_bio_df["park_name"].str.extract(p)

#Checking the regex function
park_name_bio.value_counts()

Great Smoky Mountains National Park           6623
Redwood National Park                         6310
Shenandoah National Park                      4655
Death Valley National Park                    4439
Yellowstone National Park                     3966
Crater Lake National Park                     3760
North Cascades National Park                  3363
Hawaii Volcanoes National Park                3298
Rocky Mountain National Park                  3152
Great Basin National Park                     2653
Grand Canyon National Park                    2622
Haleakala National Park                       2580
Glacier National Park                         2556
Mammoth Cave National Park                    2499
Congaree National Park                        2321
Joshua Tree National Park                     2294
Big Bend National Park                        2269
Yosemite National Park                        2088
Everglades National Park                      2084
Grand Teton National Park      

In [11]:
# exchange the new park_name column in the updated_bio_df
updated_bio_df["park_name"] = park_name_bio.values
updated_bio_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,species_id,park_name,scientific_name,occurrence,abundance,conservation_status
0,ACAD-1000,Acadia National Park,Alces alces,Present,Rare,
1,ACAD-1001,Acadia National Park,Odocoileus virginianus,Present,Abundant,
2,ACAD-1002,Acadia National Park,Canis latrans,Present,Common,Species of Concern
3,ACAD-1003,Acadia National Park,Canis lupus,Not Confirmed,,Endangered
4,ACAD-1004,Acadia National Park,Vulpes vulpes,Present,Common,
...,...,...,...,...,...,...
119243,ZION-2791,Zion National Park,Solanum triflorum,Present,Uncommon,
119244,ZION-2792,Zion National Park,Vitis arizonica,Present,Uncommon,
119245,ZION-2793,Zion National Park,Vitis vinifera,Present,Uncommon,
119246,ZION-2794,Zion National Park,Larrea tridentata,Present,Rare,


In [12]:
# Create a variable that will store the counts of each species per park:
bio_count = updated_bio_df.groupby(["park_name"]).count()["species_id"]
print(bio_count)

park_name
Acadia National Park                          1709
Arches National Park                          1048
Badlands National Park                        1389
Big Bend National Park                        2269
Biscayne National Park                        1726
Black Canyon of the Gunnison National Park    1106
Bryce Canyon National Park                    1286
Canyonlands National Park                     1223
Capitol Reef National Park                    1566
Carlsbad Caverns National Park                1536
Channel Islands National Park                 1885
Congaree National Park                        2321
Crater Lake National Park                     3760
Cuyahoga Valley National Park                 1940
Death Valley National Park                    4439
Denali National Park                          1320
Dry Tortugas National Park                     848
Everglades National Park                      2084
Gates Of The Arctic National Park             1353
Glacier Bay National 

#### NOTE: Biodiversity data has been cleaned and put into updated_bio_df, 
#### NOTE: A variable is created to hold count of species per park (bio_count)

### Biodiversity Count Merge

#### Add in trails data below for merge:

In [13]:
# Load the trail data from trail_data.csv:

trails_data_to_load = "./Resources/trails_final.csv"
trails_final_df = pd.read_csv(trails_data_to_load, low_memory=False, encoding="utf-8")
trails_final_df.head(5)

Unnamed: 0,trail_id,name,park_name,city_name,state_name,country_name,_geoloc,popularity,length,elevation_gain,difficulty_rating,route_type,visitor_usage,avg_rating,num_reviews,features,activities,units
0,10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18852, 'lng': -149.63156}",24.8931,15610.598,1161.8976,5,out and back,3.0,5.0,423,"['dogs-no', 'forest', 'river', 'views', 'water...","['birding', 'camping', 'hiking', 'nature-trips...",i
1,10236086,Mount Healy Overlook Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73049, 'lng': -148.91968}",18.0311,6920.162,507.7968,3,out and back,1.0,4.5,260,"['dogs-no', 'forest', 'views', 'wild-flowers',...","['birding', 'camping', 'hiking', 'nature-trips...",i
2,10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18879, 'lng': -149.631}",17.7821,2896.812,81.9912,1,out and back,3.0,4.5,224,"['dogs-no', 'partially-paved', 'views', 'wildl...","['hiking', 'walking']",i
3,10236076,Horseshoe Lake Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73661, 'lng': -148.915}",16.2674,3379.614,119.7864,1,loop,2.0,4.5,237,"['dogs-no', 'forest', 'lake', 'kids', 'views',...","['birding', 'hiking', 'nature-trips', 'trail-r...",i
4,10236082,Triple Lakes Trail,Denali National Park,Denali National Park,Alaska,United States,"{'lat': 63.73319, 'lng': -148.89682}",12.5935,29772.79,1124.712,5,out and back,1.0,4.5,110,"['dogs-no', 'lake', 'views', 'wild-flowers', '...","['birding', 'fishing', 'hiking', 'nature-trips...",i


In [14]:
# Create a value for the number of trails per park
# Sequoia and Kings Canyon National Park should equal 90 + 61):

trail_count = trails_final_df.groupby(["park_name"]).count()["trail_id"].sort_values(ascending=False)
trail_count

park_name
Great Smoky Mountains National Park           293
Yosemite National Park                        242
Yellowstone National Park                     228
Rocky Mountain National Park                  207
Shenandoah National Park                      187
Acadia National Park                          179
Olympic National Park                         170
Sequoia and Kings Canyon National Park        151
Glacier National Park                         132
Mount Rainier National Park                   130
Joshua Tree National Park                     108
Grand Teton National Park                     102
Grand Canyon National Park                     99
Zion National Park                             94
Death Valley National Park                     79
Canyonlands National Park                      77
Big Bend National Park                         70
Saguaro National Park                          60
Lassen Volcanic National Park                  59
Cuyahoga Valley National Park           

In [15]:
# Create a value for the average popularity score for all trails within a certain park:

trail_pop_avg = trails_final_df.groupby(["park_name"]).mean()["popularity"].sort_values(ascending=False)
print(trail_pop_avg)

park_name
Kenai Fjords National Park                    21.337600
Bryce Canyon National Park                    16.617903
Haleakala National Park                       13.723586
Arches National Park                          13.462423
Badlands National Park                        13.141040
Rocky Mountain National Park                  13.007846
Crater Lake National Park                     12.430750
Zion National Park                            12.027557
Mount Rainier National Park                   11.726444
Glacier National Park                         10.061925
Grand Canyon National Park                     9.716001
Pinnacles National Park                        9.421904
Olympic National Park                          9.389508
North Cascades National Park                   9.374660
Grand Teton National Park                      9.373725
Sequoia and Kings Canyon National Park         9.243440
Yosemite National Park                         9.235686
Capitol Reef National Park            

In [16]:
# Load the trail count pop data from trail_count_pop.csv:

trail_count_pop_to_load = "./Resources/trail_count_pop.csv"
trail_count_pop_df = pd.read_csv(trail_count_pop_to_load, low_memory=False, encoding="utf-8")
trail_count_pop_df.head(5)

Unnamed: 0,park_name,trail_count,avg_popularity
0,Acadia National Park,179,8.37592
1,Arches National Park,48,13.462423
2,Badlands National Park,10,13.14104
3,Big Bend National Park,70,7.17638
4,Biscayne National Park,4,4.0552


#### Merge of trail_count, trail_pop_avg, bio_count DataFrames

In [17]:
# Create a new df that contains the count of trails, the average popularity of each park, and species count per park:

park_bio_df = pd.DataFrame({
    "trail_count": trail_count,
    "avg_popularity": trail_pop_avg,
    "species_count" : bio_count,
})

#Reset the index so the park name is no longer the index:

park_bio_df.reset_index(inplace=True)
park_bio_df = park_bio_df.rename(columns = {'index':'park_name'})

park_bio_df

Unnamed: 0,park_name,trail_count,avg_popularity,species_count
0,Acadia National Park,179.0,8.375920,1709.0
1,Arches National Park,48.0,13.462423,1048.0
2,Badlands National Park,10.0,13.141040,1389.0
3,Big Bend National Park,70.0,7.176380,2269.0
4,Biscayne National Park,4.0,4.055200,1726.0
...,...,...,...,...
57,Wolf Trap National Park,1.0,7.111900,
58,Wrangell - St Elias National Park,,,1796.0
59,Yellowstone National Park,228.0,7.006884,3966.0
60,Yosemite National Park,242.0,9.235686,2088.0


### Merge parks df with park bio data

In [18]:
# Load the park final data from parks_final.csv:

parks_final_data_to_load = "./Resources/parks_final.csv"
parks_final_df = pd.read_csv(parks_final_data_to_load, low_memory=False, encoding="utf-8")
parks_final_df.head(5)

Unnamed: 0,park_code,park_name,state,acres,latitude,longitude
0,ACAD,Acadia National Park,ME,47390,44.35,-68.21
1,ARCH,Arches National Park,UT,76519,38.68,-109.57
2,BADL,Badlands National Park,SD,242756,43.75,-102.5
3,BIBE,Big Bend National Park,TX,801163,29.25,-103.25
4,BISC,Biscayne National Park,FL,172924,25.65,-80.08


In [19]:
#Create a dataframe that populates the parks_df with trail count and popularity data from the trails df:
merged_parks_bio_df = pd.merge(parks_final_df, park_bio_df, on="park_name", how="left")
pd.options.display.max_rows = 100
merged_parks_bio_df

Unnamed: 0,park_code,park_name,state,acres,latitude,longitude,trail_count,avg_popularity,species_count
0,ACAD,Acadia National Park,ME,47390,44.35,-68.21,179.0,8.37592,1709.0
1,ARCH,Arches National Park,UT,76519,38.68,-109.57,48.0,13.462423,1048.0
2,BADL,Badlands National Park,SD,242756,43.75,-102.5,10.0,13.14104,1389.0
3,BIBE,Big Bend National Park,TX,801163,29.25,-103.25,70.0,7.17638,2269.0
4,BISC,Biscayne National Park,FL,172924,25.65,-80.08,4.0,4.0552,1726.0
5,BLCA,Black Canyon of the Gunnison National Park,CO,32950,38.57,-107.72,24.0,7.690121,1106.0
6,BRCA,Bryce Canyon National Park,UT,35835,37.57,-112.18,31.0,16.617903,1286.0
7,CANY,Canyonlands National Park,UT,337598,38.2,-109.93,77.0,8.143584,1223.0
8,CARE,Capitol Reef National Park,UT,241904,38.2,-111.17,42.0,9.006231,1566.0
9,CAVE,Carlsbad Caverns National Park,NM,46766,32.17,-104.44,8.0,5.329462,1536.0


In [20]:
# Formatting the DataFrame
merged_parks_bio_df = merged_parks_bio_df.dropna(axis=0)
merged_parks_bio_df["trail_count"] = merged_parks_bio_df["trail_count"].map("{:,.0f}".format)
merged_parks_bio_df["avg_popularity"] = merged_parks_bio_df["avg_popularity"].map("{:.3f}".format)
merged_parks_bio_df["species_count"] = merged_parks_bio_df["species_count"].map("{:.0f}".format)
merged_parks_bio_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,park_code,park_name,state,acres,latitude,longitude,trail_count,avg_popularity,species_count
0,ACAD,Acadia National Park,ME,47390,44.35,-68.21,179,8.376,1709
1,ARCH,Arches National Park,UT,76519,38.68,-109.57,48,13.462,1048
2,BADL,Badlands National Park,SD,242756,43.75,-102.5,10,13.141,1389
3,BIBE,Big Bend National Park,TX,801163,29.25,-103.25,70,7.176,2269
4,BISC,Biscayne National Park,FL,172924,25.65,-80.08,4,4.055,1726
5,BLCA,Black Canyon of the Gunnison National Park,CO,32950,38.57,-107.72,24,7.69,1106
6,BRCA,Bryce Canyon National Park,UT,35835,37.57,-112.18,31,16.618,1286
7,CANY,Canyonlands National Park,UT,337598,38.2,-109.93,77,8.144,1223
8,CARE,Capitol Reef National Park,UT,241904,38.2,-111.17,42,9.006,1566
9,CAVE,Carlsbad Caverns National Park,NM,46766,32.17,-104.44,8,5.329,1536


In [21]:
# Write parks pop bio df to csv
merged_parks_bio_data_to_load = "./Resources/parks_pop_bio.csv"
merged_parks_bio_df.to_csv(merged_parks_bio_data_to_load, index=False)

### Biodiversity - Working Data Attempt 2

In [22]:
# Show DataFrame
bio_df.head()

Unnamed: 0,Species ID,Park Name,Category,Order,Family,Scientific Name,Common Names,Record Status,Occurrence,Nativeness,Abundance,Seasonality,Conservation Status,Unnamed: 13
0,ACAD-1000,Acadia National Park,Mammal,Artiodactyla,Cervidae,Alces alces,Moose,Approved,Present,Native,Rare,Resident,,
1,ACAD-1001,Acadia National Park,Mammal,Artiodactyla,Cervidae,Odocoileus virginianus,"Northern White-Tailed Deer, Virginia Deer, Whi...",Approved,Present,Native,Abundant,,,
2,ACAD-1002,Acadia National Park,Mammal,Carnivora,Canidae,Canis latrans,"Coyote, Eastern Coyote",Approved,Present,Not Native,Common,,Species of Concern,
3,ACAD-1003,Acadia National Park,Mammal,Carnivora,Canidae,Canis lupus,"Eastern Timber Wolf, Gray Wolf, Timber Wolf",Approved,Not Confirmed,Native,,,Endangered,
4,ACAD-1004,Acadia National Park,Mammal,Carnivora,Canidae,Vulpes vulpes,"Black Fox, Cross Fox, Eastern Red Fox, Fox, Re...",Approved,Present,Unknown,Common,Breeder,,


In [23]:
# Show Data in DataFrame
bio_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119248 entries, 0 to 119247
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Species ID           119248 non-null  object
 1   Park Name            119248 non-null  object
 2   Category             119248 non-null  object
 3   Order                117776 non-null  object
 4   Family               117736 non-null  object
 5   Scientific Name      119248 non-null  object
 6   Common Names         119248 non-null  object
 7   Record Status        119248 non-null  object
 8   Occurrence           99106 non-null   object
 9   Nativeness           94203 non-null   object
 10  Abundance            76306 non-null   object
 11  Seasonality          20157 non-null   object
 12  Conservation Status  4718 non-null    object
 13  Unnamed: 13          5 non-null       object
dtypes: object(14)
memory usage: 12.7+ MB


In [24]:
# Show an array of Occurrence values
bio_df["Occurrence"].unique()

array(['Present', 'Not Confirmed', 'Not Present (Historical Report)',
       'Not Present (False Report)', 'Approved', nan, 'Not Present',
       'In Review'], dtype=object)

In [25]:
# Rework DataFrame with needed inforamation (and remove not needed information)
reduced_bio_df = bio_df[["Species ID", "Park Name", "Category", "Scientific Name",
                       "Occurrence", "Abundance", "Conservation Status"]].copy()
reduced_bio_df.head()

Unnamed: 0,Species ID,Park Name,Category,Scientific Name,Occurrence,Abundance,Conservation Status
0,ACAD-1000,Acadia National Park,Mammal,Alces alces,Present,Rare,
1,ACAD-1001,Acadia National Park,Mammal,Odocoileus virginianus,Present,Abundant,
2,ACAD-1002,Acadia National Park,Mammal,Canis latrans,Present,Common,Species of Concern
3,ACAD-1003,Acadia National Park,Mammal,Canis lupus,Not Confirmed,,Endangered
4,ACAD-1004,Acadia National Park,Mammal,Vulpes vulpes,Present,Common,


In [26]:
# Standardize column names
renamed_bio_df = reduced_bio_df.rename(columns={"Species ID": "species_id",
                                        "Park Name": "park_name",
                                        "Category": "category",       
                                        "Scientific Name": "scientific_name",
                                        "Occurrence": "occurrence",
                                        "Abundance": "abundance",
                                        "Conservation Status": "conservation_status",
                                       })
renamed_bio_df.head()

Unnamed: 0,species_id,park_name,category,scientific_name,occurrence,abundance,conservation_status
0,ACAD-1000,Acadia National Park,Mammal,Alces alces,Present,Rare,
1,ACAD-1001,Acadia National Park,Mammal,Odocoileus virginianus,Present,Abundant,
2,ACAD-1002,Acadia National Park,Mammal,Canis latrans,Present,Common,Species of Concern
3,ACAD-1003,Acadia National Park,Mammal,Canis lupus,Not Confirmed,,Endangered
4,ACAD-1004,Acadia National Park,Mammal,Vulpes vulpes,Present,Common,


In [27]:
# Replace NaN values with "None"
renamed_bio_df = renamed_bio_df.replace(np.nan, 'None')
renamed_bio_df

Unnamed: 0,species_id,park_name,category,scientific_name,occurrence,abundance,conservation_status
0,ACAD-1000,Acadia National Park,Mammal,Alces alces,Present,Rare,
1,ACAD-1001,Acadia National Park,Mammal,Odocoileus virginianus,Present,Abundant,
2,ACAD-1002,Acadia National Park,Mammal,Canis latrans,Present,Common,Species of Concern
3,ACAD-1003,Acadia National Park,Mammal,Canis lupus,Not Confirmed,,Endangered
4,ACAD-1004,Acadia National Park,Mammal,Vulpes vulpes,Present,Common,
...,...,...,...,...,...,...,...
119243,ZION-2791,Zion National Park,Vascular Plant,Solanum triflorum,Present,Uncommon,
119244,ZION-2792,Zion National Park,Vascular Plant,Vitis arizonica,Present,Uncommon,
119245,ZION-2793,Zion National Park,Vascular Plant,Vitis vinifera,Present,Uncommon,
119246,ZION-2794,Zion National Park,Vascular Plant,Larrea tridentata,Present,Rare,


In [28]:
# remove "not present" in occurrence column
working_bio_df = renamed_bio_df.loc[renamed_bio_df['occurrence'].isin(['Present', 'Not Confirmed', 'Approved', 'None', 'In Review'])]
print(working_bio_df['occurrence'].unique())
working_bio_df

['Present' 'Not Confirmed' 'Approved' 'None' 'In Review']


Unnamed: 0,species_id,park_name,category,scientific_name,occurrence,abundance,conservation_status
0,ACAD-1000,Acadia National Park,Mammal,Alces alces,Present,Rare,
1,ACAD-1001,Acadia National Park,Mammal,Odocoileus virginianus,Present,Abundant,
2,ACAD-1002,Acadia National Park,Mammal,Canis latrans,Present,Common,Species of Concern
3,ACAD-1003,Acadia National Park,Mammal,Canis lupus,Not Confirmed,,Endangered
4,ACAD-1004,Acadia National Park,Mammal,Vulpes vulpes,Present,Common,
...,...,...,...,...,...,...,...
119243,ZION-2791,Zion National Park,Vascular Plant,Solanum triflorum,Present,Uncommon,
119244,ZION-2792,Zion National Park,Vascular Plant,Vitis arizonica,Present,Uncommon,
119245,ZION-2793,Zion National Park,Vascular Plant,Vitis vinifera,Present,Uncommon,
119246,ZION-2794,Zion National Park,Vascular Plant,Larrea tridentata,Present,Rare,


In [29]:
# Show Values
working_bio_df["occurrence"].value_counts()

Present          83278
None             20142
Not Confirmed    11958
Approved            51
In Review            6
Name: occurrence, dtype: int64

In [30]:
# Show an array of Abundance values
working_bio_df["abundance"].unique()

array(['Rare', 'Abundant', 'Common', 'None', 'Uncommon', 'Occasional',
       'Unknown', 'Not Native', 'Native'], dtype=object)

In [31]:
# Categorize Abundance based on rare and not rare (Rare/Uncommon = FALSE, Rest = TRUE)
# This will encode into 1's and 0's for machine learning in future
# You can use this pseudocode for future column cleaning
working_bio_df['abundance'] = working_bio_df['abundance'].replace(['Rare','Uncommon'],'FALSE')
working_bio_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,species_id,park_name,category,scientific_name,occurrence,abundance,conservation_status
0,ACAD-1000,Acadia National Park,Mammal,Alces alces,Present,FALSE,
1,ACAD-1001,Acadia National Park,Mammal,Odocoileus virginianus,Present,Abundant,
2,ACAD-1002,Acadia National Park,Mammal,Canis latrans,Present,Common,Species of Concern
3,ACAD-1003,Acadia National Park,Mammal,Canis lupus,Not Confirmed,,Endangered
4,ACAD-1004,Acadia National Park,Mammal,Vulpes vulpes,Present,Common,
...,...,...,...,...,...,...,...
119243,ZION-2791,Zion National Park,Vascular Plant,Solanum triflorum,Present,FALSE,
119244,ZION-2792,Zion National Park,Vascular Plant,Vitis arizonica,Present,FALSE,
119245,ZION-2793,Zion National Park,Vascular Plant,Vitis vinifera,Present,FALSE,
119246,ZION-2794,Zion National Park,Vascular Plant,Larrea tridentata,Present,FALSE,


In [32]:
# Categorize Abundance based on rare and not rare (Rare/Uncommon = FALSE, Rest = TRUE)
# This will encode into 1's and 0's for machine learning in future
# You can use this pseudocode for future column cleaning

working_bio_df['abundance'] = working_bio_df['abundance'].replace(['Abundant', 'Common', 'None', 'Occasional',
       'Unknown', 'Not Native', 'Native'],'TRUE')
working_bio_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,species_id,park_name,category,scientific_name,occurrence,abundance,conservation_status
0,ACAD-1000,Acadia National Park,Mammal,Alces alces,Present,FALSE,
1,ACAD-1001,Acadia National Park,Mammal,Odocoileus virginianus,Present,TRUE,
2,ACAD-1002,Acadia National Park,Mammal,Canis latrans,Present,TRUE,Species of Concern
3,ACAD-1003,Acadia National Park,Mammal,Canis lupus,Not Confirmed,TRUE,Endangered
4,ACAD-1004,Acadia National Park,Mammal,Vulpes vulpes,Present,TRUE,
...,...,...,...,...,...,...,...
119243,ZION-2791,Zion National Park,Vascular Plant,Solanum triflorum,Present,FALSE,
119244,ZION-2792,Zion National Park,Vascular Plant,Vitis arizonica,Present,FALSE,
119245,ZION-2793,Zion National Park,Vascular Plant,Vitis vinifera,Present,FALSE,
119246,ZION-2794,Zion National Park,Vascular Plant,Larrea tridentata,Present,FALSE,


In [33]:
# Show data values
working_bio_df["conservation_status"].value_counts()

None                   111023
Species of Concern       3665
Endangered                295
Under Review              175
Threatened                165
In Recovery                74
Proposed Endangered        18
Breeder                     7
Proposed Threatened         6
Resident                    5
Migratory                   2
Name: conservation_status, dtype: int64

In [34]:
# Show an array of Conservation Status values
working_bio_df["conservation_status"].unique()

array(['None', 'Species of Concern', 'Endangered', 'In Recovery',
       'Threatened', 'Under Review', 'Proposed Threatened',
       'Proposed Endangered', 'Resident', 'Breeder', 'Migratory'],
      dtype=object)

In [35]:
# Categorize Conservation based on having a status vs. None
# This will encode into 1's and 0's for machine learning in future
# You can use this pseudocode for future column cleaning
working_bio_df['conservation_status'] = working_bio_df['conservation_status'].replace(['Species of Concern', 'Endangered', 'In Recovery',
       'Threatened', 'Under Review', 'Proposed Threatened','Proposed Endangered', 'Resident', 'Breeder', 'Migratory'],'TRUE')

working_bio_df['conservation_status'] = working_bio_df['conservation_status'].replace(['None'],'FALSE')
working_bio_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,species_id,park_name,category,scientific_name,occurrence,abundance,conservation_status
0,ACAD-1000,Acadia National Park,Mammal,Alces alces,Present,FALSE,FALSE
1,ACAD-1001,Acadia National Park,Mammal,Odocoileus virginianus,Present,TRUE,FALSE
2,ACAD-1002,Acadia National Park,Mammal,Canis latrans,Present,TRUE,TRUE
3,ACAD-1003,Acadia National Park,Mammal,Canis lupus,Not Confirmed,TRUE,TRUE
4,ACAD-1004,Acadia National Park,Mammal,Vulpes vulpes,Present,TRUE,FALSE
...,...,...,...,...,...,...,...
119243,ZION-2791,Zion National Park,Vascular Plant,Solanum triflorum,Present,FALSE,FALSE
119244,ZION-2792,Zion National Park,Vascular Plant,Vitis arizonica,Present,FALSE,FALSE
119245,ZION-2793,Zion National Park,Vascular Plant,Vitis vinifera,Present,FALSE,FALSE
119246,ZION-2794,Zion National Park,Vascular Plant,Larrea tridentata,Present,FALSE,FALSE


In [36]:
# Write working_bio_df to CSV:

merged_data_to_load = "./Resources/working_bio_df.csv"
working_bio_df.to_csv(merged_data_to_load, index=False)

## Grouping Bio and Trails Data for Machine Learning

### Add aggregated fields from the working_bio_df to trails_final_df

In [37]:
print(working_bio_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 115435 entries, 0 to 119247
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   species_id           115435 non-null  object
 1   park_name            115435 non-null  object
 2   category             115435 non-null  object
 3   scientific_name      115435 non-null  object
 4   occurrence           115435 non-null  object
 5   abundance            115435 non-null  object
 6   conservation_status  115435 non-null  object
dtypes: object(7)
memory usage: 7.0+ MB
None


In [38]:
# Show an array of various column values
print(working_bio_df["species_id"].unique()) # not needed in final df
print(working_bio_df["park_name"].unique()) # group by park
print(working_bio_df["category"].unique()) # count category
print(working_bio_df["scientific_name"].unique()) # count names
print(working_bio_df["occurrence"].unique()) # will be deleted
print(working_bio_df["abundance"].unique()) # column has been cleaned for ML
print(working_bio_df["conservation_status"].unique()) # column has been cleaned for ML

['ACAD-1000' 'ACAD-1001' 'ACAD-1002' ... 'ZION-2793' 'ZION-2794'
 'ZION-2795']
['Acadia National Park' 'Arches National Park' 'Badlands National Park'
 'Big Bend National Park' 'Biscayne National Park'
 'Black Canyon of the Gunnison National Park' 'Bryce Canyon National Park'
 'Canyonlands National Park' 'Capitol Reef National Park'
 'Carlsbad Caverns National Park' 'Channel Islands National Park'
 'Congaree National Park' 'Crater Lake National Park'
 'Cuyahoga Valley National Park' 'Denali National Park and Preserve'
 'Death Valley National Park' 'Dry Tortugas National Park'
 'Everglades National Park'
 'Gates Of The Arctic National Park and Preserve' 'Glacier National Park'
 'Glacier Bay National Park and Preserve' 'Great Basin National Park'
 'Grand Canyon National Park'
 'Great Sand Dunes National Park and Preserve'
 'Great Smoky Mountains National Park' 'Grand Teton National Park'
 'Guadalupe Mountains National Park' 'Haleakala National Park'
 'Hawaii Volcanoes National Park' 'Hot

In [39]:
# import csv from sql as dataframes

file_path1 = "./Resources/bio_category.csv"
bio_category_df = pd.read_csv(file_path1, low_memory=False)
bio_category_df.head()


Unnamed: 0,park_name,category,count
0,Acadia National Park,Bird,331
1,Acadia National Park,Amphibian,12
2,Acadia National Park,Mammal,48
3,Acadia National Park,Fish,34
4,Acadia National Park,Reptile,10


In [40]:
# import csv from sql as dataframes

file_path2 = "./Resources/bio_count_per_park.csv"
bio_count_per_park_df = pd.read_csv(file_path2, low_memory=False)
bio_count_per_park_df.head()

Unnamed: 0,park_name,count
0,Acadia National Park,1572
1,Arches National Park,1006
2,Badlands National Park,1365
3,Big Bend National Park,2258
4,Biscayne National Park,1590


In [41]:
# create new dataframe for count of bio categories
# count_bio_categories_df = pd.DataFrame(columns=('park_name',
#                                                 'Algae',
#                                                 'Amphibian',
#                                                 'Bird',
#                                                 'Crab/Lobster/Shrimp',
#                                                 'Fish',
#                                                 'Fungi',
#                                                 'Insect',
#                                                 'Invertebrate',
#                                                 'Mammal',
#                                                 'Nonvascular Plant',
#                                                 'Reptile',
#                                                 'Slug/Snail',
#                                                 'Spider/Scorpion',
#                                                 'Vascular Plant'))                 

# #count_bio_categories_df

# count_bio_categories_df.append(bio_category_df)

# count_bio_categories_df.head() DIDNT WORK ############

# create new dataframe for count of bio categories

count_bio_categories_df = bio_category_df.groupby(['park_name','category']).sum(['category'])
count_bio_categories_df

Unnamed: 0_level_0,Unnamed: 1_level_0,count
park_name,category,Unnamed: 2_level_1
Acadia National Park,Amphibian,12
Acadia National Park,Bird,331
Acadia National Park,Fish,34
Acadia National Park,Mammal,48
Acadia National Park,Reptile,10
...,...,...
Zion National Park,Bird,297
Zion National Park,Fish,15
Zion National Park,Mammal,80
Zion National Park,Reptile,30


In [42]:
# Write count_bio_categories_df to CSV:

merged_data_to_load = "./Resources/count_bio_categories.csv"
count_bio_categories_df.to_csv(merged_data_to_load, index=False)

In [43]:
# create a count of total different species categories per park
count_bio_categories = bio_category_df.groupby(['park_name']).count()['category']
count_bio_categories

park_name
Acadia National Park                               6
Arches National Park                               6
Badlands National Park                            10
Big Bend National Park                             7
Biscayne National Park                             8
Black Canyon of the Gunnison National Park         6
Bryce Canyon National Park                         6
Canyonlands National Park                          6
Capitol Reef National Park                         6
Carlsbad Caverns National Park                     6
Channel Islands National Park                     13
Congaree National Park                            14
Crater Lake National Park                         14
Cuyahoga Valley National Park                     11
Death Valley National Park                        14
Denali National Park and Preserve                  9
Dry Tortugas National Park                         6
Everglades National Park                           6
Gates Of The Arctic National Park an

In [44]:
# count of bio categories
cat_count = working_bio_df.groupby(["park_name"]).count()["category"]
cat_count

park_name
Acadia National Park                              1572
Arches National Park                              1006
Badlands National Park                            1365
Big Bend National Park                            2258
Biscayne National Park                            1590
Black Canyon of the Gunnison National Park        1099
Bryce Canyon National Park                        1189
Canyonlands National Park                         1166
Capitol Reef National Park                        1456
Carlsbad Caverns National Park                    1528
Channel Islands National Park                     1874
Congaree National Park                            2311
Crater Lake National Park                         3741
Cuyahoga Valley National Park                     1906
Death Valley National Park                        4386
Denali National Park and Preserve                 1309
Dry Tortugas National Park                         779
Everglades National Park                          1903


In [45]:
# create df of parks and species_count, category_count, and scientific_name count
biocount_parks_df = pd.DataFrame({
    "species_count" : cat_count,
    "bio_category_count":count_bio_categories
})

#Reset the index so the park name is no longer the index:

biocount_parks_df.reset_index(inplace=True)
biocount_parks_df = biocount_parks_df.rename(columns = {'index':'park_name'})

biocount_parks_df


Unnamed: 0,park_name,species_count,bio_category_count
0,Acadia National Park,1572,6
1,Arches National Park,1006,6
2,Badlands National Park,1365,10
3,Big Bend National Park,2258,7
4,Biscayne National Park,1590,8
5,Black Canyon of the Gunnison National Park,1099,6
6,Bryce Canyon National Park,1189,6
7,Canyonlands National Park,1166,6
8,Capitol Reef National Park,1456,6
9,Carlsbad Caverns National Park,1528,6


In [46]:
merged_trails_bio_df = pd.merge(trails_final_df, biocount_parks_df, on="park_name", how="inner")
pd.options.display.max_rows = 150000
merged_trails_bio_df

Unnamed: 0,trail_id,name,park_name,city_name,state_name,country_name,_geoloc,popularity,length,elevation_gain,difficulty_rating,route_type,visitor_usage,avg_rating,num_reviews,features,activities,units,species_count,bio_category_count
0,10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18852, 'lng': -149.63156}",24.8931,15610.598,1161.8976,5,out and back,3.0,5.0,423,"['dogs-no', 'forest', 'river', 'views', 'water...","['birding', 'camping', 'hiking', 'nature-trips...",i,1015,9
1,10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,"{'lat': 60.18879, 'lng': -149.631}",17.7821,2896.812,81.9912,1,out and back,3.0,4.5,224,"['dogs-no', 'partially-paved', 'views', 'wildl...","['hiking', 'walking']",i,1015,9
2,10187810,Bright Angel Trail to Bright Angel Campground ...,Grand Canyon National Park,Grand Canyon,Arizona,United States,"{'lat': 36.05735, 'lng': -112.14381}",37.4791,28485.318,1525.8288,5,out and back,3.0,5.0,670,"['dogs-no', 'river', 'views', 'wild-flowers', ...","['backpacking', 'camping', 'hiking', 'nature-t...",i,2604,10
3,10016964,South Kaibab Trail to Cedar Ridge,Grand Canyon National Park,Grand Canyon,Arizona,United States,"{'lat': 36.05346, 'lng': -112.08361}",36.2709,4988.954,358.7496,3,out and back,3.0,5.0,489,"['cave', 'dogs-no', 'forest', 'partially-paved...","['birding', 'camping', 'hiking', 'nature-trips...",i,2604,10
4,10237812,Three-Mile Resthouse via Bright Angel Trail,Grand Canyon National Park,Grand Canyon,Arizona,United States,"{'lat': 36.05701, 'lng': -112.14414}",33.256,8690.436,635.8128,5,out and back,3.0,4.5,454,"['dogs-no', 'river', 'views', 'wild-flowers', ...","['backpacking', 'birding', 'hiking', 'nature-t...",i,2604,10
5,10245012,"South Kaibab, Phantom Ranch, and Bright Angel ...",Grand Canyon National Park,Grand Canyon,Arizona,United States,"{'lat': 36.05344, 'lng': -112.08364}",31.6323,26875.978,1400.8608,5,point to point,3.0,5.0,519,"['dogs-no', 'river', 'views', 'waterfall', 'wi...","['backpacking', 'birding', 'camping', 'hiking'...",i,2604,10
6,10265905,South Kaibab Trail to Ooh Aah Point,Grand Canyon National Park,Grand Canyon,Arizona,United States,"{'lat': 36.05309, 'lng': -112.08387}",28.8685,2896.812,210.9216,3,out and back,3.0,5.0,455,"['dogs-no', 'views', 'wildlife']","['birding', 'hiking', 'nature-trips', 'walking']",i,2604,10
7,10266148,Grand Canyon Rim Trail,Grand Canyon National Park,Grand Canyon,Arizona,United States,"{'lat': 36.05723, 'lng': -112.14378}",28.2035,8690.436,106.9848,1,out and back,3.0,4.5,342,"['ada', 'dogs-leash', 'kids', 'paved', 'stroll...",['walking'],i,2604,10
8,10094721,Rim-to-Rim: North Kaibab to Grand Canyon Village,Grand Canyon National Park,North Rim,Arizona,United States,"{'lat': 36.21692, 'lng': -112.05678}",24.0229,34761.744,1614.8304,5,point to point,3.0,5.0,217,"['dogs-no', 'river', 'views', 'waterfall', 'wi...","['backpacking', 'camping', 'hiking', 'trail-ru...",i,2604,10
9,10026816,Shoshone Point Trail,Grand Canyon National Park,Grand Canyon,Arizona,United States,"{'lat': 36.03496, 'lng': -112.06924}",22.7206,3379.614,45.72,1,out and back,1.0,5.0,224,"['ada', 'dogs-no', 'kids', 'strollers', 'views...","['birding', 'hiking', 'nature-trips', 'trail-r...",i,2604,10


In [47]:
# Write merged_trails_bio_df to CSV:

merged_data_to_load = "./Resources/trails_bio_combined.csv"
merged_trails_bio_df.to_csv(merged_data_to_load, index=False)