In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

# Read in the raw data
observations_r = pd.read_csv('observations.csv')
species_r = pd.read_csv('species_info.csv')

# Sum the observations for each species in each park, so that every
# (scientific_name, park_name) pair appears exactly once
observations = observations_r.pivot_table(index=['scientific_name', 'park_name'], values='observations', aggfunc='sum').reset_index()

# Collapse species rows that repeat the same (scientific_name, conservation_status)
# pair, keeping the first occurrence. The name species_r is reused so that it holds
# the same de-duplicated frame as before for any later cell that reads it.
species_r = species_r.drop_duplicates(subset=['scientific_name', 'conservation_status'], keep='first')

# Row labels removed by hand after the de-duplication above. They are index labels
# from the CSV's default RangeIndex, so they shift if the file's row order changes.
# NOTE(review): presumably these are species that are still listed under more than
# one conservation_status, with the unwanted status dropped - confirm against
# species_info.csv.
SPECIES_ROWS_TO_DROP = [8, 560]
species = species_r.drop(SPECIES_ROWS_TO_DROP)

# Merge the two dataframes to analyze observations by species and their conservation status.
# validate='many_to_one' makes pandas raise a MergeError if scientific_name is still
# repeated in species, instead of silently duplicating observation rows.
wildlife = pd.merge(observations, species, on='scientific_name', how='left', validate='many_to_one')

# EDA
- [X] Check data-types
- [X] Check cleanliness
- [X] Remove duplicates in both df before merging and check merge
- [ ] Check univariate
- [ ] Check association

In [2]:
# First rows of the merged frame
display(wildlife.head())
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() renders an extra "None" below the report
wildlife.info()
# Summary statistics for numeric and non-numeric columns alike
display(wildlife.describe(include='all'))

Unnamed: 0,scientific_name,park_name,observations,category,common_names,conservation_status
0,Abies bifolia,Bryce National Park,109,Vascular Plant,Rocky Mountain Alpine Fir,
1,Abies bifolia,Great Smoky Mountains National Park,72,Vascular Plant,Rocky Mountain Alpine Fir,
2,Abies bifolia,Yellowstone National Park,215,Vascular Plant,Rocky Mountain Alpine Fir,
3,Abies bifolia,Yosemite National Park,136,Vascular Plant,Rocky Mountain Alpine Fir,
4,Abies concolor,Bryce National Park,83,Vascular Plant,"Balsam Fir, Colorado Fir, Concolor Fir, Silver...",


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22164 entries, 0 to 22163
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   scientific_name      22164 non-null  object
 1   park_name            22164 non-null  object
 2   observations         22164 non-null  int64 
 3   category             22164 non-null  object
 4   common_names         22164 non-null  object
 5   conservation_status  716 non-null    object
dtypes: int64(1), object(5)
memory usage: 1.0+ MB


None

Unnamed: 0,scientific_name,park_name,observations,category,common_names,conservation_status
count,22164,22164,22164.0,22164,22164,716
unique,5541,4,,7,5229,4
top,Abies bifolia,Bryce National Park,,Vascular Plant,Brachythecium Moss,Species of Concern
freq,4,5541,,17048,28,604
mean,,,149.555089,,,
std,,,81.421299,,,
min,,,9.0,,,
25%,,,88.0,,,
50%,,,129.0,,,
75%,,,217.0,,,


|Name|Description|
|----|-----------|
|Scientific name|The Latin name of each species. Each species occurs once per park (there are 4 different parks), hence there are 22164 / 4 = 5541 unique species|
|Observations|The number of observations per species per park. Ranges from 9 to 805, with a median of 129 (mean ≈ 150)|
|Category|is the category into which the observed species can be grouped|
|Common names|are the usually used names for each species|
|Conservation status|is the status of the observed entity|

## Research questions
1. Which species has the most observations? Which one has the least?
2. Which categories have the most observations? Which ones have the least?
3. Relationship between category and conservation status
4. How many observations are made for each conservation status?
5. Which parks have the most unique species?
6. Which parks have the most overall observations?
7. Which parks host the most endangered species?
8. Observations by Conservation Status and Park

## 1. Which species has the most observations? Which one has the least?

In [14]:
# Total observations per species across all parks, computed once and reused.
# The grouping key includes scientific_name because common_names alone is not a
# species identifier: several species can share one common name, and grouping on
# it would add their observations together.
species_totals = wildlife.groupby(['scientific_name', 'common_names']).observations.sum()
top10 = species_totals.nlargest(10).reset_index()
least10 = species_totals.nsmallest(10).reset_index()

# One bar per species; the common names stay available on hover
fig = px.bar(top10, x='scientific_name', y='observations', hover_data=['common_names'],
             title='Top 10 Species by Observations')
fig.show()

In [15]:
# Bar chart of the ten least-observed entries; the title names the bottom ten
# rather than repeating the title of the top-ten chart
fig = px.bar(least10, x='common_names', y='observations', title='Bottom 10 Species by Observations')
fig.show()

## 2. Which categories have the most observations? Which ones have the least?

In [17]:
# Total observations per category, computed once and reused for both rankings
category_totals = wildlife.groupby('category').observations.sum()
top10_cat = category_totals.nlargest(10).reset_index()
least10_cat = category_totals.nsmallest(10).reset_index()

# Plot the aggregated totals (one bar per category, largest first) instead of the
# raw frame, which would draw a separate bar segment for every row.
# nlargest(10) returns every category when there are ten or fewer.
fig = px.bar(top10_cat, x='category', y='observations', title='Total Observations by Category')
fig.show()

## 4. How many observations are made for each conservation status?

In [10]:
# Observations per conservation status: with a y column given, px.histogram
# sums y within each x value by default.
fig = px.histogram(
    wildlife,
    x='conservation_status',
    y='observations',
    title='Observations by Conservation Status',
)
fig.show()

## 8. Observations by Conservation Status and Park

In [8]:
# Summed observations per conservation status, with one bar per park placed side
# by side. The x axis is conservation_status so the chart matches its title.
fig = px.histogram(wildlife, x='conservation_status', y='observations', color='park_name',
                    barmode='group', title='Observations by Conservation Status and Park')
fig.show()