In [1]:
import pandas as pd
import numpy as np
import re
import requests

In [2]:
### to see all columns
pd.set_option('display.max_columns', None)

In [3]:
### Read the csv file from ECOS
ecos_species = pd.read_csv("../data/FWS_Species_Data_Explorer.csv")
ecos_species.head()

Unnamed: 0,Common Name,Scientific Name,Scientific Name_url,Species Group,ESA Listing Status,ESA Listing Date,Entity Description,Foreign or Domestic,Image URL,Image URL_url,Range Envelope,State Abbreviation,State Name,Country Name,Food Habits,Habitat Requirements,Movement / Home Range,Other Information,Reproductive Strategy,Species Description
0,Abbott's booby,Papasula (=Sula) abbotti,https://ecos.fws.gov/ecp/species/1470,Birds,Endangered,06-14-1976,Wherever found,Foreign,/docs/species_images/doc1650.jpg,https://ecos.fws.gov/docs/species_images/doc16...,,,,Christmas Island,,,,,,
1,Acklins ground iguana,Cyclura rileyi nuchalis,https://ecos.fws.gov/ecp/species/27,Reptiles,Threatened,06-22-1983,Wherever found,Foreign,/docs/species_images/doc1803.jpg,https://ecos.fws.gov/docs/species_images/doc18...,,,,Bahamas,,,,,,
2,African elephant,Loxodonta africana,https://ecos.fws.gov/ecp/species/7724,Mammals,Threatened,06-11-1978,Wherever found,Foreign,/docs/species_images/doc1458.jpg,https://ecos.fws.gov/docs/species_images/doc14...,,,,Sudan,,,,,,
3,African elephant,Loxodonta africana,https://ecos.fws.gov/ecp/species/7724,Mammals,Threatened,06-11-1978,Wherever found,Foreign,/docs/species_images/doc1458.jpg,https://ecos.fws.gov/docs/species_images/doc14...,,,,Sierra Leone,,,,,,
4,African elephant,Loxodonta africana,https://ecos.fws.gov/ecp/species/7724,Mammals,Threatened,06-11-1978,Wherever found,Foreign,/docs/species_images/doc1458.jpg,https://ecos.fws.gov/docs/species_images/doc14...,,,,Kenya,,,,,,


## I/ Explore the columns and drop some columns:

In [4]:
#ecos_species["Conservation Plan Type"].unique()

* Definition:\
HCP = Habitat conservation plans  
CCAA = Candidate Conservation Agreements with Assurances   
CCA = Candidate Conservation Agreements  
SHA = Safe Harbor Agreements 

In [5]:
ecos_species["Species Group"].unique()

array(['Birds', 'Reptiles', 'Mammals', 'Amphibians', 'Flowering Plants',
       'Clams', 'Insects', 'Fishes', 'Crustaceans', 'Snails', 'Arachnids',
       'Ferns and Allies', 'Corals'], dtype=object)

In [6]:
ecos_species["Entity Description"].unique()

array(['Wherever found',
       'Wherever found; Except where listed as Experimental Populations',
       'U.S.A. (AL;The free-flowing reach of the Tennessee R. from the base of Wilson Dam downstream to the backwaters of Pickwick Reservoir [about 12 RM (19 km)] and the lower 5 RM [8 km] of all tributaries to this reach in Colbert and Lauderdale Cos., see 17.85(a))',
       'U.S.A. (LA, all counties; MS, TX, only within the historic county range of the Louisiana black bear)',
       'Wherever found, except where listed as an experimental population',
       'In southwestern Missouri, the counties of Cedar, St. Clair, Bates, and Vernon.',
       nan, 'U.S.A. (FL)', 'Wherever found, except in U.S.A. (FL)',
       'Upper Missouri River DPS', 'Kyrgyzstan, Mongolia, and Tajikistan',
       'Wherever found except Kyrgyzstan, Mongolia, and Tajikistan',
       'Kennebec River DPS', 'Gulf of Maine DPS - See 50 CFR 224.101',
       'U.S.A. ME Gulf of Maine Atlantic Salmon Distinct Population Segm

In [7]:
### drop columns
ecos_species = ecos_species.drop(["Scientific Name_url", "ESA Listing Date", "Entity Description", "Image URL"], axis = 1)

In [8]:
ecos_species.shape

(9390, 16)

In [9]:
### In case if there is duplicates that need to be dropped
ecos_species = ecos_species.drop_duplicates()

In [10]:
ecos_species.shape

(9337, 16)

In [11]:
# Rename column other information to threat comments
ecos_species.rename(columns = {'Other Information':'Threat Comments'}, inplace = True) 

## II/ Filter rows:

### 1- Filter species status according to ESA (Endangered Species Act):

In [12]:
ecos_species["ESA Listing Status"].unique()

array(['Endangered', 'Threatened',
       'Experimental Population, Non-Essential', 'Recovery',
       'Species of Concern', 'Similarity of Appearance (Threatened)',
       'Resolved Taxon', 'Proposed Threatened', 'Under Review',
       'Proposed Experimental Population, Non-Essential', 'Extinction',
       'Status Undefined',
       'Original Data in Error - New Information Discovered',
       'Proposed Similarity of Appearance (Threatened)',
       'Proposed Endangered', 'Candidate',
       'Original Data in Error - Taxonomic Revision'], dtype=object)

In [13]:
### I only want endangered, threatened, etc.
ecos_end_species = ecos_species[ecos_species["ESA Listing Status"].isin(["Threatened", "Species of Concern", "Endangered", "Similarity of Appearance (Threatened)", "Extinction", "Proposed Threatened", "Proposed Endangered", "Proposed Similarity of Appearance (Threatened)"])]

In [14]:
ecos_end_species["ESA Listing Status"].unique()

array(['Endangered', 'Threatened', 'Species of Concern',
       'Similarity of Appearance (Threatened)', 'Proposed Threatened',
       'Extinction', 'Proposed Similarity of Appearance (Threatened)',
       'Proposed Endangered'], dtype=object)

In [15]:
### For simplicity change "proposed... etc."" for threatened or endangered
ecos_end_species["ESA Listing Status"] = ecos_end_species["ESA Listing Status"].replace("Similarity of Appearance (Threatened)", "Threatened")
ecos_end_species["ESA Listing Status"] = ecos_end_species["ESA Listing Status"].replace("Proposed Threatened", "Threatened")
ecos_end_species["ESA Listing Status"] = ecos_end_species["ESA Listing Status"].replace("Proposed Similarity of Appearance (Threatened)", "Threatened")
ecos_end_species["ESA Listing Status"] = ecos_end_species["ESA Listing Status"].replace("Proposed Endangered", "Endangered")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [16]:
ecos_end_species["ESA Listing Status"].unique()

array(['Endangered', 'Threatened', 'Species of Concern', 'Extinction'],
      dtype=object)

### 2- Fix NaN and \r\n:

In [17]:
### Fix "\r\n" in column "Movement..." with "No info available" 
dict = {'\r\n':"No information available"}
ecos_end_species = ecos_end_species.replace(dict) 

In [18]:
### Fix NaN values in columns with NaN in it with "No info available" 
ecos_end_species['Food Habits'] = ecos_end_species['Food Habits'].fillna("No information available")
ecos_end_species['Habitat Requirements'] = ecos_end_species['Habitat Requirements'].fillna("No information available")
ecos_end_species['Movement / Home Range'] = ecos_end_species['Movement / Home Range'].fillna("No information available")
ecos_end_species['Threat Comments'] = ecos_end_species['Threat Comments'].fillna("No information available")
ecos_end_species['Reproductive Strategy'] = ecos_end_species['Reproductive Strategy'].fillna("No information available")
ecos_end_species['Species Description'] = ecos_end_species['Species Description'].fillna("No information available")

In [19]:
ecos_end_species.head(10)

Unnamed: 0,Common Name,Scientific Name,Species Group,ESA Listing Status,Foreign or Domestic,Image URL_url,Range Envelope,State Abbreviation,State Name,Country Name,Food Habits,Habitat Requirements,Movement / Home Range,Threat Comments,Reproductive Strategy,Species Description
0,Abbott's booby,Papasula (=Sula) abbotti,Birds,Endangered,Foreign,https://ecos.fws.gov/docs/species_images/doc16...,,,,Christmas Island,No information available,No information available,No information available,No information available,No information available,No information available
1,Acklins ground iguana,Cyclura rileyi nuchalis,Reptiles,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc18...,,,,Bahamas,No information available,No information available,No information available,No information available,No information available,No information available
2,African elephant,Loxodonta africana,Mammals,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc14...,,,,Sudan,No information available,No information available,No information available,No information available,No information available,No information available
3,African elephant,Loxodonta africana,Mammals,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc14...,,,,Sierra Leone,No information available,No information available,No information available,No information available,No information available,No information available
4,African elephant,Loxodonta africana,Mammals,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc14...,,,,Kenya,No information available,No information available,No information available,No information available,No information available,No information available
5,African elephant,Loxodonta africana,Mammals,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc14...,,,,Congo (Zaire),No information available,No information available,No information available,No information available,No information available,No information available
6,African elephant,Loxodonta africana,Mammals,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc14...,,,,Cameroon,No information available,No information available,No information available,No information available,No information available,No information available
7,African elephant,Loxodonta africana,Mammals,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc14...,,,,Uganda,No information available,No information available,No information available,No information available,No information available,No information available
8,African elephant,Loxodonta africana,Mammals,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc14...,,,,Central African Republic,No information available,No information available,No information available,No information available,No information available,No information available
9,African elephant,Loxodonta africana,Mammals,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc14...,,,,Nigeria,No information available,No information available,No information available,No information available,No information available,No information available


### 3- Fix html tags "< i>" and "< /i>": 

In [20]:
### Make a function to get ride of html tags <i> </i>
def striphtml(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)

In [21]:
### Apply the function in columns with the html tags
ecos_end_species["Species Description"] = ecos_end_species["Species Description"].apply(striphtml)
ecos_end_species["Threat Comments"] = ecos_end_species["Threat Comments"].apply(striphtml)

In [22]:
ecos_end_species.head(10)

Unnamed: 0,Common Name,Scientific Name,Species Group,ESA Listing Status,Foreign or Domestic,Image URL_url,Range Envelope,State Abbreviation,State Name,Country Name,Food Habits,Habitat Requirements,Movement / Home Range,Threat Comments,Reproductive Strategy,Species Description
0,Abbott's booby,Papasula (=Sula) abbotti,Birds,Endangered,Foreign,https://ecos.fws.gov/docs/species_images/doc16...,,,,Christmas Island,No information available,No information available,No information available,No information available,No information available,No information available
1,Acklins ground iguana,Cyclura rileyi nuchalis,Reptiles,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc18...,,,,Bahamas,No information available,No information available,No information available,No information available,No information available,No information available
2,African elephant,Loxodonta africana,Mammals,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc14...,,,,Sudan,No information available,No information available,No information available,No information available,No information available,No information available
3,African elephant,Loxodonta africana,Mammals,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc14...,,,,Sierra Leone,No information available,No information available,No information available,No information available,No information available,No information available
4,African elephant,Loxodonta africana,Mammals,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc14...,,,,Kenya,No information available,No information available,No information available,No information available,No information available,No information available
5,African elephant,Loxodonta africana,Mammals,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc14...,,,,Congo (Zaire),No information available,No information available,No information available,No information available,No information available,No information available
6,African elephant,Loxodonta africana,Mammals,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc14...,,,,Cameroon,No information available,No information available,No information available,No information available,No information available,No information available
7,African elephant,Loxodonta africana,Mammals,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc14...,,,,Uganda,No information available,No information available,No information available,No information available,No information available,No information available
8,African elephant,Loxodonta africana,Mammals,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc14...,,,,Central African Republic,No information available,No information available,No information available,No information available,No information available,No information available
9,African elephant,Loxodonta africana,Mammals,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc14...,,,,Nigeria,No information available,No information available,No information available,No information available,No information available,No information available


### 4- Fix column names:

In [23]:
ecos_end_species.rename(columns = {"Common Name":"common_name", 'Scientific Name':'scientific_name', "Species Group": "species_group", "ESA Listing Status": "esa_status", "Foreign or Domestic": "foreign_or_domestic", "Image URL_url":"image_url_full", "Range Envelope": "location", "State Abbreviation": "state_abbr", "State Name": "state", "Country Name":"country", "Food Habits": "food", "Habitat Requirements": "habitat_requirements", "Movement / Home Range": "home_range", "Threat Comments": "cause", "Reproductive Strategy": "reproduction", "Species Description": "description" }, inplace = True)
ecos_end_species.head(2)

Unnamed: 0,common_name,scientific_name,species_group,esa_status,foreign_or_domestic,image_url_full,location,state_abbr,state,country,food,habitat_requirements,home_range,cause,reproduction,description
0,Abbott's booby,Papasula (=Sula) abbotti,Birds,Endangered,Foreign,https://ecos.fws.gov/docs/species_images/doc16...,,,,Christmas Island,No information available,No information available,No information available,No information available,No information available,No information available
1,Acklins ground iguana,Cyclura rileyi nuchalis,Reptiles,Threatened,Foreign,https://ecos.fws.gov/docs/species_images/doc18...,,,,Bahamas,No information available,No information available,No information available,No information available,No information available,No information available


### 5- Make a dataframe with all countries and remove the states (since info only for USA):

In [24]:
### Drop states in the df 
ecos_end_species_countries = ecos_end_species.drop(["state_abbr", "state"], axis = 1)

In [25]:
ecos_end_species_countries.shape

(8093, 14)

In [26]:
### Drop duplicates
ecos_end_species_countries = ecos_end_species_countries.drop_duplicates()

In [27]:
ecos_end_species_countries.shape

(1723, 14)

### 6- Make a dataframe with only Canada and USA (these are the countries with the majority of information):

In [28]:
ecos_end_species["country"].unique()

array(['Christmas Island', 'Bahamas', 'Sudan', 'Sierra Leone', 'Kenya',
       'Congo (Zaire)', 'Cameroon', 'Uganda', 'Central African Republic',
       'Nigeria', 'Chad', 'Burkina Faso', 'Togo', 'Niger', 'Gabon',
       'Senegal', 'Tanzania', 'Mauritania', 'Ivory Coast', 'Benin',
       'Congo', 'Liberia', 'Ghana', 'Mali', 'Guinea', 'Burundi', nan,
       'Ethiopia', 'United States', 'Canada', 'Colombia', 'Haiti',
       'Guatemala', 'Mexico', 'El Salvador', 'Peru', 'Ecuador',
       'Costa Rica', 'Honduras', 'Jamaica', 'Belize',
       'Dominican Republic', 'Venezuela', 'Nicaragua', 'Cuba',
       'French Southern Territories', 'Argentina', 'Chile',
       'British Virgin Islands', 'Madagascar', 'Italy', 'Israel', 'Egypt',
       'Saudi Arabia', 'Jordan', 'Oman', 'Mongolia', 'Tajikistan',
       'Russia', 'Afghanistan', 'Kazakstan', 'Uzbekistan', 'India',
       'Pakistan', 'China', 'Kyrgyzstan', 'Nepal', 'Aruba', 'Vietnam',
       'Thailand', 'Myanmar (Burma)', 'Indonesia', 'Cambodi

In [29]:
### Looks like I have most information for US and maybe canada Canada ( as well as for NatureServe explorer if I use it at some point) 
ecos_end_species_us_ca = ecos_end_species[ecos_end_species["country"].isin(["United States", "Canada"])]

In [30]:
ecos_end_species_us_ca["country"].unique()

array(['United States', 'Canada'], dtype=object)

In [31]:
ecos_end_species_us_ca.shape

(1883, 16)

In [32]:
### Drop states in the df 
ecos_end_species_us_ca = ecos_end_species_us_ca.drop(["state_abbr", "state"], axis = 1)

In [33]:
### Drop duplicates
ecos_end_species_us_ca = ecos_end_species_us_ca.drop_duplicates()

In [34]:
ecos_end_species_us_ca.shape

(571, 14)

In [35]:
# If I want to keep only animals and drop plants but so far I will keep them all
#ecos_end_animal_species = ecos_end_species.loc[(ecos_end_species["Species Group"] != "Flowering Plants") & (ecos_end_species["Species Group"] != "Ferns and Allies")]

### 7- Make a dataframe with only USA (only country with states info):

In [36]:
ecos_end_species_us = ecos_end_species[ecos_end_species["country"].isin(["United States"])]

In [37]:
ecos_end_species_us.shape

(1581, 16)

In [38]:
### Drop duplicates
ecos_end_species_us = ecos_end_species_us.drop_duplicates()

In [39]:
ecos_end_species_us.shape

(1581, 16)

### 8- Make a dataframe with only states to avoid duplicates for USA dataframe:

In [40]:
### new df with only the states
ecos_states_us = ecos_end_species_us.loc[:, ["common_name", "scientific_name", "species_group", "esa_status", "foreign_or_domestic", "state", "state_abbr", "country" ]]

In [41]:
ecos_states_us.shape

(1581, 8)

In [42]:
ecos_states_us.head(2)

Unnamed: 0,common_name,scientific_name,species_group,esa_status,foreign_or_domestic,state,state_abbr,country
34,Akekee,Loxops caeruleirostris,Birds,Endangered,Domestic,Hawaii,HI,United States
35,Akikiki,Oreomystis bairdi,Birds,Endangered,Domestic,Hawaii,HI,United States


In [43]:
### Drop duplicates
ecos_states_us = ecos_states_us.drop_duplicates()

In [44]:
ecos_states_us.shape

(1276, 8)

In [None]:
### Drop states in the df with animal information
ecos_end_species_us = ecos_end_species_us.drop(["state", "state_abbr"], axis = 1)

In [None]:
### Drop duplicates
ecos_end_species_us = ecos_end_species_us.drop_duplicates()

In [None]:
ecos_end_species_us.shape

In [None]:
ecos_end_species_us.head(10)

### 9- Make a dataframe with all countries for treemap but drop pictures and location:

In [None]:
ecos_end_species_countries.head(2)

In [None]:
### make a new df for treemap purpose
ecos_end_species_treemap = ecos_end_species_countries.loc[:, ["common_name", "scientific_name", "species_group", "esa_status", "foreign_or_domestic", "country"]]
ecos_end_species_treemap.head(2)

In [None]:
ecos_end_species_treemap.shape

In [None]:
### Drop duplicates
ecos_end_species_treemap = ecos_end_species_treemap.drop_duplicates()

In [None]:
ecos_end_species_treemap.shape

### 10- Look if there is any data missing:

In [None]:
### function to ask if some values are missing... if it's True, yes there is missing information
ecos_end_species_us_ca.isnull().values.any()

In [None]:
### To know how many values are missing
ecos_end_species_us_ca.isnull().sum().sum()

In [None]:
### Use describe to localize in which column the data is missing
ecos_end_species_us_ca.describe()

There are 6 values missing in the column "location"... will drop NA polygones in the midstone_cleaning_03.

In [None]:
### Df USA
ecos_end_species_us.describe()

Only one polygon data missing in the dataframe with only USA

In [None]:
### Df all countries
ecos_end_species_countries.describe()

Some countries are missing and a lot of polygon data as expected unfortunately...

In [None]:
###drop NA values in column country 
ecos_end_species_countries = ecos_end_species_countries[ecos_end_species_countries["country"].notnull()]

In [None]:
ecos_end_species_countries.describe()

### 11- images duplicate ( some images are for the same species but have a different image number and sometimes the image is absent):

### Dataframe US and Canada

In [None]:
ecos_end_species_us_ca.shape

In [None]:
### NB: there is only 434 unique scientific name which mean there is still some duplicates... comes from the images files!
ecos_end_species_us_ca["scientific_name"].nunique()

In [None]:
### function to check if image is available or not
def status_checker (url):
    request = requests.get(url)
    return request.status_code

In [None]:
### Apply the function in my Df US-CA only
ecos_end_species_us_ca["status_code"] = ecos_end_species_us_ca["image_url_full"].apply(status_checker)

In [None]:
### Super cool function where the sort_value will put on top the images available and then you run a drop_duplicate on scientific name so if the scientific name was already called the nex one will be droped
ecos_end_species_us_ca = ecos_end_species_us_ca.sort_values("status_code").drop_duplicates("scientific_name")

In [None]:
ecos_end_species_us_ca.shape

In [None]:
ecos_end_species_us_ca.describe()

In [None]:
### drop columns status code
ecos_end_species_us_ca = ecos_end_species_us_ca.drop(["status_code"], axis=1)

In [None]:
ecos_end_species_us_ca.head(2)

### Dataframe US only

In [None]:
### Apply the function in my Df US only
ecos_end_species_us["status_code"] = ecos_end_species_us["image_url_full"].apply(status_checker)

In [None]:
### remove images duplicates if any
ecos_end_species_us = ecos_end_species_us.sort_values("status_code").drop_duplicates("scientific_name")

In [None]:
ecos_end_species_us.shape

In [None]:
ecos_end_species_us.describe()

In [None]:
### drop columns status code
ecos_end_species_us = ecos_end_species_us.drop(["status_code"], axis=1)

In [None]:
ecos_end_species_us.head(2)

### Dataframe all countries

In [None]:
### Apply the function in my Df US only
ecos_end_species_countries["status_code"] = ecos_end_species_countries["image_url_full"].apply(status_checker)

In [None]:
### remove images duplicates if any
ecos_end_species_countries = ecos_end_species_countries.sort_values("status_code").drop_duplicates("scientific_name")

In [None]:
ecos_end_species_countries.shape

In [None]:
ecos_end_species_countries.describe()

In [None]:
### drop columns status code
ecos_end_species_countries = ecos_end_species_countries.drop(["status_code"], axis=1)

In [None]:
ecos_end_species_countries.head(2)

### 9- Save the cleaned dataframes as new csv files:

In [None]:
### Save all countries and info df as a new CSV file
ecos_end_species_countries.to_csv('../data/ecos_end_species_countries_clean.csv', index = False)

In [None]:
### Save us-ca and info df as a new CSV file
ecos_end_species_us_ca.to_csv('../data/ecos_end_species_us_ca_clean.csv', index = False)

In [None]:
### Save us and info df as a new CSV file
ecos_end_species_us.to_csv('../data/ecos_end_species_us_clean.csv', index = False)

In [45]:
### Save states for USA as a new CSV file
ecos_states_us.to_csv('../data/ecos_states_us_clean.csv', index = False)

In [None]:
### Save countries for treemap as a new CSV file
ecos_end_species_treemap.to_csv('../data/ecos_countries_treemap_clean.csv', index = False)