In [1]:
## WebScraping, Address to Coordinates Conversion, Data Cleaning, and Google Maps Plotting. ##

In [2]:
#import required modules for the first section
from bs4 import BeautifulSoup
from selenium import webdriver #import module to obtain data with source code from a website
#import requests #import request module to obtain data without source code info from a website: ANOTHER METHOD# 
import time #used to pause loop while page is loading/scrolling
import pandas as pd #used to store information
import numpy as np

In [3]:
#Open up a window for the program to browse HTML data and begin webscrapping#

#**KEEP THIS SEPERATE**, MUST DOWNLOAD FILE AND PLACE INTO FOLDER
#Make sure that a window is always open
browser = webdriver.Chrome() #opens up a single Chrome window# 

#Note, if you run this statement more than once, more than one window will pop up#

In [4]:
#Extract the HTML data of Species name and location from the iNaturalist Website 

##User Parameters of what is data is desired by consumers##
numb_pages = 5 #number of pages user wants to extract from
species_id = 4202 #find the species ID number at the end of the URL labeled taxon_id = xxx
species_id = str(species_id) #convert int to str
location_clean = [] #array for cleaner extracted information
names = [] #list of species names in order

##Go through n-amount of pages and scroll through each page and obtain the data once it is at the end of the page. 
for page in range(1, numb_pages + 1): #go through n-amount of pages
    
    #load up the page at specific page number#
    browser.get('https://www.inaturalist.org/observations?page=' + str(page) + '&place_id=any&subview=table&taxon_id='+species_id)

    #Scroll down the page many times since all of the HTML code is not present
    for i in range(10): #how many times should you scroll down the page? 10 itertaions of pressing the end button is guaranteed to obtain all data on the page
        if i == 1:
            time.sleep(5) #it requires time to scroll load the page, user should change this based on computer/internet performace
            browser.execute_script("window.scrollTo(0,document.body.scrollHeight)") #scroll down the page to the end
        else:
            time.sleep(0.7) #it requires time to scroll and load the page
            browser.execute_script("window.scrollTo(0,document.body.scrollHeight)") #scroll the page to the end

    #Extract all HTML code from the page via beautifupsoup
    soup = BeautifulSoup(browser.page_source, "lxml") #create an instance based on the data retrived from website
    
    #Find specific classes in the HTML code associated with the information you're trying to obtain. 
    location_extract = soup.find_all('span',class_ = "location") ##extract location information 
    species_names = soup.find_all("a", class_ = "display-name comname") #THIS WAS MODIFIED AND WORKS PERFECT
    len_location = 0 #number of locations obtained
    
    #iterate through the specific HTML code and create two lists for series name and location
    for loc in location_extract: #extract each finding from find_all
        location_clean.append(loc["title"]) #add location info to list
        print(loc["title"]) #display the locations being extracted to compare to website. This works perfectly.
        len_location += 1 #increment by 1, this is used to stop obtaining data from Soup
    
    for p,name in enumerate(species_names):
        if p == len_location:
            break #this is used to not obtain any more data that is not useable. This allows less cleaning of the data. 
        else:
            names.append(name.text) #add species name to a list
            
        
#Make sure both location name length and species name length are equal. 
print("Total amount of locations:{}".format(len(location_clean))) 
print("Total amount of species:{}".format(len(names)))


Drake Passage
Drake Passage
-60.386337,-67.075401
Drake Passage
Drake Passage
Drake Passage
Drake Passage
Drake Passage
Drake Passage
Drake Passage
Drake Passage
Drake Passage
Drake Passage
Drake Passage
Drake Passage
-70.45243,-178.2946833333
-67.9783582949,-178.7705024183
Adelaide, Antarctica
-63.3380679949,-56.777184739
-61.5,-59.83
-63.9373724679,-49.306640625
-72.644913,-178.945312
-71.524909,-176.835937
near Deception Island, Antarctica
Ross Sea, Antarctica
Southern Ocean, Antarctica
Drake Passge





Total amount of locations:32
Total amount of species:32


In [5]:
#Create two series for species names and location
loc_clean = pd.Series(location_clean) #species location series
name_clean = pd.Series(names) #name of species series

In [6]:
#Create the DataFrame and clean the data.
locname = pd.DataFrame({"Location" : loc_clean, "Species" :name_clean}) #create a dataframe
locname

Unnamed: 0,Location,Species
0,Drake Passage,Species Antarctic Petrel\n
1,Drake Passage,Species Antarctic Petrel\n
2,"-60.386337,-67.075401",Species Antarctic Petrel\n
3,Drake Passage,Species Antarctic Petrel\n
4,Drake Passage,Species Antarctic Petrel\n
5,Drake Passage,Species Antarctic Petrel\n
6,Drake Passage,Species Antarctic Petrel\n
7,Drake Passage,Species Antarctic Petrel\n
8,Drake Passage,Species Antarctic Petrel\n
9,Drake Passage,Species Antarctic Petrel\n


In [7]:
#Clean the data#

#Drop empty data
locname = locname.replace("",float("NaN")) #replace any empty data with NaN
numb_rawdata = len(locname.dropna()) #total number of addresses extracted
locname = locname.dropna()#drop any missing data since this would waste time on the geocoder
numb_noaddress = len(locname) #number of valid addresses with no missing data

#Clean the species name info for clarity
locname["Species"] = locname["Species"].apply(lambda x:x.replace("Species",""))
locname["Species"] = locname["Species"].apply(lambda x:x.replace("\n",""))
locname["Species"] = locname["Species"].apply(lambda x:x.replace("Subspecies",""))

#clean the location/address information
locname["Location"] = locname["Location"].apply(lambda x:x.replace("near ",""))


In [8]:
locname #view the data

Unnamed: 0,Location,Species
0,Drake Passage,Antarctic Petrel
1,Drake Passage,Antarctic Petrel
2,"-60.386337,-67.075401",Antarctic Petrel
3,Drake Passage,Antarctic Petrel
4,Drake Passage,Antarctic Petrel
5,Drake Passage,Antarctic Petrel
6,Drake Passage,Antarctic Petrel
7,Drake Passage,Antarctic Petrel
8,Drake Passage,Antarctic Petrel
9,Drake Passage,Antarctic Petrel


In [9]:
#Turn Location data into coordinates/address and then insert into original dataframe using imported modules below
import geopy #pip install geopy
from geopy.geocoders import Nominatim #this is used to convert Address to Coordinates and vise-versa
geolocator = Nominatim(user_agent="SBU_Mapp")#create a user to use the program as this is a critical step since the program wont work

In [10]:
##CONVERT ADDRESS/COORDINATE TO A CLEAN ADDRESS AND COORDINATES
location_geo = [] ##this takes a long time to convert the coordinates and there is a limit on how much can be done. 
location_list_ordered = locname.Location.to_list()
numb_coverted2cord = 0
numb_originalcoord = 0
numb_addressnotconvert = 0
try: #if any exception occurs, take the data converted and plot it. Usual exception is an http exception when the program limit is reached.
    for i in range(len(location_list_ordered)): #go through entire list of locations obtain from Beautiful Soup
        time.sleep(1.3) #time must be greater than 1 second, due to the many requests. If there is not a lot of data, you can comment this out. 
        location_geo.append(geolocator.geocode(location_list_ordered[i])) #convert address to coordinates
        if location_geo[i] == None: #if the geocoder returns nothing, then obtain address or coordinates from list obtain via Beautiful Soup
            
            #Determine if the data is an address or a coordinate 
            #if an error is thrown, then it is an address because an address doesn't contain a number at the end of street name. 
            try: #determine if address or coordinate using try/except statement. 
                x = float(location_list_ordered[i][-1]) #determines if this is an address of a coordinate
                coord_temp = (float(location_list_ordered[i].split(",")[0]), float(location_list_ordered[i].split(",")[1]))#create a tuple of the coordinates
                location_geo[i] = [None, coord_temp] #no address, provide only a coordinate
                numb_originalcoord += 1 #number of coordinates submitted as an address
            #if value error is raised, continue with this statement after it fails at x above. 
            except ValueError: #it is an address
                location_geo[i] = [location_list_ordered[i],None] #no coordinate, provide only an address. 
                numb_addressnotconvert +=1 #number of addresses not converted to coordinates
                continue
        #if the address is converted to a coordinate, count the number of times it was converted. 
        else:
            numb_coverted2cord += 1 #number of times something was successfully converted

#this occurs when multiple errors occur 
#this allow the program to run and delete rest of the addresses not utilized. 
except: #this will be utilized when the geolocator program limit has been met and no more requests can be made
    locname_temp = locname #original locname used for storage for future use
    locname = locname[0:len(location_geo)]#reduce the length of locname since location_geo and locname need to be the same size
    

In [11]:
#Add cleanred and converted address data into original dataframe
locname.insert(2,"Location_Coord",location_geo) #insert the address/coord from geocoder, much cleaner/readable, into the dataframe 
location_geo = locname.Location_Coord.to_list() #obtain address and convert it into a list


In [12]:
##Create seperate columns with address and coordinates from the encoder
address = [] #address
coordinates = [] #both lat/long coord
coord_lat = [] #latitudinal coordinates, used to store integer number from csv file
coord_long = [] #longitudinal coordinates, used to store integer number from csv file

#seperate coordinates into Latitude and Longitude 
for i in range(len(location_geo)):
    address.append(location_geo[i][0]) #obtain new/cleaner address information
    coordinates.append(location_geo[i][1]) #obtain full coordinates 
    #check if coordinates are not present, turn into a blank string if None is present
    if location_geo[i][1] == None: #turn None into a string so that an error does not occur
        coord_lat.append("")
        coord_long.append("")
    #if there are coordinates present, convert into lat/long for future use of excel file
    else:    
        coord_lat.append(location_geo[i][1][0]) #if it is not None, then add the coordinate to coord lat
        coord_long.append(location_geo[i][1][1]) #if it is not None, then add the coordinate to coord long

In [13]:
#organize the dataframe with the new data and delete non-essential data
locname.insert(3,"Address",address) #insert cleaned address from geocoder
locname.insert(4,"Coordinates",coordinates) #insert coordinates from geocoder
#save and view previous information in case it is needed later on
locname #view before deleting Location Coord and Location
locname_temporary = locname #store all data incase it is needed later. 
#drop location/coordinates not from th geocoder
locname = locname.drop(labels = "Location_Coord", axis = 1) #this is no longer needed.
#drop non-converted addresses 
locname = locname.drop(labels = "Location", axis = 1) #this is no longer needed.
#insert new coordinates into excel sheet
locname.insert(3,"Coord_Lat",coord_lat)
locname.insert(4,"Coord_long",coord_long)

In [14]:
#send data to excel sheet for future use
locname_species = locname.sort_values("Species") #sort species values
from datetime import datetime #import time on computer for filename use
now = datetime.now() #determine the time once this is executed. 
current_time = now.strftime("%H%M%S") #provide the formate of the time
print("Current Time =", current_time) #print to compare to time this line was executed with computer time. 
locname_species.to_excel(locname.Species[0]+current_time+".xlsx")#Send data to excel sheet

Current Time = 052750


In [15]:
#plot coordinates using google

import gmaps #use pip install gmaps to install first before you enable extension

#Use the two enable statements below to show the google maps in Juptyer Notebook
#--------------------
#jupyter nbextension enable --py gmaps
#jupyter nbextension enable --py widgetsnbextension
#--------------------

gmaps.configure(api_key="AIzaSyBdQG21eLeWOqG_cflQr1RyxzOyOGA9yfg") #PLEASE DO NOT SHARE THIS KEY WITH ANYONE/DELETE THIS KEY WHEN IT IS NO LONGER NEEDED.

fig = gmaps.figure(map_type='SATELLITE')#create a figure of a Satellite view 

# generate some (latitude, longitude) pairs and remove any empty data types
location = [i for i in locname.Coordinates.to_list() if i] #remove all None type from coordinates section

heatmap_layer = gmaps.heatmap_layer(location) #create a heatmap
fig.add_layer(heatmap_layer) #create a layer of the heatmap
fig #display figure



Figure(layout=FigureLayout(height='420px'))

In [16]:
#create an HTML file of the map and save it to the computer for future use. 
from ipywidgets.embed import embed_minimal_html #used to export an HTML of the map
embed_minimal_html(locname.Species[0]+current_time+".html", views=[fig]) #export HTML of map

In [17]:
final = len(location) #number of coordinates that were plotted on the map

In [18]:
#Create a results section to show what was successfully converted or not. 

print("-------------------------------\nResults Section\n-------------------------------\nTotal Addresses/Coords pulled: {}\nTotal Address/Coord no missing data {}\nTotal Coordinates Submitted {}\nTotal # of Addresses not converted {}\nTotal # of Addresses Converted {}\nTotal # of locations dropped due to failure of encoder {}\nPoints used in heatmap {} ".format(numb_rawdata,numb_noaddress, numb_originalcoord,numb_addressnotconvert,numb_coverted2cord,numb_rawdata-numb_addressnotconvert-numb_coverted2cord,final))

-------------------------------
Results Section
-------------------------------
Total Addresses/Coords pulled: 27
Total Address/Coord no missing data 27
Total Coordinates Submitted 8
Total # of Addresses not converted 4
Total # of Addresses Converted 15
Total # of locations dropped due to failure of encoder 8
Points used in heatmap 23 
