# GBIF dataset

We have used the *EOD – eBird Observation Dataset* from GBIF, the Global Biodiversity Information Facility


In [2]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# loading EOD dataset
reading = pd.read_csv('gbif_eod.csv', 
                   sep = '\t', 
                   error_bad_lines = False)

In [4]:
# working with a copy
gbif = reading.copy()

In [5]:
# displaying all columns
pd.set_option('display.max_columns', None)
gbif.head()

Unnamed: 0,gbifID,datasetKey,occurrenceID,kingdom,phylum,class,order,family,genus,species,infraspecificEpithet,taxonRank,scientificName,verbatimScientificName,verbatimScientificNameAuthorship,countryCode,locality,stateProvince,occurrenceStatus,individualCount,publishingOrgKey,decimalLatitude,decimalLongitude,coordinateUncertaintyInMeters,coordinatePrecision,elevation,elevationAccuracy,depth,depthAccuracy,eventDate,day,month,year,taxonKey,speciesKey,basisOfRecord,institutionCode,collectionCode,catalogNumber,recordNumber,identifiedBy,dateIdentified,license,rightsHolder,recordedBy,typeStatus,establishmentMeans,lastInterpreted,mediaType,issue
0,1787081975,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,URN:catalog:CLO:EBIRD_ESP:OBS478457128,Animalia,Chordata,Aves,Pelecaniformes,Ardeidae,Egretta,Egretta garzetta,,SPECIES,"Egretta garzetta (Linnaeus, 1766)",Egretta garzetta,,ES,Quinto T.M.,Aragón,PRESENT,20.0,e2e717bf-551a-4917-bdc9-4fa0f342c530,41.42844,-0.502882,,,,,,,2012-03-03T00:00:00,3,3,2012,2480876,2480876,HUMAN_OBSERVATION,CLO,EBIRD_ESP,OBS478457128,,,,CC_BY_4_0,,obsr884804,,,2021-06-16T14:27:54.852Z,,
1,2057171899,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,URN:catalog:CLO:EBIRD:OBS489292176,Animalia,Chordata,Aves,Phoenicopteriformes,Phoenicopteridae,Phoenicopterus,Phoenicopterus roseus,,SPECIES,"Phoenicopterus roseus Pallas, 1811",Phoenicopterus roseus,,ES,El Hondo PNat--Área General,Valenciana Comunidad,PRESENT,30.0,e2e717bf-551a-4917-bdc9-4fa0f342c530,38.18585,-0.780973,,,,,,,2017-04-23T00:00:00,23,4,2017,4352332,4352332,HUMAN_OBSERVATION,CLO,EBIRD,OBS489292176,,,,CC_BY_4_0,,obsr424629,,,2021-06-16T14:27:59.243Z,,COORDINATE_ROUNDED
2,2065545864,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,URN:catalog:CLO:EBIRD:OBS491474054,Animalia,Chordata,Aves,Passeriformes,Fringillidae,Chloris,Chloris chloris,,SPECIES,"Chloris chloris (Linnaeus, 1758)",Chloris chloris,,ES,"36,7155x-5,2412 - 28 Apr 2017, 20:58",Andalucía,PRESENT,1.0,e2e717bf-551a-4917-bdc9-4fa0f342c530,36.7155,-5.241248,,,,,,,2017-04-28T00:00:00,28,4,2017,5845582,5845582,HUMAN_OBSERVATION,CLO,EBIRD,OBS491474054,,,,CC_BY_4_0,,obsr841249,,,2021-06-16T14:27:59.891Z,,
3,2099775253,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,URN:catalog:CLO:EBIRD:OBS559532012,Animalia,Chordata,Aves,Columbiformes,Columbidae,Columba,Columba livia,,SPECIES,"Columba livia J.F.Gmelin, 1789",Columba livia,,ES,Quintos de Mora,Castilla-La Mancha,PRESENT,,e2e717bf-551a-4917-bdc9-4fa0f342c530,39.398663,-4.074597,,,,,,,2017-11-15T00:00:00,15,11,2017,2495414,2495414,HUMAN_OBSERVATION,CLO,EBIRD,OBS559532012,,,,CC_BY_4_0,,obsr825751,,,2021-06-16T14:28:31.288Z,,COORDINATE_ROUNDED
4,2135744488,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,URN:catalog:CLO:EBIRD:OBS588686591,Animalia,Chordata,Aves,Passeriformes,Aegithalidae,Aegithalos,Aegithalos caudatus,,SPECIES,"Aegithalos caudatus (Linnaeus, 1758)",Aegithalos caudatus,,ES,Valencia--Jardines del Turia,Valenciana Comunidad,PRESENT,4.0,e2e717bf-551a-4917-bdc9-4fa0f342c530,39.477215,-0.370145,,,,,,,2018-03-16T00:00:00,16,3,2018,2495000,2495000,HUMAN_OBSERVATION,CLO,EBIRD,OBS588686591,,,,CC_BY_4_0,,obsr534120,,,2021-06-16T14:28:43.256Z,,COORDINATE_ROUNDED


There are some columns that don't give us relevant information: 
- gbifID
- datasetKey
- occurrenceID
- publishingOrgKey
- taxonKey
- speciesKey
- institutionCode
- collectionCode
- catalogNumber
- recordNumber
- identifiedBy
- dateIdentified
- license
- rightsHolder
- recordedBy
- typeStatus
- establishmentMeans
- lastInterpreted
- mediaType
- issue

We found some columns with one unique value, so we decided to drop them:
- kingdom
- phylum
- class
- taxonRank
- countryCode
- occurrenceStatus
- basisOfRecord

There are some columns with all non-value items: 
- infraspecificEpithet 
- verbatimScientificNameAuthorship
- coordinateUncertaintyInMeters
- coordinatePrecision
- elevation
- elevationAccuracy
- depth
- depthAccuracy

We decided to drop the scientific name and the verbatim scientific name and keep the specie, as both make reference to the same information
- scientificName
- verbatimScientificName


In [6]:
# dropping some columns 
to_drop = ['gbifID', 'datasetKey', 'occurrenceID', 'kingdom', 'phylum', 
           'class', 'infraspecificEpithet', 'taxonRank', 'scientificName',
           'verbatimScientificNameAuthorship', 'countryCode',
           'occurrenceStatus', 'publishingOrgKey', 'coordinateUncertaintyInMeters',
           'coordinatePrecision', 'elevation', 'elevationAccuracy', 'depth', 
           'depthAccuracy', 'taxonKey', 'speciesKey', 'basisOfRecord', 
           'institutionCode', 'collectionCode', 'catalogNumber', 'recordNumber',
           'identifiedBy', 'dateIdentified', 'license', 'rightsHolder','recordedBy',
           'typeStatus', 'establishmentMeans', 'verbatimScientificName',
           'lastInterpreted','mediaType', 'issue']
gbif.drop(to_drop, axis = 1, inplace = True)

In [7]:
# Now we have a 13 columns dataset
gbif.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12382263 entries, 0 to 12382262
Data columns (total 13 columns):
 #   Column            Dtype  
---  ------            -----  
 0   order             object 
 1   family            object 
 2   genus             object 
 3   species           object 
 4   locality          object 
 5   stateProvince     object 
 6   individualCount   float64
 7   decimalLatitude   float64
 8   decimalLongitude  float64
 9   eventDate         object 
 10  day               int64  
 11  month             int64  
 12  year              int64  
dtypes: float64(3), int64(3), object(7)
memory usage: 1.2+ GB


In [8]:
# setting an index for the dataset
gbif.index = [x for x in range(1, len(gbif.values)+1)]

In [9]:
gbif.isna().sum()

order                     0
family                    0
genus                     0
species                   0
locality                144
stateProvince             0
individualCount     1879095
decimalLatitude           0
decimalLongitude          0
eventDate                 0
day                       0
month                     0
year                      0
dtype: int64

In [10]:
# We assume that NaN values in the "individualCount" column means that there was only one specimen
gbif.individualCount = gbif.individualCount.replace(np.nan, '1')

In [11]:
# we change the individualCount values to an integer
gbif.individualCount = gbif.individualCount.astype(int)

In [12]:
# Reassigning the columns titles
gbif = gbif.rename(columns = {'decimalLatitude': 'lat',
                              'decimalLongitude': 'long',
                             'stateProvince': 'region',
                             'individualCount': 'observations'})


In [13]:
gbif.head()

Unnamed: 0,order,family,genus,species,locality,region,observations,lat,long,eventDate,day,month,year
1,Pelecaniformes,Ardeidae,Egretta,Egretta garzetta,Quinto T.M.,Aragón,20,41.42844,-0.502882,2012-03-03T00:00:00,3,3,2012
2,Phoenicopteriformes,Phoenicopteridae,Phoenicopterus,Phoenicopterus roseus,El Hondo PNat--Área General,Valenciana Comunidad,30,38.18585,-0.780973,2017-04-23T00:00:00,23,4,2017
3,Passeriformes,Fringillidae,Chloris,Chloris chloris,"36,7155x-5,2412 - 28 Apr 2017, 20:58",Andalucía,1,36.7155,-5.241248,2017-04-28T00:00:00,28,4,2017
4,Columbiformes,Columbidae,Columba,Columba livia,Quintos de Mora,Castilla-La Mancha,1,39.398663,-4.074597,2017-11-15T00:00:00,15,11,2017
5,Passeriformes,Aegithalidae,Aegithalos,Aegithalos caudatus,Valencia--Jardines del Turia,Valenciana Comunidad,4,39.477215,-0.370145,2018-03-16T00:00:00,16,3,2018


In [14]:
# checking unique values in order to confirm dataset is clean
print('Order: ', gbif.order.unique())
print('Family: ', gbif.family.unique())
print('Genus: ', gbif.genus.unique())
print('Specie: ', gbif.species.unique())
print('Latitude: ', gbif.lat.unique())
print('Longitude: ', gbif.long.unique())
print('Regions: ', gbif.region.unique())
print('Observations: ', gbif.observations.unique())
print('Family: ', gbif.family.unique())
print('Year: ', gbif.year.unique())
print('Month: ', gbif.month.unique())
print('Day: ', gbif.day.unique())

Order:  ['Pelecaniformes' 'Phoenicopteriformes' 'Passeriformes' 'Columbiformes'
 'Charadriiformes' 'Podicipediformes' 'Anseriformes' 'Ciconiiformes'
 'Accipitriformes' 'Gruiformes' 'Bucerotiformes' 'Cuculiformes'
 'Suliformes' 'Apodiformes' 'Falconiformes' 'Psittaciformes'
 'Coraciiformes' 'Strigiformes' 'Piciformes' 'Galliformes' 'Otidiformes'
 'Procellariiformes' 'Pteroclidiformes' 'Caprimulgiformes' 'Gaviiformes'
 'Phaethontiformes']
Family:  ['Ardeidae' 'Phoenicopteridae' 'Fringillidae' 'Columbidae' 'Aegithalidae'
 'Hirundinidae' 'Laniidae' 'Laridae' 'Podicipedidae' 'Anatidae' 'Corvidae'
 'Scolopacidae' 'Ciconiidae' 'Muscicapidae' 'Remizidae' 'Accipitridae'
 'Motacillidae' 'Sylviidae' 'Passeridae' 'Rallidae' 'Upupidae'
 'Phylloscopidae' 'Sturnidae' 'Regulidae' 'Cinclidae' 'Troglodytidae'
 'Cuculidae' 'Turdidae' 'Prunellidae' 'Paridae' 'Threskiornithidae'
 'Cisticolidae' 'Cettiidae' 'Charadriidae' 'Certhiidae' 'Oriolidae'
 'Acrocephalidae' 'Phalacrocoracidae' 'Apodidae' 'Falconidae'

Latitude:  [41.42844  38.18585  36.7155   ... 42.364483 40.440502 41.97548 ]
Longitude:  [-0.502882 -0.780973 -5.241248 ... -7.433391 -3.704359 -6.14086 ]
Regions:  ['Aragón' 'Valenciana Comunidad' 'Andalucía' 'Castilla-La Mancha'
 'Cataluña' 'Extremadura' 'Illes Baleares' 'País Vasco' 'Castilla y León'
 'Cantabria' 'Asturias Principado de' 'Madrid Comunidad de'
 'Navarra Comunidad Foral de' 'Galicia' 'Ceuta' 'Murcia Región de'
 'Canarias' 'La Rioja' 'Melilla']
Observations:  [  20   30    1 ... 6195 3045 3665]
Family:  ['Ardeidae' 'Phoenicopteridae' 'Fringillidae' 'Columbidae' 'Aegithalidae'
 'Hirundinidae' 'Laniidae' 'Laridae' 'Podicipedidae' 'Anatidae' 'Corvidae'
 'Scolopacidae' 'Ciconiidae' 'Muscicapidae' 'Remizidae' 'Accipitridae'
 'Motacillidae' 'Sylviidae' 'Passeridae' 'Rallidae' 'Upupidae'
 'Phylloscopidae' 'Sturnidae' 'Regulidae' 'Cinclidae' 'Troglodytidae'
 'Cuculidae' 'Turdidae' 'Prunellidae' 'Paridae' 'Threskiornithidae'
 'Cisticolidae' 'Cettiidae' 'Charadriidae' 'Certhiida

In [15]:
# Dictionaty to change regions names
regions_dict = {'Valenciana Comunidad': 'Valencia',
 'Illes Baleares': 'Islas Baleares',
 'Asturias Principado de': 'Asturias',
 'Madrid Comunidad de': 'Madrid',
 'Navarra Comunidad Foral de': 'Navarra',
 'Murcia Región de': 'Murcia'}

In [16]:
# Replacing regions names
gbif['region'].replace(regions_dict, inplace = True)
gbif.region.unique()

array(['Aragón', 'Valencia', 'Andalucía', 'Castilla-La Mancha',
       'Cataluña', 'Extremadura', 'Islas Baleares', 'País Vasco',
       'Castilla y León', 'Cantabria', 'Asturias', 'Madrid', 'Navarra',
       'Galicia', 'Ceuta', 'Murcia', 'Canarias', 'La Rioja', 'Melilla'],
      dtype=object)

In [17]:
# checking the observations values
obs = np.sort(gbif.observations.unique())
print(len(obs))
obs[2600:]

2646


array([  20000,   21000,   21596,   21760,   22000,   23000,   23011,
         23100,   23600,   23800,   24000,   25000,   26000,   27000,
         27112,   27284,   28000,   30000,   31253,   33000,   35000,
         37000,   40000,   41500,   44000,   45000,   45390,   50000,
         50200,   55000,   56430,   60000,   75000,   80000,  100000,
        150000,  200000,  250000,  295000,  300000,  400000,  434508,
        500000,  800000, 1000000, 2000000])

In [18]:
gbif.to_csv('gbif.csv')

In [19]:
gbif_r = gbif[(gbif['year'] > 2000) & (gbif['year'] <= 2020)]
gbif_r.to_csv('gbif_r.csv')

In [20]:
gbif20 = gbif[gbif['year'] == 2020]
gbif20.to_csv('gbif20.csv')