# Exercises - Pollinators datasets exploration

Exercises with some pollinators datasets.

## Packages import

In [9]:
import os # operating system functions
import requests # web requests
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as plt # data visualization
import seaborn as sb # data visualization
import graphviz # grahp visualization
from sklearn.model_selection import StratifiedShuffleSplit # dataset subsetting 
from sklearn.preprocessing import StandardScaler  
from sklearn.preprocessing import LabelEncoder # mange categorical data
from sklearn import metrics # results evaluation


We will probably download and save more than one dataset, so let's make a function for it.

In [10]:
def DatasetDownload(dataset_url, dataset_directory_path, dataset_file_name):
    """Download a file over HTTP and save it to disk, printing progress messages.

    The destination path is built by plain string concatenation
    (``dataset_directory_path + dataset_file_name``), so the directory path
    must end with a path separator for the file to land inside the directory.

    Parameters
    ----------
    dataset_url : str
        URL to fetch; redirects are followed.
    dataset_directory_path : str
        Destination directory, created if it does not exist. An empty string
        means "current working directory" and skips directory creation.
    dataset_file_name : str
        Name of the file to write.

    Returns
    -------
    None
        Nothing is written when the response status code is not 200; the
        status code is printed instead.

    Notes
    -----
    Exceptions raised by ``requests.get``, ``os.makedirs`` or ``open`` are
    not caught here.
    """
    print("Download started")
    request_dataset = requests.get(dataset_url, allow_redirects=True)
    print("Download completed")
    if request_dataset.status_code != 200:
        print(f"Request status: {request_dataset.status_code}")
    else:
        print("Writing started")
        # os.makedirs('') raises, so only create the directory when one is given.
        if dataset_directory_path:
            os.makedirs(dataset_directory_path, exist_ok=True)
        # Context manager guarantees the file is flushed and closed,
        # even if the write fails part-way.
        with open(dataset_directory_path + dataset_file_name, 'wb') as dataset_file:
            dataset_file.write(request_dataset.content)
        print("Writing completed")
    print("End")
    return



## Insect Pollinator Initiative -  Natural History Museum Data Portal 




Graham N Stone; Alfried Vogler; Adam Vanbergen; Jacqueline Mackenzie-Dodds (2017). Dataset: Insect Pollinators Archive. Resource: Insect Pollinator Initiative. Natural History Museum Data Portal (data.nhm.ac.uk). https://doi.org/10.5519/0062900


Retrieved: 16:39 19 Mar 2022 (GMT)

### IPI-NHMDP - Data download - (One-shot execution)

Let's use the original website.

The next steps are "one-shot execution": you should execute them only the first time. Once you have done so, you can go directly to the *Starting points* that you'll find along the code.


In [11]:
# Dataset url
NHMDP_PI_dataset_url = 'https://data.nhm.ac.uk/dataset/46e122c6-7acd-44ec-a354-81a412da419a/resource/784d74b6-6b0e-4fd4-b0b5-798ac7b1a11b/download/ipifordataportal.xlsx'

# Desired directory.
# The trailing '/' is required: directory and file name are combined by plain
# string concatenation both when downloading and when reading the file back,
# so without it the file would be written next to the directory under a fused
# name instead of inside it.
NHMDP_PI_dataset_directory = 'Datasets/Pollinators/NHMDP/PollinatorsInitiative/'

# Desired file name
NHMDP_PI_dataset_name = 'PollinatorsInitiative.xlsx'


In [12]:
# Fetch the workbook from the data portal and store it locally
# (one-shot step: only needed the first time).
DatasetDownload(
    dataset_url=NHMDP_PI_dataset_url,
    dataset_directory_path=NHMDP_PI_dataset_directory,
    dataset_file_name=NHMDP_PI_dataset_name,
)


Download started
Download completed
Writing started
Writing completed
End


### IPI-NHMDP - Data import - Starting point

In [13]:
# Load the previously downloaded workbook into a DataFrame.
# The path is built the same way the download step builds it (directory + file name).
IPI_NHMDP_dataset = pd.read_excel(
    NHMDP_PI_dataset_directory + NHMDP_PI_dataset_name,
    engine='openpyxl',
)

### IPI-NHMDP - Exploration

In [14]:
# Summary statistics; only the numeric column ('Specimen No/Barcode') is reported.
# NOTE(review): that column looks like an identifier, so mean/std are probably
# not meaningful here - confirm before drawing conclusions from them.
IPI_NHMDP_dataset.describe()

Unnamed: 0,Specimen No/Barcode
count,11854.0
mean,10066050.0
std,7403.999
min,10052460.0
25%,10059630.0
50%,10068860.0
75%,10071820.0
max,10075980.0


In [15]:
# Preview the first rows to see what each column contains.
IPI_NHMDP_dataset.head()

Unnamed: 0,Project Name,Specimen No Prefix,Specimen No/Barcode,Specimen Code,Country,Province/State/Territory,District/County/Shire,Precise Locality,Coll Date,Method,Collector,Collector 1,Collector 2,Identifier,Determination,SEX,Stage
0,Insect Pollinator Initiative - agriland,NHMUK,10052460,AL_11_01750,United Kingdom,England,West Yorkshire,Harden Moor,2011-06-27,Pan trap,M. McKerchar,M McKerchar,,S P M Roberts,"Lasioglossum cupromicans (Pérez, J., 1903)",Female,
1,Insect Pollinator Initiative - agriland,NHMUK,10052461,AL_11_01751,United Kingdom,England,West Yorkshire,Harden Moor,2011-06-27,Pan trap,M. McKerchar,M McKerchar,,S P M Roberts,"Lasioglossum cupromicans (Pérez, J., 1903)",Female,
2,Insect Pollinator Initiative - agriland,NHMUK,10052462,AL_11_01753,United Kingdom,England,West Yorkshire,Harden Moor,2011-06-27,Pan trap,M. McKerchar,M McKerchar,,S P M Roberts,"Lasioglossum cupromicans (Pérez, J., 1903)",Female,
3,Insect Pollinator Initiative - agriland,NHMUK,10052463,AL_11_01754,United Kingdom,England,West Yorkshire,Harden Moor,2011-06-27,Pan trap,M. McKerchar,M McKerchar,,S P M Roberts,"Lasioglossum cupromicans (Pérez, J., 1903)",Female,
4,Insect Pollinator Initiative - agriland,NHMUK,10052464,AL_11_01755,United Kingdom,England,West Yorkshire,Harden Moor,2011-06-27,Pan trap,M. McKerchar,M McKerchar,,S P M Roberts,"Lasioglossum fratellum (Perez, 1903)",Female,


In [17]:
# List the column labels available in the dataset.
IPI_NHMDP_dataset.columns

Index(['Project Name', 'Specimen No Prefix', 'Specimen No/Barcode',
       'Specimen Code', 'Country', 'Province/State/Territory',
       'District/County/Shire', 'Precise Locality', 'Coll Date', 'Method',
       'Collector', 'Collector 1', 'Collector 2', 'Identifier',
       'Determination', 'SEX', 'Stage'],
      dtype='object')