# Dataset Scraping

## Imports

In [1]:
from dwca.read import DwCAReader
from dwca.darwincore.utils import qualname as qn
import requests
import shutil
import os
import time
import math
import pandas
import json
import zipfile
from tqdm import tqdm
from glob import glob
import multiprocessing as mp


## Globals

In [4]:
timestr = time.strftime("%Y%m%d-%H%M%S")

PERCENT_TO_SCRAPE = 0.00015
NUMBER_TO_SKIP = 40000
DATASET_PATH = "/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/"
DATASET_ARCHIVE = "data.zip"
DATASET_CSV = "data.csv"
OUTPUT_PATH = "/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/scraped-data/" + "drago_testdata" + "/"
DATASET_URL = "https://occurrence-download.gbif.org/occurrence/download/request/0196625-210914110416597.zip"
DATASET_TYPE = ""
NUM_CORES = 50




## Download Dataset
#### Only run this if the dataset needs to be redownladed

In [209]:
if os.path.exists(DATASET_PATH + DATASET_ARCHIVE):
    os.remove(DATASET_PATH + DATASET_ARCHIVE)
ds = requests.get(DATASET_URL, stream=True)
total_size_in_bytes = int(ds.headers.get("content-length", 0))
block_size = 1024  # 1 Kibibyte
progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
with open(DATASET_PATH + DATASET_ARCHIVE, "wb") as f:
    for data in ds.iter_content(block_size):
        progress_bar.update(len(data))
        f.write(data)
progress_bar.close()
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
    print("ERROR, something went wrong")


 10%|▉         | 113M/1.16G [00:20<03:10, 5.52MiB/s]
100%|██████████| 1.16G/1.16G [01:48<00:00, 10.7MiB/s]


## For DWCA files

### Open DWCA File

In [21]:
dwca = DwCAReader(DATASET_PATH + DATASET_ARCHIVE)
DATASET_TYPE = "dwca"


### Test DWCA
Will throw an error if the file is not opened correctly.

In [22]:
dwca.get_corerow_by_position(0)


<dwca.rows.CoreRow at 0x7f9d5b41efa0>

### Save DWCA Rows to Pandas Dataframe

In [23]:
# df = dwca.pd_read('occurrence.txt')
df = dwca.pd_read("occurrence.txt", low_memory=False)


In [None]:
def print_pandas_column_names(df):
    colnames = []
    for col in df.columns:
        colnames.append(col)
    print(colnames)

#### Close the archive to free resources


In [8]:
dwca.close()


In [11]:
df.describe()

Unnamed: 0,id,abstract,accrualMethod,accrualPeriodicity,accrualPolicy,alternative,audience,available,bibliographicCitation,conformsTo,...,acceptedTaxonKey,kingdomKey,phylumKey,classKey,orderKey,familyKey,genusKey,subgenusKey,speciesKey,relativeOrganismQuantity
count,84095.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,84095.0,84095.0,84095.0,84095.0,84095.0,84094.0,84016.0,0.0,81909.0,0.0
mean,2214488000.0,,,,,,,,,,...,4272101.0,6.0,7707728.0,12165.76,378890.0,69456.61,3744964.0,,4197946.0,
std,461177200.0,,,,,,,,,,...,1850369.0,0.0,0.0,293637.7,1607552.0,527730.1,1844436.0,,1809612.0,
min,1989713000.0,,,,,,,,,,...,2382.0,6.0,7707728.0,194.0,392.0,2366.0,2650108.0,,2650247.0,
25%,1998682000.0,,,,,,,,,,...,3013672.0,6.0,7707728.0,220.0,640.0,3925.0,2960439.0,,3013580.0,
50%,1999077000.0,,,,,,,,,,...,3172326.0,6.0,7707728.0,220.0,691.0,5015.0,3013395.0,,3152047.0,
75%,2012887000.0,,,,,,,,,,...,5354470.0,6.0,7707728.0,220.0,1351.0,6677.0,3172323.0,,5334240.0,
max,3880670000.0,,,,,,,,,,...,11400750.0,6.0,7707728.0,7228684.0,7561687.0,10411380.0,11400750.0,,11379030.0,


In [12]:
df.shape

(84095, 258)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84095 entries, 0 to 84094
Columns: 258 entries, id to geodeticDatum
dtypes: bool(2), float64(161), int64(8), object(87)
memory usage: 164.4+ MB


## For CSV files

### Extract CSV from zip file

In [9]:
# Run this only if you don't have csv file, and need to extract from data.zip that you just downloaded
if os.path.exists(DATASET_PATH + "*.csv"):
    os.remove(DATASET_PATH + "*.csv")
with zipfile.ZipFile(DATASET_PATH + DATASET_ARCHIVE, "r") as zip_ref:
    zip_ref.extractall(DATASET_PATH)
f = glob(os.path.join(DATASET_PATH, "*-*.csv"))[0]
os.rename(f, DATASET_PATH + DATASET_CSV)
print("CSV file extracted")


IndexError: list index out of range

### Save CSV Rows to Pandas Dataframe

In [14]:
df = pandas.read_csv(DATASET_PATH + DATASET_CSV, sep="\t")
DATASET_TYPE = "csv"


  df = pandas.read_csv(DATASET_PATH + DATASET_CSV, sep="\t")


## Print Pandas Column Names

In [15]:
colnames = []
for col in df.columns:
    colnames.append(col)
print(colnames)


['gbifID', 'datasetKey', 'occurrenceID', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'infraspecificEpithet', 'taxonRank', 'scientificName', 'verbatimScientificName', 'verbatimScientificNameAuthorship', 'countryCode', 'locality', 'stateProvince', 'occurrenceStatus', 'individualCount', 'publishingOrgKey', 'decimalLatitude', 'decimalLongitude', 'coordinateUncertaintyInMeters', 'coordinatePrecision', 'elevation', 'elevationAccuracy', 'depth', 'depthAccuracy', 'eventDate', 'day', 'month', 'year', 'taxonKey', 'speciesKey', 'basisOfRecord', 'institutionCode', 'collectionCode', 'catalogNumber', 'recordNumber', 'identifiedBy', 'dateIdentified', 'license', 'rightsHolder', 'recordedBy', 'typeStatus', 'establishmentMeans', 'lastInterpreted', 'mediaType', 'issue']


## Get Images

### Export GBIF Ocurrence IDs

In [16]:
df.shape

(7982741, 50)

In [17]:
data = {}

NUMBER_TO_SKIP = math.floor(df.shape[0] / (df.shape[0] * PERCENT_TO_SCRAPE))
NUMBER_TO_SCRAPE = math.ceil(df.shape[0] / NUMBER_TO_SKIP)
print(str(NUMBER_TO_SCRAPE) + " IDs will be scraped.")
for i in range(1, df.shape[0], NUMBER_TO_SKIP):
    if DATASET_TYPE == "dwca":
        id = df.at[i, "id"]
    elif DATASET_TYPE == "csv":
        id = df.at[i, "gbifID"]
    data[i] = {"id": str(id)}
print("Successfully scraped " + str(len(data)) + " IDs.")


1198 IDs will be scraped.
Successfully scraped 1198 IDs.


### Fetch Image URLs and Specimen Data from GBIF API

In [20]:
rq = requests.get("https://api.gbif.org/v1/occurrence/" + str(data[key]["id"]))

NameError: name 'key' is not defined

In [19]:
print("Data will be fetched for", len(data), "occurrences.")


def scrape_occurrence(key):
    rq = requests.get("https://api.gbif.org/v1/occurrence/" + str(data[key]["id"]))
    return_dict = {}
    return_dict[key] = {}
    return_dict[key]["img_url"] = json.loads(rq.content)["media"][0]["identifier"]
    return_dict[key]["img_type"] = json.loads(rq.content)["media"][0]["format"]
    return_dict[key]["country"] = json.loads(rq.content)["country"]
    return_dict[key]["genus"] = json.loads(rq.content)["genus"]
    return_dict[key]["species"] = json.loads(rq.content)["species"]
    return return_dict

print("Starting multiprocessing...")
pool = mp.Pool(NUM_CORES)
print("Fetching data...")
for item in tqdm(pool.imap(scrape_occurrence, data), total=len(data)):
    data.update(item)
pool.close()
pool.join()

print("\nSuccessfully fetched data for", len(data), "occurrences.")


Data will be fetched for 1198 occurrences.
Starting multiprocessing...


  0%|          | 0/1198 [00:00<?, ?it/s]

Fetching data...





KeyError: 'id'

### Download Images

In [427]:
def download_images(key):
    img = requests.get(data[key]["img_url"], stream=True)
    with open(
        OUTPUT_PATH + str(key) + "." + data[1]["img_type"].split("/", 1)[1], "wb"
    ) as f:
        shutil.copyfileobj(img.raw, f)


if not os.path.exists(OUTPUT_PATH):
    os.makedirs(OUTPUT_PATH)

print("Starting multiprocessing...")
pool = mp.Pool(NUM_CORES)
print("Downloading images...")
for _ in tqdm(pool.imap(download_images, data), total=len(data)):
    pass
pool.close()


Starting multiprocessing...
Downloading images...


100%|██████████| 1198/1198 [00:22<00:00, 54.02it/s]


### Export Geograpy Data

In [428]:
with open(OUTPUT_PATH + "countries.txt", "w") as f:
    for idx in data:
        f.write(data[idx]["country"] + "\n")
print("\nSuccessfully wrote countries to file.")



Successfully wrote countries to file.


### Export Taxon Data

In [429]:
with open(OUTPUT_PATH + "taxon.txt", "w") as f:
    for idx in data:
        f.write(data[idx]["genus"] + " " + data[idx]["species"] + "\n")
print("\nSuccessfully wrote taxon to file.")



Successfully wrote taxon to file.
