# Dataset Scraping

## Imports

In [181]:
from dwca.read import DwCAReader
from dwca.darwincore.utils import qualname as qn
import requests
import shutil
import os
import time
import math
import pandas
import mimetypes
import json
import zipfile


## Globals

In [182]:
# Timestamp of this run; it names the per-run output directory below.
timestr = time.strftime("%Y%m%d-%H%M%S")

# --- Dataset location -------------------------------------------------------
# Directory holding the downloaded archive; the trailing slash matters because
# paths are built by plain string concatenation throughout the notebook.
DATASET_PATH = "/projectnb/sparkgrp/ml-herbarium-data/"
DATASET_ARCHIVE = "data.zip"
DATASET_CSV = "data.csv"
DATASET_URL = "https://api.gbif.org/v1/occurrence/download/request/0195391-210914110416597.zip"
# Set to "dwca" or "csv" by whichever loader cell is run.
DATASET_TYPE = ""

# --- Sampling ---------------------------------------------------------------
# Used as a fraction of the rows (not a percentage) when the sampling step is
# computed in the "Export GBIF URLs" cell.
PERCENT_TO_SCRAPE = 2e-05
# Initial value only: the "Export GBIF URLs" cell recomputes this step.
NUMBER_TO_SKIP = 40000

# --- Output -----------------------------------------------------------------
# Every run writes into its own timestamped sub-directory.
OUTPUT_PATH = f"{DATASET_PATH}scraped-data/{timestr}/"


## Download Dataset
#### Only run this if the dataset needs to be redownloaded

In [None]:
# Download the dataset archive from DATASET_URL to DATASET_PATH + DATASET_ARCHIVE.
#
# The body is streamed into a temporary ".part" file and only moved over the
# final path once the whole download succeeded, so a failed or interrupted
# download never destroys a previously downloaded archive.
archive_path = DATASET_PATH + DATASET_ARCHIVE
partial_path = archive_path + ".part"
with requests.get(DATASET_URL, stream=True, timeout=60) as ds:
    # Fail loudly on HTTP errors instead of saving an error page as the archive.
    ds.raise_for_status()
    with open(partial_path, "wb") as f:
        for chunk in ds.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
# os.replace overwrites an existing archive, so no separate os.remove is needed.
os.replace(partial_path, archive_path)


## For DWCA files

### Open DWCA File

In [192]:
# Open the downloaded archive with the Darwin Core Archive reader and record
# which loader produced the data, for the cells further down.
dwca = DwCAReader(f"{DATASET_PATH}{DATASET_ARCHIVE}")
DATASET_TYPE = "dwca"


### Test DWCA
Will throw an error if the file is not opened correctly.

In [6]:
# Smoke test: fetch the core row at position 0. The call is expected to raise
# if the archive was not opened correctly; on success the cell displays the row.
dwca.get_corerow_by_position(0)


<dwca.rows.CoreRow at 0x7f226d896e80>

### Save DWCA Rows to Pandas Dataframe

In [7]:
# Load the archive's occurrence.txt into the DataFrame `df` used by later cells.
# NOTE(review): low_memory=False is presumably forwarded to pandas.read_csv so
# the file is parsed in one pass (avoiding mixed-dtype chunk inference) - confirm.
df = dwca.pd_read("occurrence.txt", low_memory=False)


#### Close the archive to free resources


In [8]:
# Release the archive reader; later cells work from `df` only.
dwca.close()

## For CSV files

### Save CSV Rows to Pandas Dataframe

In [None]:
# Load the occurrence table from a zipped CSV download (alternative to the
# DWCA path above) into `df`.
#
# The archive must contain exactly one *.csv member. That member is copied to
# DATASET_PATH + DATASET_CSV as a regular file; passing that path to
# extractall() would instead create a *directory* named data.csv, which
# pandas cannot read.
archive_path = DATASET_PATH + DATASET_ARCHIVE
csv_path = DATASET_PATH + DATASET_CSV
with zipfile.ZipFile(archive_path, "r") as zip_ref:
    csv_members = [
        name for name in zip_ref.namelist() if name.lower().endswith(".csv")
    ]
    if len(csv_members) != 1:
        raise ValueError(
            "Expected exactly one .csv file in "
            + archive_path
            + ", found: "
            + str(csv_members)
        )
    with zip_ref.open(csv_members[0]) as src, open(csv_path, "wb") as dst:
        shutil.copyfileobj(src, dst)

# NOTE(review): if the download is tab-delimited, pass sep="\t" here - confirm
# against the actual file.
# The result must be bound to `df`; every later cell reads from it.
df = pandas.read_csv(csv_path)
DATASET_TYPE = "csv"

## Print Pandas Column Names

In [120]:
# Gather every column label of df into a plain list and print it.
colnames = list(df.columns)
print(colnames)


['id', 'abstract', 'accessRights', 'accrualMethod', 'accrualPeriodicity', 'accrualPolicy', 'alternative', 'audience', 'available', 'bibliographicCitation', 'conformsTo', 'contributor', 'coverage', 'created', 'creator', 'date', 'dateAccepted', 'dateCopyrighted', 'dateSubmitted', 'description', 'educationLevel', 'extent', 'format', 'hasFormat', 'hasPart', 'hasVersion', 'identifier', 'instructionalMethod', 'isFormatOf', 'isPartOf', 'isReferencedBy', 'isReplacedBy', 'isRequiredBy', 'isVersionOf', 'issued', 'language', 'license', 'mediator', 'medium', 'modified', 'provenance', 'publisher', 'references', 'relation', 'replaces', 'requires', 'rights', 'rightsHolder', 'source', 'spatial', 'subject', 'tableOfContents', 'temporal', 'title', 'type', 'valid', 'institutionID', 'collectionID', 'datasetID', 'institutionCode', 'collectionCode', 'datasetName', 'ownerInstitutionCode', 'basisOfRecord', 'informationWithheld', 'dataGeneralizations', 'dynamicProperties', 'occurrenceID', 'catalogNumber', 'rec

## Get Images

### Export GBIF URLs

In [170]:
# Sample evenly spaced rows of df and record the value of each sampled row's
# 'id' column. `data` maps the sampled row position to a dict holding that id
# (as a string); later cells add image and taxon fields to these dicts.
if DATASET_TYPE not in ("dwca", "csv"):
    # Without this guard an unset DATASET_TYPE silently produced zero IDs.
    raise ValueError(
        "DATASET_TYPE is " + repr(DATASET_TYPE)
        + "; run one of the dataset loading cells first."
    )
row_count = df.shape[0]
if row_count == 0:
    raise ValueError("The dataset is empty; there is nothing to scrape.")
if PERCENT_TO_SCRAPE <= 0:
    raise ValueError("PERCENT_TO_SCRAPE must be greater than 0.")

# Step between sampled rows. The expression is kept as originally written so
# the same rows are selected; max(1, ...) keeps range() valid when the
# fraction is larger than 1.
NUMBER_TO_SKIP = max(1, math.floor(row_count / (row_count * PERCENT_TO_SCRAPE)))
# Sampling starts at position 1, so row 0 is never sampled.
# NOTE(review): confirm that skipping row 0 is intended.
sample_positions = range(1, row_count, NUMBER_TO_SKIP)
# Count the positions that will really be visited, so the announced number
# always matches the number of scraped IDs.
NUMBER_TO_SCRAPE = len(sample_positions)
print(str(NUMBER_TO_SCRAPE) + " IDs will be scraped.")

# Positional access (.iloc) matches the positional range above regardless of
# what the DataFrame index looks like.
id_column = df['id']
data = {
    position: {'id': str(id_column.iloc[position])}
    for position in sample_positions
}
print('Successfully scraped ' + str(len(data)) + ' IDs.')

160 IDs will be scraped.
Successfully scraped 160 IDs.


### Fetch Image URLs and Specimen Data from GBIF API

In [179]:
# For every sampled occurrence, query the GBIF occurrence API and store the
# first media item's URL and format plus country, genus and species in `data`.
#
# Occurrences whose record lacks any of those fields are removed from `data`
# and reported, so every entry that remains carries all the keys the download
# and export cells read.
total = len(data)
print('Data will be fetched for', total, 'occurrences.')
skipped = []
# Iterate over a snapshot of the keys because incomplete entries are deleted.
for count, idx in enumerate(list(data), start=1):
    print("\rProgress: " + str(count) + '/' + str(total), end="")
    rq = requests.get(
        'https://api.gbif.org/v1/occurrence/' + str(data[idx]['id']),
        timeout=60,
    )
    # Fail loudly on HTTP errors instead of parsing an error body.
    rq.raise_for_status()
    # Parse the response once instead of once per extracted field.
    occurrence = rq.json()
    media = occurrence.get('media') or []
    try:
        fields = {
            'img_url': media[0]['identifier'],
            'img_type': media[0]['format'],
            'country': occurrence['country'],
            'genus': occurrence['genus'],
            'species': occurrence['species'],
        }
    except (IndexError, KeyError):
        skipped.append(idx)
        del data[idx]
        continue
    data[idx].update(fields)
if skipped:
    print('\nSkipped', len(skipped), 'occurrences with missing media or taxon fields:', skipped)
print('\nSuccessfully fetched data for', len(data), 'occurrences.')

Data will be fetched for 160 occurrences.
Progress: 160/160Successfully fetched data for 160 occurrences.


### Download Images

In [188]:
# Download the image of every entry in `data` into OUTPUT_PATH, naming each
# file after the entry's key plus an extension derived from its MIME type.
os.makedirs(OUTPUT_PATH, exist_ok=True)
total = len(data)
for count, idx in enumerate(data, start=1):
    # guess_extension returns None for unknown MIME types; fall back to no
    # extension rather than failing on the string concatenation.
    extension = mimetypes.guess_extension(data[idx]['img_type']) or ""
    with requests.get(data[idx]['img_url'], stream=True, timeout=60) as img:
        # Check the status before creating the file so an HTTP error page is
        # never saved as an image.
        img.raise_for_status()
        with open(OUTPUT_PATH + str(idx) + extension, 'wb') as f:
            for chunk in img.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
    print("\rProgress: " + str(count) + '/' + str(total), end="")


Progress: 160/160

### Export Geography Data

In [189]:
# Write one country per line, in the iteration order of `data`.
# The encoding is fixed to UTF-8 so country names with non-ASCII characters
# are written the same way regardless of the machine's locale.
with open(OUTPUT_PATH + 'countries.txt', 'w', encoding='utf-8') as f:
    f.writelines(data[idx]['country'] + '\n' for idx in data)
print('\nSuccessfully wrote countries to file.')



Successfully wrote countries to file.


### Export Taxon Data

In [190]:
# Write one "<genus> <species>" line per entry, in the iteration order of `data`.
# The encoding is fixed to UTF-8 so names with non-ASCII characters are
# written the same way regardless of the machine's locale.
with open(OUTPUT_PATH + 'taxon.txt', 'w', encoding='utf-8') as f:
    f.writelines(
        data[idx]['genus'] + ' ' + data[idx]['species'] + '\n' for idx in data
    )
print('\nSuccessfully wrote taxon to file.')


Successfully wrote taxon to file.
