# Dataset Scraping

## Imports

In [None]:
! pip install python-dwca-reader

Collecting python-dwca-reader
  Downloading python_dwca_reader-0.15.0-py3-none-any.whl (37 kB)
Installing collected packages: python-dwca-reader
Successfully installed python-dwca-reader-0.15.0


In [None]:
import os
import shutil

import requests
from dwca.darwincore.utils import qualname as qn
from dwca.read import DwCAReader

## Globals

In [None]:
# Common parent directory of the dataset archive and the output folder.
_DATA_ROOT = "/projectnb/sparkgrp/ml-herbarium-data/"

# Darwin Core Archive (zip) that the occurrence rows are read from.
DATASET_PATH = _DATA_ROOT + "data.zip"
# Directory that receives the downloaded images (note the trailing slash).
OUTPUT_PATH = _DATA_ROOT + "scraped-data/"
# Number of images to download before the scraping loop stops.
NUMBER_TO_SCRAPE = 10

## Mount Google Drive
**Ignore unless running on Google Colab**

In [None]:
# Mount Google Drive only when the notebook is actually running on Google Colab.
# Outside Colab the `google.colab` package is not importable; guarding the import
# lets "Restart & Run All" pass through this cell instead of stopping on ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print('Not running on Google Colab; skipping Google Drive mount.')

if drive is not None:
    drive.mount('/content/drive')

Mounted at /content/drive


## Open DWCA File

In [None]:
# Open the Darwin Core Archive located at DATASET_PATH; the reader is kept in
# `dwca` and released by the `dwca.close()` cell at the end of the notebook.
dwca = DwCAReader(DATASET_PATH)

## Test DWCA

In [None]:
# Sanity check: print the core row at position 0 to confirm the archive is readable
# and to see which term URIs are available as keys in the row data.
print(dwca.get_corerow_by_position(0))

--
Rowtype: http://rs.tdwg.org/dwc/terms/Occurrence
Position: 0
Source: Core file
Row id: 3703067559
Reference extension rows: Yes
Reference source metadata: No
Data: {'http://rs.tdwg.org/dwc/terms/geodeticDatum': 'WGS84', 'http://rs.gbif.org/terms/1.0/gbifID': '3703067559', 'http://purl.org/dc/terms/abstract': '', 'http://purl.org/dc/terms/accessRights': 'https://huh.harvard.edu/access-digital-reproductions-works-public-domain', 'http://purl.org/dc/terms/accrualMethod': '', 'http://purl.org/dc/terms/accrualPeriodicity': '', 'http://purl.org/dc/terms/accrualPolicy': '', 'http://purl.org/dc/terms/alternative': '', 'http://purl.org/dc/terms/audience': '', 'http://purl.org/dc/terms/available': '', 'http://purl.org/dc/terms/bibliographicCitation': '', 'http://purl.org/dc/terms/conformsTo': '', 'http://purl.org/dc/terms/contributor': '', 'http://purl.org/dc/terms/coverage': '', 'http://purl.org/dc/terms/created': '', 'http://purl.org/dc/terms/creator': '', 'http://purl.org/dc/terms/date': '

## Save Rows to Variable

In [None]:
# Keep the archive's core rows in `rows` so later cells can index them by position.
# NOTE(review): this presumably loads every row into memory at once — confirm
# against the python-dwca-reader docs before using on a very large archive.
rows = dwca.rows

## Export Images to the Output Directory

In [None]:
# Download images referenced by the archive rows until NUMBER_TO_SCRAPE images have
# been saved or the rows run out. Each image is saved as "<row index>.jpg" in
# OUTPUT_PATH, and the indices of the rows that produced an image are collected in
# `successIdxs` for the cells below.

# Row data key that holds the image URL (empty string when the row has no image).
REFERENCES_TERM = 'http://purl.org/dc/terms/references'
# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT_SECONDS = 30

# Make sure the destination exists before the first file is moved into it.
os.makedirs(OUTPUT_PATH, exist_ok=True)

successIdxs = []
i = 0
# The second condition stops the loop cleanly when there are fewer usable rows than
# requested, instead of indexing past the end of `rows`.
while len(successIdxs) < NUMBER_TO_SCRAPE and i < len(rows):
  url = rows[i].data.get(REFERENCES_TERM, "")
  if url != "":
    fname = (str(i)+".jpg")
    try:
      # The context manager releases the streamed connection even when the
      # response is not used.
      with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT_SECONDS) as img_data:
        if img_data.status_code == 200:
          # Ask the raw stream to undo any content encoding before copying it.
          img_data.raw.decode_content = True
          with open(fname, 'wb') as f:
            shutil.copyfileobj(img_data.raw, f)
          # Move the finished download from the working directory to OUTPUT_PATH.
          shutil.move(fname, os.path.join(OUTPUT_PATH, fname))
          successIdxs.append(i)
          print('Image sucessfully Downloaded: ',fname)
        else:
          print('Download failed for row: '+str(i)+' (HTTP '+str(img_data.status_code)+')')
    except requests.RequestException as err:
      # Connection errors and timeouts skip this row rather than aborting the scrape.
      print('Download failed for row: '+str(i)+' ('+str(err)+')')
  else:
    print('No image in row: '+str(i))
  i+=1

if len(successIdxs) < NUMBER_TO_SCRAPE:
  print('Only '+str(len(successIdxs))+' of '+str(NUMBER_TO_SCRAPE)+' images could be downloaded.')

Image sucessfully Downloaded:  0.jpg
No image in row: 1
No image in row: 2
Image sucessfully Downloaded:  3.jpg
No image in row: 4
No image in row: 5
Image sucessfully Downloaded:  6.jpg
No image in row: 7
Image sucessfully Downloaded:  8.jpg
No image in row: 9
No image in row: 10
No image in row: 11
Image sucessfully Downloaded:  12.jpg
No image in row: 13
Image sucessfully Downloaded:  14.jpg
Image sucessfully Downloaded:  15.jpg
No image in row: 16
No image in row: 17
No image in row: 18
No image in row: 19
No image in row: 20
Image sucessfully Downloaded:  21.jpg
No image in row: 22
Image sucessfully Downloaded:  23.jpg
Image sucessfully Downloaded:  24.jpg


## Iterate Through Location Data & Save to .txt

In [None]:
# Row data key whose value is split on ";" below.
HIGHER_GEOGRAPHY_TERM = "http://rs.tdwg.org/dwc/terms/higherGeography"

# One entry per downloaded image, in the same order as `successIdxs`.
list_countries = []

for idx in successIdxs:
    geography_parts = rows[idx].data[HIGHER_GEOGRAPHY_TERM].split(";")
    # The second ";"-separated component is used as the country. A row with fewer
    # than two components gets an empty entry instead of raising IndexError, so the
    # list stays aligned with the downloaded images.
    list_countries.append(geography_parts[1] if len(geography_parts) > 1 else "")

print(list_countries)

['China', 'China', 'China', 'China', 'China', 'China', 'China', 'China', 'China', 'China']


In [None]:
# Write the collected countries to listcountries.txt in the working directory, one
# per line. UTF-8 is requested explicitly so that non-ASCII place names are encoded
# the same way regardless of the platform's default encoding.
with open('listcountries.txt', 'w', encoding='utf-8') as filehandle:
    filehandle.writelines('%s\n' % listitem for listitem in list_countries)
# NOTE(review): a disabled step used to move this file to
# $OUTPUT_PATH/_listcountries.txt — confirm whether the file should live there.


## Close the archive to free resources


In [None]:
# Release the resources held by the archive reader opened earlier in the notebook.
# NOTE(review): the recorded output of this cell shows a FileNotFoundError; the
# cause is not visible from this notebook (possibly the cell was run more than
# once) — confirm on a fresh Restart & Run All.
dwca.close()

FileNotFoundError: ignored