In [None]:
import csv

In [None]:
# Source exports to convert. Each path is opened in turn and its rows are
# appended to the single output file named below.
inputfiles = [
    'data/VBA_FAUNA25.csv',
    'data/VBA_FAUNA100.csv',
    'data/VBA_FLORA25.csv',
    'data/VBA_FLORA100.csv'
]

# Destination of the combined occurrence table (opened in write mode, so any
# existing file at this path is overwritten).
outputfilename = 'data/dwc/occurrences.csv'

In [None]:
# Output column name -> term identifier, in output column order.
# Columns are grouped by the namespace their identifier lives in; the final
# group has an empty namespace, so those dataset-specific columns map to
# their own name rather than to a URI.
dwc_terms = {
    term: namespace + term
    for namespace, terms in (
        ("http://purl.org/dc/terms/", ("license",)),
        ("http://rs.tdwg.org/dwc/terms/", ("basisOfRecord",)),
        ("http://purl.org/dc/terms/", ("type",)),
        ("http://rs.tdwg.org/dwc/terms/", (
            # Occurrence
            "catalogNumber",
            "recordedBy",
            "organismQuantity",
            "organismQuantityType",
            "individualCount",
            "occurrenceStatus",
            "establishmentMeans",
            "occurrenceRemarks",
            # Event
            "eventID",
            "eventDate",
            "year",
            "month",
            "samplingProtocol",
            # Location
            "locationID",
            "country",
            "countryCode",
            "stateProvince",
            "locality",
            "minimumElevationInMeters",
            "maximumElevationInMeters",
            "decimalLatitude",
            "decimalLongitude",
            "geodeticDatum",
            "coordinateUncertaintyInMeters",
            # Taxon / identification
            "scientificName",
            "scientificNameAuthorship",
            "nameAccordingTo",
            "vernacularName",
            "identificationVerificationStatus",
        )),
        ("", (
            "site_accuracy_km",
            "FIRE_RESP",
            "VIC_LF",
            "NVIS_GF",
            "ECOLOGY",
            "TAXON_TYPE",
            "TREATY",
            "LOW_COUNT",
            "HIGH_COUNT",
            "COUNT_ACC",
            "dataset_version_date",
        )),
    )
    for term in terms
}

In [None]:
# Optional source columns that are copied verbatim, under the same name,
# whenever the input row has them.
_PASSTHROUGH_COLUMNS = (
    'FIRE_RESP', 'VIC_LF', 'TAXON_TYPE', 'TREATY',
    'LOW_COUNT', 'HIGH_COUNT', 'COUNT_ACC',
)


def _iso_date(yyyymmdd):
    """Reformat a ``YYYYMMDD`` string as ``YYYY-MM-DD`` by fixed-position slicing."""
    return yyyymmdd[0:4] + '-' + yyyymmdd[4:6] + '-' + yyyymmdd[6:8]


def map_to_dwc(row):
    """Translate one source row into a dict keyed by output column name.

    Args:
        row: Mapping of source column name to string value (for example a
            row produced by ``csv.DictReader``). The columns read
            unconditionally below must be present; ``AUTHORITY``,
            ``COVERABUND``, ``ENDDATE``, ``NVIS_GF``, ``EXTRA_INFO`` and the
            pass-through columns are optional.

    Returns:
        A new dict containing ``'id'`` plus the output columns that could be
        derived from ``row``. Optional columns missing from ``row`` are
        simply absent from the result.

    Raises:
        KeyError: If a required source column is missing.
        ValueError: If ``RECORD_ID`` is not an integer, or ``MAX_ACC_KM`` is
            non-blank but not numeric.
    """
    is_specimen = row['SURVEYTYPE'] == 'Specimen'
    record_id = int(row['RECORD_ID'])

    # Only the first word of ORIGIN is kept; blank or whitespace-only values
    # yield an empty string instead of raising IndexError.
    origin_words = row['ORIGIN'].split() if row['ORIGIN'] else []

    # A blank accuracy gives a blank uncertainty rather than a ValueError
    # from float('').
    accuracy_km = row['MAX_ACC_KM']
    has_accuracy = bool(accuracy_km and accuracy_km.strip())

    dwc = {
        'id': record_id,
        'license': 'https://creativecommons.org/licenses/by/4.0/legalcode',
        'basisOfRecord': 'PreservedSpecimen' if is_specimen else 'HumanObservation',
        'type': 'PhysicalObject' if is_specimen else 'Event',

        # Occurrence
        'catalogNumber': record_id,
        'recordedBy': row['COLLECTOR'],
        'individualCount': row['TOTALCOUNT'],
        'occurrenceStatus': 'present',
        'establishmentMeans': origin_words[0] if origin_words else "",

        # Event
        'eventID': row['SURVEY_ID'],
        'eventDate': _iso_date(row['STARTDATE']) if row['STARTDATE'] else "",
        'year': row['START_YEAR'],
        'month': row['START_MTH'],
        'samplingProtocol': "" if is_specimen else row['SURVEYTYPE'],

        # Location
        'locationID': row['SITE_ID'],
        'country': "Australia",
        "countryCode": "AU",
        'stateProvince': "Victoria",
        'locality': row['LOCN_DESC'],
        'minimumElevationInMeters': row['ALTITUDE'],
        'maximumElevationInMeters': row['ALTITUDE'],
        'decimalLatitude': row['LAT_DD94'],
        'decimalLongitude': row['LONG_DD94'],
        'geodeticDatum': "epsg:4283",
        # Source accuracy is in kilometres; the output column is in metres.
        'coordinateUncertaintyInMeters': float(accuracy_km) * 1000 if has_accuracy else "",

        # Identification
        'scientificName': row['SCI_NAME'],
        'vernacularName': row['COMM_NAME'],
        'identificationVerificationStatus': row['RELIABILITY'],

        # Misc.
        'site_accuracy_km': accuracy_km,
        'ECOLOGY': row['ECOLOGY'],
        'dataset_version_date': row['VERS_DATE']
    }

    if 'AUTHORITY' in row:
        authority = row['AUTHORITY']
        # Must be a logical `not`: bitwise `~` on a bool yields -1 or -2,
        # both truthy, which previously left "sensu ..." in the authorship.
        is_sensu = authority.startswith('sensu')
        dwc['scientificNameAuthorship'] = "" if is_sensu else authority
        # [6:] drops the leading "sensu " (five letters plus the separator).
        dwc['nameAccordingTo'] = authority[6:] if is_sensu else ""

    if 'COVERABUND' in row:
        cover = row['COVERABUND']
        if 'Dom' in cover:
            quantity, quantity_type = cover.split()[0], "dominScale"
        elif cover and cover != 'Withheld':
            quantity, quantity_type = cover, "braunBlanquetScale"
        else:
            # Blank or withheld: emit neither a quantity nor a quantity type,
            # so the type never appears without a value.
            quantity, quantity_type = "", ""
        dwc['organismQuantity'] = quantity
        dwc['organismQuantityType'] = quantity_type

    if 'ENDDATE' in row and row['ENDDATE']:
        # Start/end interval. If STARTDATE was blank the value begins with "/".
        dwc['eventDate'] = dwc['eventDate'] + "/" + _iso_date(row['ENDDATE'])

    # The output column list defines NVIS_GF, so the value is always stored
    # under that key; NVIS_LF is accepted as an alternative source column name.
    for source_column in ('NVIS_GF', 'NVIS_LF'):
        if source_column in row:
            dwc['NVIS_GF'] = row[source_column]
            break

    for column in _PASSTHROUGH_COLUMNS:
        if column in row:
            dwc[column] = row[column]

    if 'EXTRA_INFO' in row:
        dwc['occurrenceRemarks'] = row['EXTRA_INFO']

    return dwc

In [None]:
# Open the combined output file and write its header row. The handle is left
# open on purpose: later cells keep writing through `writer` and then close it.
# newline='' leaves line-ending handling to the csv module.
outputfile = open(outputfilename, 'w', newline='')

# Column order: the record 'id' first, then every dwc_terms key in definition order.
fieldnames = ['id', *dwc_terms]

writer = csv.DictWriter(outputfile, fieldnames=fieldnames)
writer.writeheader()

In [None]:
# Convert each input file in turn: read it as tab-delimited text, map every
# row with map_to_dwc and append the result to the shared output writer.
# The map is lazy, so rows are still converted and written one at a time.
for f in inputfiles:
    with open(f, newline='') as inputfile:
        reader = csv.DictReader(inputfile, delimiter='\t')
        writer.writerows(map(map_to_dwc, reader))
        

In [None]:
# Close the output handle so buffered rows are flushed to disk before the
# file is read back.
outputfile.close()

In [None]:
# Sanity check target: the file that was just written.
filename = outputfilename

def _make_gen(reader):
    """Yield successive chunks from ``reader`` until it returns a falsy value.

    Args:
        reader: Callable taking a byte count and returning up to that many
            bytes (for example a file object's ``read`` method).

    Yields:
        Each non-empty chunk, requested 1 MiB at a time.
    """
    chunk_size = 1024 * 1024
    while True:
        chunk = reader(chunk_size)
        if not chunk:
            return
        yield chunk


def rawgencount(filename):
    """Count the newline bytes (``b'\\n'``) in a file.

    The file is read in binary mode through its unbuffered raw stream, one
    chunk at a time, so it is never loaded into memory as a whole.

    Args:
        filename: Path of the file to scan.

    Returns:
        The number of ``b'\\n'`` bytes in the file. A final line without a
        trailing newline is therefore not counted.
    """
    # `with` guarantees the handle is closed; the generator is fully consumed
    # by sum() before the block exits.
    with open(filename, 'rb') as f:
        return sum(buf.count(b'\n') for buf in _make_gen(f.raw.read))

# Report the number of newline bytes in the output file (header line included).
print(rawgencount(filename))