# Global Register of Introduced and Invasive Species - Australia (GRIIS)
Import GRIIS list from [GBIF IPT](https://cloud.gbif.org/griis/resource?r=griis-australia)

In [4]:
#import essential libraries
import pandas as pd
import requests
import os
import zipfile
from pathlib import Path

Download the Darwin Core Archive and read the species list into a dataframe

In [14]:
# Versioned Darwin Core Archive for the GRIIS Australia resource on the GBIF IPT.
url = "https://cloud.gbif.org/griis/archive.do?r=griis-australia&v=1.9"
# NOTE(review): absolute, machine-specific path; adjust `basedir` to the local
# checkout of the authoritative-lists repository before running.
basedir = "/Users/oco115/PycharmProjects/authoritative-lists/"
sourcedir = basedir+"source-data/griis/"    # where the downloaded archive is kept
targetdir = basedir+"current-lists/griis/"  # where the cleaned list is written
print("Downloading dwca")
# A timeout stops the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=60)
# Fail here on an HTTP error instead of saving an error page as the "archive"
# and only finding out when it cannot be unzipped.
r.raise_for_status()

Downloading dwca


Retrieve the filename from the headers. Download and unzip the file

In [15]:
# Save the downloaded archive under its server-supplied name and unpack it.
# `r`, `sourcedir` (and `basedir`/`targetdir`) come from the download cell, so
# they are not redefined here.

# The archive filename is the value after "=" in the Content-Disposition header,
# with any surrounding double quotes removed.
dwca = sourcedir + r.headers['content-disposition'].split("=")[1].replace('"','')

# Make sure the download directory exists before writing into it.
os.makedirs(sourcedir, exist_ok=True)

# Drop any archive left over from a previous run.
if Path(dwca).exists():
    os.remove(dwca)

# Extract next to the archive, in a directory named after it. Only strip the
# suffix when it really is ".zip"; otherwise use a distinct name so the
# directory can never collide with the archive file itself.
if dwca.lower().endswith(".zip"):
    unzipdirname = dwca[:-len(".zip")]
else:
    unzipdirname = dwca + "-unzipped"

# A Path object is always truthy, so `if not Path(unzipdirname)` never created
# the directory; create it unconditionally (no error if it already exists).
os.makedirs(unzipdirname, exist_ok=True)

# download
with open(dwca, "wb") as output_file:
    output_file.write(r.content)

# unzip
with zipfile.ZipFile(dwca, 'r') as z:
    z.extractall(unzipdirname)

Explore the list, looking especially for encoding issues.

In [16]:
# Load the tab-separated taxon table from the unpacked archive and display it.
taxon_path = f"{unzipdirname}{os.sep}taxon.txt"
taxondf = pd.read_csv(
    taxon_path,
    sep="\t",
    lineterminator="\n",
)
taxondf

Unnamed: 0,id,taxonID,scientificName,acceptedNameUsage,kingdom,phylum,class,order,family,taxonRank,taxonomicStatus
0,151000,151000,Abelia ×grandiflora (Rovelli ex André) Rehder,,Plantae,Tracheophyta,Magnoliopsida,Dipsacales,Caprifoliaceae,SPECIES,ACCEPTED
1,151001,151001,Abelmoschus manihot (L.) Medik.,,Plantae,Tracheophyta,Magnoliopsida,Malvales,Malvaceae,SPECIES,ACCEPTED
2,151002,151002,Abutilon grandiflorum G.Don,,Plantae,Tracheophyta,Magnoliopsida,Malvales,Malvaceae,SPECIES,ACCEPTED
3,151003,151003,Abutilon pictum (Gillies ex Hook.) Walp.,Callianthe picta (Gillies ex Hook. & Arn.) Don...,Plantae,Tracheophyta,Magnoliopsida,Malvales,Malvaceae,SPECIES,SYNONYM
4,151004,151004,Abutilon theophrasti Medik.,,Plantae,Tracheophyta,Magnoliopsida,Malvales,Malvaceae,SPECIES,ACCEPTED
...,...,...,...,...,...,...,...,...,...,...,...
2947,153979,153979,Ziziphus jujuba Mill.,,Plantae,Tracheophyta,Magnoliopsida,Rosales,Rhamnaceae,SPECIES,ACCEPTED
2948,153980,153980,Ziziphus mauritiana Lam.,,Plantae,Tracheophyta,Magnoliopsida,Rosales,Rhamnaceae,SPECIES,ACCEPTED
2949,153981,153981,Ziziphus mucronata Willd.,,Plantae,Tracheophyta,Magnoliopsida,Rosales,Rhamnaceae,SPECIES,ACCEPTED
2950,153982,153982,Ziziphus spina-christi (L.) Desf.,,Plantae,Tracheophyta,Magnoliopsida,Rosales,Rhamnaceae,SPECIES,ACCEPTED


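Problem records found by eyeballing. The file declares UTF-8 encoding, but some names contain mojibake (e.g. `Ã©` where `é` is expected), which suggests the text was decoded with the wrong encoding somewhere upstream before being saved.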

In [17]:
# taxonIDs of the records whose names looked garbled on inspection.
problem_ids = [151000, 151155, 151156, 151617, 151634, 152066, 153106]
is_problem = taxondf['taxonID'].isin(problem_ids)
# Show just the identifier and the name for those rows.
taxondf.loc[is_problem, ['taxonID', 'scientificName']]

Unnamed: 0,taxonID,scientificName
0,151000,Abelia ×grandiflora (Rovelli ex André) Rehder
154,151155,"Amphilophus citrinellus (Günther, 1864)"
155,151156,"Amphilophus labiatus (Günther, 1864)"
612,151617,"Cichlasoma trimaculatum (Günther, 1867)"
629,151634,Cladophora prolifera (Roth) Kütz.
1059,152066,Fallopia convolvulus (L.) Á.Löve
2080,153106,Pinus durangensis Martínez


These values should be:

151000 Abelia x grandiflora (Rovelli ex André) Rehder
151155 Amphilophus citrinellus (Günther, 1864)
151156 Amphilophus labiatus (Günther, 1864)
153106 Pinus durangensis Martínez

There are many more, though, as a search for a single mojibake character (`Ã`) shows:

In [18]:
# Rows whose scientificName contains the tell-tale mojibake character.
has_mojibake = taxondf['scientificName'].str.contains('Ã')
taxondf.loc[has_mojibake]

Unnamed: 0,id,taxonID,scientificName,acceptedNameUsage,kingdom,phylum,class,order,family,taxonRank,taxonomicStatus
855,151862,151862,Dianella ensifolia (L.) RedoutÃ©,,Plantae,Tracheophyta,Liliopsida,Asparagales,Asphodelaceae,SPECIES,ACCEPTED
891,151898,151898,"Dipolydora flava (ClaparÃ¨de, 1870)",,Animalia,Annelida,Polychaeta,Spionida,Spionidae,SPECIES,ACCEPTED
1636,152656,152656,Manihot carthaginensis MÃ¼ll.Arg.,,Plantae,Tracheophyta,Magnoliopsida,Malpighiales,Euphorbiaceae,SPECIES,ACCEPTED
2253,153281,153281,Quercus palustris MÃ¼nchh.,,Plantae,Tracheophyta,Magnoliopsida,Fagales,Fagaceae,SPECIES,ACCEPTED
2444,153472,153472,Schoenoplectus californicus (C.A.Mey.) SojÃ¡k,,Plantae,Tracheophyta,Liliopsida,Poales,Cyperaceae,SPECIES,ACCEPTED


The ftfy ("fixes text for you") library works magic on these:

In [19]:
# Third-party dependency used to repair the mis-encoded names.
import ftfy
from ftfy import fix_encoding  # short name, used when fixing the dataframe columns

# One of the garbled names, used to try the fix on a single value.
sample = "Abelia Ã—grandiflora (Rovelli ex AndrÃ©) Rehder"
ftfy.fix_encoding(sample)

'Abelia ×grandiflora (Rovelli ex André) Rehder'

Apply to the `scientificName` and `acceptedNameUsage` fields

In [22]:
# Repair mojibake in both name columns. acceptedNameUsage is empty for most
# records and pandas reads those cells as NaN, so na_action="ignore" passes the
# missing values through untouched instead of handing them to fix_encoding
# (which is meant for text).
for name_column in ('scientificName', 'acceptedNameUsage'):
    taxondf[name_column] = taxondf[name_column].map(fix_encoding, na_action="ignore")

# Display the identifier alongside the two repaired columns.
taxondf[['taxonID','scientificName','acceptedNameUsage']]

Unnamed: 0,taxonID,scientificName,acceptedNameUsage
0,151000,Abelia ×grandiflora (Rovelli ex André) Rehder,
1,151001,Abelmoschus manihot (L.) Medik.,
2,151002,Abutilon grandiflorum G.Don,
3,151003,Abutilon pictum (Gillies ex Hook.) Walp.,Callianthe picta (Gillies ex Hook. & Arn.) Don...
4,151004,Abutilon theophrasti Medik.,
...,...,...,...
2947,153979,Ziziphus jujuba Mill.,
2948,153980,Ziziphus mauritiana Lam.,
2949,153981,Ziziphus mucronata Willd.,
2950,153982,Ziziphus spina-christi (L.) Desf.,


Write the cleaned list to the target directory. Removing the original zip download is optional; that line is left commented out below.

In [24]:
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(targetdir, exist_ok=True)
# Write the cleaned list without the dataframe index, explicitly as UTF-8 so the
# repaired characters are stored in the encoding the list is meant to use.
taxondf.to_csv(targetdir+"griis-1.9.csv",index=False,encoding="utf-8")
# Uncomment to delete the downloaded archive once the list has been written.
# os.remove(dwca)