# Global Register of Introduced and Invasive Species - Australia (GRIIS)
Import GRIIS list from [GBIF IPT](https://cloud.gbif.org/griis/resource?r=griis-australia)

In [139]:
#import essential libraries
import pandas as pd
import requests
import os
import zipfile
from pathlib import Path

Download the Darwin Core Archive and read the species list into a dataframe

In [140]:
# GRIIS Australia Darwin Core Archive on the GBIF IPT.
# The v=1.6 query parameter presumably selects the published version — confirm
# against the IPT resource page before bumping it.
url = "https://cloud.gbif.org/griis/archive.do?r=griis-australia&v=1.6"
# NOTE(review): machine-specific absolute path; point basedir at your own checkout.
basedir = "/Users/new330/IdeaProjects/authoritative-lists/"
sourcedir = basedir+"source-data/griis/"
targetdir = basedir+"current-lists/griis/"
print("Downloading dwca")
# A timeout stops the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=60)
# Fail here on an HTTP error instead of carrying an error page forward as if it
# were the archive.
r.raise_for_status()

Downloading dwca


Retrieve the filename from the headers. Download and unzip the file

In [144]:
# Save the downloaded archive under sourcedir and extract it into a sibling
# folder named after the zip file.
# These path settings repeat the values assigned in the download cell.
basedir = "/Users/new330/IdeaProjects/authoritative-lists/"
sourcedir = basedir+"source-data/griis/"
targetdir = basedir+"current-lists/griis/"
# The filename is the text after the first "=" in the Content-Disposition
# header, with any double quotes stripped.
dwca = sourcedir + r.headers['content-disposition'].split("=")[1].replace('"','')
# Make sure the download folder exists before anything is written into it.
os.makedirs(sourcedir, exist_ok=True)
if Path(dwca).exists():
    os.remove(dwca)
# Extraction folder = archive path minus its last four characters
# (assumes the filename ends in ".zip").
unzipdirname = dwca[:-len(".zip")]
# A Path object is always truthy, so `not Path(...)` can never detect a missing
# folder; create it unconditionally instead. exist_ok keeps the cell re-runnable.
os.makedirs(unzipdirname, exist_ok=True)
# download
with open(dwca, "wb") as output_file:
    output_file.write(r.content)
# unzip
with zipfile.ZipFile(dwca, 'r') as z:
    z.extractall(unzipdirname)

Explore the list, looking especially for encoding issues.

In [146]:
# Load the taxon table from the unzipped archive.
# The file is read as tab-separated text with "\n" line endings.
taxon_path = Path(unzipdirname) / "taxon.txt"
taxondf = pd.read_csv(taxon_path, sep="\t", lineterminator="\n")
taxondf

Unnamed: 0,id,taxonID,scientificName,acceptedNameUsage,kingdom,phylum,class,order,family,taxonRank,taxonomicStatus
0,151000,151000,Abelia Ã—grandiflora (Rovelli ex AndrÃ©) Rehder,,Plantae,Tracheophyta,Magnoliopsida,Dipsacales,Caprifoliaceae,SPECIES,ACCEPTED
1,151001,151001,Abelmoschus manihot (L.) Medik.,,Plantae,Tracheophyta,Magnoliopsida,Malvales,Malvaceae,SPECIES,ACCEPTED
2,151002,151002,Abutilon grandiflorum G.Don,,Plantae,Tracheophyta,Magnoliopsida,Malvales,Malvaceae,SPECIES,ACCEPTED
3,151003,151003,Abutilon pictum (Gillies ex Hook.) Walp.,Callianthe picta (Gillies ex Hook. & Arn.) Don...,Plantae,Tracheophyta,Magnoliopsida,Malvales,Malvaceae,SPECIES,SYNONYM
4,151004,151004,Abutilon theophrasti Medik.,,Plantae,Tracheophyta,Magnoliopsida,Malvales,Malvaceae,SPECIES,ACCEPTED
...,...,...,...,...,...,...,...,...,...,...,...
2979,153979,153979,Ziziphus jujuba Mill.,,Plantae,Tracheophyta,Magnoliopsida,Rosales,Rhamnaceae,SPECIES,ACCEPTED
2980,153980,153980,Ziziphus mauritiana Lam.,,Plantae,Tracheophyta,Magnoliopsida,Rosales,Rhamnaceae,SPECIES,ACCEPTED
2981,153981,153981,Ziziphus mucronata Willd.,,Plantae,Tracheophyta,Magnoliopsida,Rosales,Rhamnaceae,SPECIES,ACCEPTED
2982,153982,153982,Ziziphus spina-christi (L.) Desf.,,Plantae,Tracheophyta,Magnoliopsida,Rosales,Rhamnaceae,SPECIES,ACCEPTED


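Problem records found by eyeballing. The file is declared as UTF-8, but some `scientificName` values are mojibake: they look like UTF-8 text that was decoded with a single-byte encoding (probably Latin-1 or Windows-1252) and then saved as UTF-8 again.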

In [147]:
# taxonIDs of the garbled records spotted by eye; show only the ID and the name.
problem_ids = [151000,151155,151156,151617,151634,152066,153106]
is_problem = taxondf['taxonID'].isin(problem_ids)
taxondf.loc[is_problem, ['taxonID','scientificName']]

Unnamed: 0,taxonID,scientificName
0,151000,Abelia Ã—grandiflora (Rovelli ex AndrÃ©) Rehder
155,151155,"Amphilophus citrinellus (GÃ¼nther, 1864)"
156,151156,"Amphilophus labiatus (GÃ¼nther, 1864)"
617,151617,"Cichlasoma trimaculatum (GÃ¼nther, 1867)"
634,151634,Cladophora prolifera (Roth) KÃ¼tz.
1066,152066,Fallopia convolvulus (L.) Ã.LÃ¶ve
2106,153106,Pinus durangensis MartÃ­nez


These values should be:

151000 Abelia ×grandiflora (Rovelli ex André) Rehder
151155 Amphilophus citrinellus (Günther, 1864)
151156 Amphilophus labiatus (Günther, 1864)
153106 Pinus durangensis Martínez

There are many more, though, as a search for a single mojibake character (`Ã`) shows:

In [148]:
# Every row whose scientificName contains the tell-tale mojibake character.
has_mojibake = taxondf['scientificName'].str.contains('Ã')
taxondf[has_mojibake]

Unnamed: 0,id,taxonID,scientificName,acceptedNameUsage,kingdom,phylum,class,order,family,taxonRank,taxonomicStatus
0,151000,151000,Abelia Ã—grandiflora (Rovelli ex AndrÃ©) Rehder,,Plantae,Tracheophyta,Magnoliopsida,Dipsacales,Caprifoliaceae,SPECIES,ACCEPTED
104,151104,151104,Aloe parvibracteata SchÃ¶nland,Aloe monteiroi Baker,Plantae,Tracheophyta,Liliopsida,Asparagales,Asphodelaceae,SPECIES,SYNONYM
122,151122,151122,Alysicarpus ovalifolius (Schumach.) J.LÃ©onard,,Plantae,Tracheophyta,Magnoliopsida,Fabales,Fabaceae,SPECIES,ACCEPTED
148,151148,151148,"Amniataba percoides (GÃ¼nther, 1864)",,Animalia,Chordata,Actinopterygii,Perciformes,Terapontidae,SPECIES,ACCEPTED
155,151155,151155,"Amphilophus citrinellus (GÃ¼nther, 1864)",,Animalia,Chordata,Actinopterygii,Perciformes,Cichlidae,SPECIES,ACCEPTED
156,151156,151156,"Amphilophus labiatus (GÃ¼nther, 1864)",,Animalia,Chordata,Actinopterygii,Perciformes,Cichlidae,SPECIES,ACCEPTED
195,151195,151195,Antithamnion cruciatum (C.Agardh) NÃ¤geli,,Plantae,Rhodophyta,Florideophyceae,Ceramiales,Ceramiaceae,SPECIES,ACCEPTED
213,151213,151213,"Archocentrus nigrofasciatus (GÃ¼nther, 1867)","Amatitlania nigrofasciata (Günther, 1867)",Animalia,Chordata,Actinopterygii,Perciformes,Cichlidae,SPECIES,SYNONYM
249,151249,151249,"Ascidiella aspersa (MÃ¼ller, 1776)",,Animalia,Chordata,Ascidiacea,Phlebobranchia,Ascidiidae,SPECIES,ACCEPTED
265,151265,151265,"Astatotilapia burtoni (GÃ¼nther, 1894)",,Animalia,Chordata,Actinopterygii,Perciformes,Cichlidae,SPECIES,ACCEPTED


The ftfy ("fixes text for you") library works magic on these.

In [149]:
# Try ftfy on a single known-bad value before touching the whole table.
import ftfy
from ftfy import fix_encoding

# Garbled scientificName of taxonID 151000, copied from the table output.
sample = "Abelia Ã—grandiflora (Rovelli ex AndrÃ©) Rehder"
# The bare last expression displays the repaired string as the cell output.
fix_encoding(sample)

'Abelia ×grandiflora (Rovelli ex André) Rehder'

Apply to the `scientificName` and `acceptedNameUsage` fields

In [150]:
def _fix_if_text(value):
    """Repair mojibake in a string; return any non-string value (e.g. NaN) unchanged."""
    return fix_encoding(value) if isinstance(value, str) else value

# Repair both name columns. acceptedNameUsage is empty (NaN) for many rows, so
# the isinstance guard keeps fix_encoding from being called on a float.
for name_column in ['scientificName', 'acceptedNameUsage']:
    taxondf[name_column] = taxondf[name_column].map(_fix_if_text)
taxondf[['taxonID','scientificName','acceptedNameUsage']]

Unnamed: 0,taxonID,scientificName,acceptedNameUsage
0,151000,Abelia ×grandiflora (Rovelli ex André) Rehder,
1,151001,Abelmoschus manihot (L.) Medik.,
2,151002,Abutilon grandiflorum G.Don,
3,151003,Abutilon pictum (Gillies ex Hook.) Walp.,Callianthe picta (Gillies ex Hook. & Arn.) Don...
4,151004,Abutilon theophrasti Medik.,
...,...,...,...
2979,153979,Ziziphus jujuba Mill.,
2980,153980,Ziziphus mauritiana Lam.,
2981,153981,Ziziphus mucronata Willd.,
2982,153982,Ziziphus spina-christi (L.) Desf.,


Write the file and remove the original zip download.

In [151]:
# Write the repaired list to targetdir, then delete the downloaded zip.
# Create the output folder first so to_csv does not fail on a fresh checkout.
os.makedirs(targetdir, exist_ok=True)
# index=False leaves the DataFrame's row index out of the CSV.
taxondf.to_csv(targetdir+"griis-1.6.csv",index=False)
# Only the zip is deleted; the unzipped folder stays under sourcedir.
os.remove(dwca)