# iNat Statuses 

Prereq:
1. Run collate-status-taxa.ipynb which generates `inat-aust-status-taxa.csv` (Australian statuses joined to taxa names)

This notebook:
1. Set up and nominate the state
2. Download sensitive and conservation lists 
    a. merge them into a single statelist
    b. match to the inat taxonomy (left join)
3. Filter `inat-aust-status-taxa.csv` for this state
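4. Merge the state and inat lists with an outer join on scientific name (`state_name` = `inatstatus_scientificName`). Apply these rules to set up the action:
        UPDATE: inat status and state status both exist (relabelled NO CHANGE when status, geoprivacy and IUCN equivalent already agree)
        REMOVE (written as NOT ON STATE LIST): inat status exists, state status does not
        ADD: state status exists, inat status does not (on a matching taxon)
        NO MATCH (written as NO INAT TAXA MATCH): state status exists, inat taxa not found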


## 1. Set up and nominate the state

In [8]:
import pandas as pd
import sys
import os
# Project root: the parent of the current working directory, kept with a
# trailing "/" so later paths can be built by plain string concatenation.
projectdir = os.path.dirname(os.getcwd()) + "/" # parent dir of cwd
# Directory the notebook reads its CSV inputs from; the downloaded state lists
# are also saved beneath it.
sourcedir = projectdir + "data/in/"
# Make the notebook helper directory importable before `list_functions` is
# imported (presumably that module lives there -- confirm).
sys.path.append(os.path.abspath(projectdir + "notebooks/includes/"))
import list_functions  as lf

state = "wa"

# Per-state settings, keyed by lower-case state code.
#   stateregex            regex matched against iNat authority and place names
#   stateurlregex         regex matched against iNat status URLs
#   sensitivelistuid      ALA species-list uid of the state's sensitive list
#   conservationlistuid   ALA species-list uid of the state's conservation list
#   conservationauthority authority name for conservation statuses
#   sensitiveauthority    authority name for sensitive-species statuses
# NOTE(review): the two *authority values are not read by the code visible in
# this notebook; they are kept for any later cells that rely on them.
_STATE_SETTINGS = {
    "qld": {
        "stateregex": "Qld|QLD|Queensland|QUEENSLAND|QL",
        "stateurlregex": ".qld.",
        "sensitivelistuid": "dr493",
        "conservationlistuid": "dr652",
        "conservationauthority": "Queensland Nature Conservation Act 1992",
        "sensitiveauthority": "Qld Department of Environment and Science",
    },
    "nsw": {
        "stateregex": "NSW|New South Wales",
        "stateurlregex": ".nsw.",
        "sensitivelistuid": "dr487",
        "conservationlistuid": "dr650",
        "conservationauthority": "Biodiversity Conservation Act 2016",
        "sensitiveauthority": "New South Wales Office of Environment and Heritage",
    },
    "act": {
        "stateregex": "ACT Government|Australian Capital Territory| ACT, AU",
        "stateurlregex": ".act.gov",
        "sensitivelistuid": "dr2627",
        "conservationlistuid": "dr649",
        "conservationauthority": "Nature Conservation Act 2014 (ACT)",
        "sensitiveauthority": "ACT Government",
    },
    "nt": {
        "stateregex": "Northern Territory|NT NRETAS",
        "stateurlregex": " ",
        "sensitivelistuid": "dr492",
        "conservationlistuid": "dr651",
        "conservationauthority": "Territory Parks and Wildlife Conservation Act",
        "sensitiveauthority": "Northern Territory Department of Environment and Natural Resources",
    },
    "sa": {
        "stateregex": " SA |South Australia|SOUTH AUSTRALIA",
        "stateurlregex": "sa.gov.au",
        "sensitivelistuid": "dr884",
        "conservationlistuid": "dr653",
        "conservationauthority": "",
        "sensitiveauthority": "",
    },
    "tas": {
        "stateregex": "Tasmania|TAS",
        "stateurlregex": ".tas.gov",
        "sensitivelistuid": "dr491",
        "conservationlistuid": "dr654",
        "conservationauthority": "",
        "sensitiveauthority": "",
    },
    "vic": {
        "stateregex": " VIC |Victoria|VICTORIA|Vic",
        "stateurlregex": "vic.gov.au",
        "sensitivelistuid": "dr490",
        "conservationlistuid": "dr655",
        "conservationauthority": "Victorian Department of Energy, Environment and Climate Action",
        "sensitiveauthority": "Victorian Biodiversity Atlas",
    },
    "wa": {
        "stateregex": " WA |WEST AUST|West Aust|WESTERN AUSTRALIA|Western Australia",
        "stateurlregex": ".wa.gov.au",
        "sensitivelistuid": "dr467",
        "conservationlistuid": "dr2201",
        "conservationauthority": "WA Department of Biodiversity, Conservation and Attractions",
        "sensitiveauthority": "WA Department of Biodiversity, Conservation and Attractions",
    },
}

# Fail loudly on an unknown code: in a notebook the settings from a previous
# run would otherwise stay defined and be used silently for the wrong state.
if state not in _STATE_SETTINGS:
    raise ValueError(
        "Unknown state {!r}; expected one of: {}".format(state, ", ".join(sorted(_STATE_SETTINGS)))
    )

_settings = _STATE_SETTINGS[state]
stateregex = _settings["stateregex"]
stateurlregex = _settings["stateurlregex"]
sensitivelistuid = _settings["sensitivelistuid"]
conservationlistuid = _settings["conservationlistuid"]
conservationauthority = _settings["conservationauthority"]
sensitiveauthority = _settings["sensitiveauthority"]

# functions
def filter_inat_statuses(stateregex: str, urlregex: str):
    """Return the iNaturalist conservation statuses that belong to one state.

    Reads ``inat-aust-status-taxa.csv`` from ``sourcedir`` with every column as
    a string and blanks kept as empty strings. A row is kept when any of the
    following holds:

    * ``place_display_name``, ``place_name`` or ``authority`` matches
      ``stateregex``
    * ``url`` matches ``urlregex``

    Both patterns are regular expressions searched anywhere in the value, so an
    unescaped ``.`` matches any character.

    Args:
        stateregex: Pattern identifying the state in place and authority names.
        urlregex: Pattern identifying the state in status URLs.

    Returns:
        DataFrame of the matching rows with exact duplicate rows removed and an
        added ``iucn_equiv`` column (the label for the numeric ``iucn`` code, or
        ``'na'`` when the code is not in the map). Rows are sorted by
        ``taxon_id`` then ``user_id`` (string order, since all columns are
        strings) and every column name is prefixed with ``inatstatus_``.
    """
    taxastatus = pd.read_csv(sourcedir + "inat-aust-status-taxa.csv", encoding='UTF-8', na_filter=False, dtype=str)

    # One row mask per criterion. The matches are stacked in this order and then
    # de-duplicated, so a row matching several criteria is kept once.
    criteria = [
        taxastatus['place_display_name'].str.contains(stateregex),
        taxastatus['place_name'].str.contains(stateregex),
        taxastatus['url'].str.contains(urlregex),
        taxastatus['authority'].str.contains(stateregex),
    ]
    statedf = pd.concat([taxastatus[mask] for mask in criteria]).drop_duplicates()

    # map the iucn statuses
    iucn_map_num = {
        '10': 'Least Concern',
        '20': 'Near Threatened',
        '30': 'Vulnerable',
        '40': 'Endangered',
        '50': 'Critically Endangered',
        '70': 'Extinct',
    }
    statedf['iucn_equiv'] = statedf['iucn'].map(iucn_map_num).fillna('na')
    return statedf.sort_values(['taxon_id', 'user_id']).add_prefix("inatstatus_")

def download_ala_lists(state: str, conservationlistuid: str, sensitivelistuid: str):
    """Download a state's two ALA species lists and save each as a CSV.

    The sensitive list is fetched first, then the conservation list. Each is
    requested from lists.ala.org.au with ``max=10000&includeKVP=true``, passed
    through ``lf.kvp_to_columns`` and written without the index to
    ``<sourcedir>/state-lists/<state>-ala-sensitive.csv`` or
    ``<sourcedir>/state-lists/<state>-ala-conservation.csv``.

    Args:
        state: State code used as the file-name prefix.
        conservationlistuid: ALA list uid of the conservation list.
        sensitivelistuid: ALA list uid of the sensitive list.
    """
    # (list uid, output file suffix), in download order
    downloads = (
        (sensitivelistuid, "-ala-sensitive.csv"),
        (conservationlistuid, "-ala-conservation.csv"),
    )
    for listuid, filesuffix in downloads:
        listurl = "https://lists.ala.org.au/ws/speciesListItems/" + listuid + "?max=10000&includeKVP=true"
        listitems = lf.kvp_to_columns(lf.download_ala_specieslist(listurl))
        listitems.to_csv(sourcedir + "/state-lists/" + state + filesuffix, index=False)


## 2. Download the lists from ALA 

In [9]:
# %%script echo skipping # comment this line to download dataset from lists.ala.org.au the web and save locally
# Fetch this state's sensitive and conservation lists from ALA and save them as
# CSV under <sourcedir>/state-lists/ so they can be read back from disk.
download_ala_lists(state,conservationlistuid,sensitivelistuid)

In [10]:
# A. Read the two saved lists and merge them into a single state list
conservationlist = pd.read_csv(
    sourcedir + "/state-lists/" + state + "-ala-conservation.csv",
    dtype=str,
)
sensitivelist = pd.read_csv(
    sourcedir + "/state-lists/" + state + "-ala-sensitive.csv",
    dtype=str,
)

# Sensitive list: a "100km" or "WITHHOLD" generalisation means private;
# every other sensitive entry is treated as obscured.
sensitivemapping = {"100km": "private", "WITHHOLD": "private"}
sensitivelist['geoprivacy'] = (
    sensitivelist['generalisation']
    .str.strip()
    .map(sensitivemapping)
    .fillna("obscured")
)

# The SA conservation list spells this column with different capitalisation.
if state == "sa":
    conservationlist = conservationlist.rename(
        columns={'IUCN_Equivalent_Status': 'IUCN_equivalent_status'}
    )

# Outer join keeps species that appear on only one of the two lists.
statelist = sensitivelist.merge(conservationlist, how="outer", on="name")
# Defaults for the gaps the join leaves behind:
#   geoprivacy             -> 'open'       (not on the sensitive list)
#   status                 -> 'Sensitive'  (no conservation status)
#   IUCN_equivalent_status -> 'Vulnerable' (no IUCN equivalent supplied)
statelist = statelist.fillna({
    'geoprivacy': 'open',
    'status': 'Sensitive',
    'IUCN_equivalent_status': 'Vulnerable',
})
statecolumns = ['name', 'geoprivacy', 'status', 'IUCN_equivalent_status']
statelist = statelist[statecolumns].add_prefix("state_")

# Display: number of names per status / geoprivacy combination
statelist.groupby(['state_status', 'state_geoprivacy'])['state_name'].size()

state_status                       state_geoprivacy
Conservation Dependent             obscured               7
Critically Endangered              obscured             233
Endangered                         obscured             210
Extinct                            obscured              38
Migratory                          obscured              94
Other Specially Protected          obscured               4
Priority 1: Poorly-known species   obscured            1204
Priority 2: Poorly-known species   obscured             997
Priority 3: Poorly-known species   obscured            1085
                                   open                   1
Priority 4: Rare, Near Threatened  obscured             425
Sensitive                          obscured               1
Vulnerable                         obscured             251
Name: state_name, dtype: int64

In [11]:
# B. Match each state-list name to the active iNaturalist taxon of the same name
inattaxa = (
    pd.read_csv(
        sourcedir + "inaturalist-australia-9/inaturalist-australia-9-taxa.csv",
        dtype=str,
        usecols=['id', 'name', 'rank', 'observations_count', 'is_active'],
    )
    # keep active taxa only ('t' is the export's true flag)
    .loc[lambda taxa: taxa['is_active'] == 't']
    .rename(columns={'id': 'taxon_id', 'name': 'taxon_name'})
    .add_prefix("inattaxa_")
)

# Left join: every state-list row is kept; the inattaxa_* columns stay empty
# where no active taxon has exactly that name.
statelist = statelist.merge(
    inattaxa,
    how="left",
    left_on='state_name',
    right_on='inattaxa_taxon_name',
)
statelist

Unnamed: 0,state_name,state_geoprivacy,state_status,state_IUCN_equivalent_status,inattaxa_taxon_id,inattaxa_taxon_name,inattaxa_rank,inattaxa_observations_count,inattaxa_is_active
0,Acacia anomala,obscured,Vulnerable,Vulnerable,898615,Acacia anomala,species,0,t
1,Acacia aphylla,obscured,Vulnerable,Vulnerable,139888,Acacia aphylla,species,39,t
2,Acacia auratiflora,obscured,Vulnerable,Vulnerable,139905,Acacia auratiflora,species,0,t
3,Acacia brachypoda,obscured,Vulnerable,Vulnerable,139909,Acacia brachypoda,species,0,t
4,Acacia caesariata,obscured,Vulnerable,Vulnerable,1252559,Acacia caesariata,species,0,t
...,...,...,...,...,...,...,...,...,...
4548,Baeckea sp. Crossroads (B.L.Rye & M.E.Trudgen ...,obscured,Priority 2: Poorly-known species,Vulnerable,,,,,
4549,Baeckea sp. Forrestania (K.R.Newbey 1105) WA H...,obscured,Priority 2: Poorly-known species,Vulnerable,,,,,
4550,Pterostylis sp. Paynes Find (G.Brockman GBB 526),obscured,Priority 3: Poorly-known species,Vulnerable,,,,,
4551,Pterostylis sp. scooped sepals (G.Brockman GBB...,obscured,Priority 3: Poorly-known species,Vulnerable,,,,,


## 3. Filter iNat statuses for this state

In [12]:
# Existing iNat statuses whose place name, authority or URL matches this state;
# the returned columns are prefixed with "inatstatus_".
inatstatuses = filter_inat_statuses(stateregex, stateurlregex)
# Display: number of existing statuses per status label
inatstatuses.groupby(['inatstatus_status']).size()

inatstatus_status
Biosecurity Significant                1
CR                                     1
Conservation Dependent                 5
Critically Endangered                161
EN                                     6
EX                                     1
Endangered                           161
Extinct                               25
Migratory                             93
NT                                     3
Not listed                             1
Other Specially Protected              4
P1                                    14
P2                                    30
P3                                    35
P4                                    10
Potentially sensitive locations        1
Priority 1                             2
Priority 1: Poorly-known species     539
Priority 2                             1
Priority 2: Poorly-known species     576
Priority 3: Poorly-known species     712
Priority 4: Rare, Near Threatened    334
Priority Three                         

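## 4. Merge the state and inat lists

Outer join on scientific name (`state_name` = `inatstatus_scientificName`). Apply these rules to set up the action:
        UPDATE: inat status and state status both exist (relabelled NO CHANGE when status, geoprivacy and IUCN equivalent already agree)
        REMOVE (written as NOT ON STATE LIST): inat status exists, state status does not
        ADD: state status exists, inat status does not (on a matching taxon)
        NO MATCH (written as NO INAT TAXA MATCH): state status exists, inat taxa not found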

In [13]:
# Pair state-list rows with existing iNat statuses by scientific name. The outer
# join keeps unmatched rows from both sides so that each can be given an action.
mergedstatuses = statelist.merge(inatstatuses, how="outer", left_on='state_name', right_on='inatstatus_scientificName')

# Action rules:
#   UPDATE:             inat status and state status both exist
#   NOT ON STATE LIST:  inat status exists, state status does not
#   ADD:                state status exists, inat status does not (matching taxon)
#   NO INAT TAXA MATCH: state status exists, inat taxa not found
#   NO CHANGE:          an UPDATE whose status, geoprivacy and IUCN equivalent already agree
hasinatstatus = mergedstatuses['inatstatus_id'].notnull()
onstatelist = mergedstatuses['state_name'].notnull()
hasinattaxon = mergedstatuses['inattaxa_taxon_id'].notnull()

mergedstatuses['action'] = 'na'
mergedstatuses.loc[hasinatstatus & onstatelist, 'action'] = "UPDATE"
mergedstatuses.loc[~hasinatstatus & hasinattaxon, 'action'] = "ADD"
mergedstatuses.loc[hasinatstatus & ~onstatelist, 'action'] = "NOT ON STATE LIST"
mergedstatuses.loc[~hasinatstatus & ~hasinattaxon, 'action'] = "NO INAT TAXA MATCH"

# Prefer the name on the existing iNat status; fall back to the matched taxon name.
mergedstatuses['inat_taxa_name'] = mergedstatuses['inatstatus_scientificName'].fillna(mergedstatuses['inattaxa_taxon_name'])

# only update those with different values
nochange = (
    (mergedstatuses['action'] == "UPDATE")
    & (mergedstatuses['state_status'] == mergedstatuses['inatstatus_status'])
    & (mergedstatuses['state_geoprivacy'] == mergedstatuses['inatstatus_geoprivacy'])
    & (mergedstatuses['state_IUCN_equivalent_status'] == mergedstatuses['inatstatus_iucn_equiv'])
)
mergedstatuses.loc[nochange, 'action'] = "NO CHANGE"


def _inat_link(action, status_id, taxon_id):
    """Return the iNaturalist page a curator should open for one merged row.

    UPDATE and NOT ON STATE LIST link to the edit page of the existing status,
    ADD links to the new-status form of the matched taxon, and any other action
    links to the taxon page. When the link needs a taxon id but the row has
    none, an empty string is returned rather than a URL ending in "nan".
    """
    inaturl = "https://inaturalist.ala.org.au/"
    if action in ("UPDATE", "NOT ON STATE LIST"):
        return inaturl + "conservation_statuses/" + str(status_id) + "/edit"
    if pd.isnull(taxon_id):
        return ""
    if action == "ADD":
        return inaturl + "taxa/" + str(taxon_id) + "/conservation_statuses/new"
    return inaturl + "taxa/" + str(taxon_id)


mergedstatuses['inat_link'] = [
    _inat_link(action, status_id, taxon_id)
    for action, status_id, taxon_id in zip(
        mergedstatuses['action'],
        mergedstatuses['inatstatus_id'],
        mergedstatuses['inattaxa_taxon_id'],
    )
]

# display
mergedstatusesprintfriendly = mergedstatuses[['action','inatstatus_id','inattaxa_taxon_id','state_name','inat_taxa_name', 'state_status','inatstatus_status','state_geoprivacy','inatstatus_geoprivacy','state_IUCN_equivalent_status','inatstatus_iucn_equiv','inatstatus_description','inatstatus_references','inatstatus_place_display_name','inatstatus_current_synonymous_taxon_ids','inat_link']]
# columns not included in the summary: 'state_authority','inat_authority','state_url','inat_url'
mergedstatusesprintfriendly.to_csv(projectdir + "data/out/summaries/" + state + ".csv",index=False)
mergedstatuses.groupby('action').size()

action
ADD                    273
NO CHANGE              278
NO INAT TAXA MATCH    1782
NOT ON STATE LIST      452
UPDATE                2301
dtype: int64