In [1]:
import ast
import json
from collections import Counter, OrderedDict

import geopandas
import numpy as np
import pandas as pd
from geopandas.tools import sjoin

In short, Peromyscus maniculatus was split into 5 names circa 2019, making it a good MSW test case for taxonomic splitting


summarize stats:
    mammals: with, without coords
        Peromyscus with, without coords
            maniculatus with, without coords


and overview of interesting fields & their proportion of presence
lookup the dynamic properties for additional metrics (i.e., measurements)
probabilistic approach wrt morphometrics (if measurements are present)

#### Aggregated data based on GBIF queries made 6/4/21

- total  Mammalia, in North America: 3,479,615
- total  Mammalia, in North America, with coordinates: 2,179,006
- total  Mammalia, in North America, with coordinates, basisOfRecord as preservedSpecimen: 1,769,317


- total  Rodentia, in North America: 1,864,658
- total  Rodentia, in North America, with coordinates: 1,256,561
- total  Rodentia, in North America, with coordinates, basisOfRecord as preservedSpecimen: 1,191,433


- total Peromyscus Gloger, 1841, in North America: 553,867
- total Peromyscus Gloger, 1841, in North America, with coordinates: 401,466
- total Peromyscus Gloger, 1841, in North America, with coordinates, basisOfRecord as preservedSpecimen: 398,102


- total Peromyscus maniculatus (Wagner, 1845), in North America: 242,663
- total Peromyscus maniculatus (Wagner, 1845), in North America, with coordinates: 184,629
- total Peromyscus maniculatus (Wagner, 1845), in North America, with coordinates, basisOfRecord as preservedSpecimen: 184,307

#### It appears GBIF does not recognize the split

- Peromyscus gambelii is not recognized by GBIF: 0
- Peromyscus labecula Elliot, 1903 is recognized by GBIF as = "Peromyscus maniculatus (Wagner, 1845)": 0
- Peromyscus sonoriensis blandus Osgood, 1904 is recognized by GBIF as = "Peromyscus maniculatus (Wagner, 1845)": 0
- Peromyscus sonoriensis fulvus Osgood, 1904 is recognized by GBIF as = "Peromyscus maniculatus (Wagner, 1845)": 0
- Peromyscus arcticus is not recognized by GBIF: 0

#### Assess the frequency of locality data among non-geocoded records
GBIF source citation:

GBIF.org (14 June 2021) GBIF Occurrence Download https://doi.org/10.15468/dl.2k6erc 

In [2]:
# Load the occurrence download used to assess locality data (tab-delimited text).
occ = pd.read_csv(
    "gbif_occurrences/0301840-200613084148143/occurrence.txt",
    sep='\t',
    header=0,
    low_memory=False,
)

# Treat empty strings as missing so the null checks below also catch them.
occ = occ.replace("", np.nan)

n_records = len(occ)
has_locality = occ['locality'].notnull()
# NOTE(review): the printed label below says "neither null ...", but this mask is True
# when EITHER field is populated (logical OR) — confirm which meaning was intended.
has_locality_or_state = has_locality | occ['stateProvince'].notnull()
n_locality_or_state = int(has_locality_or_state.sum())

print("total occ records: ", n_records)
print("pct occ records with non null locality: ", int(has_locality.sum()) / n_records)
print("pct occ records with neither null locality or stateProvince: ", n_locality_or_state / n_records)
print("     n=", n_locality_or_state)

# Records lacking a coordinate (either axis) AND lacking locality or stateProvince.
missing_coords = occ['decimalLatitude'].isnull() | occ['decimalLongitude'].isnull()
missing_place = occ['locality'].isnull() | occ['stateProvince'].isnull()
without_geo = occ[missing_coords & missing_place]
print("pct occ records missing both georeferences and locality", len(without_geo)/n_records)
print("    n=", len(without_geo))

# Free the frame; the next section loads a different download under the same name.
del occ

total occ records:  268219
pct occ records with non null locality:  0.8102371569501042
pct occ records with neither null locality or stateProvince:  0.9965289558159564
     n= 267288
pct occ records missing both georeferences and locality 0.02046834862556344
    n= 5490


#### evaluate frequency of various fields
GBIF source citation: 

GBIF.org (04 June 2021) GBIF Occurrence Download https://doi.org/10.15468/dl.phjg43 

Unless GBIF discovers citations of this download, the data file is eligible for deletion after June 4, 2022.

In [3]:
# Load the Peromyscus maniculatus occurrence download (tab-delimited text).
# low_memory=False makes pandas read each column in one pass and infer a single dtype,
# matching the earlier load cell; the default chunked parser can emit a DtypeWarning on
# mixed-type columns (the stray warning output under this cell looks like that warning).
occ = pd.read_csv("gbif_occurrences/0294669-200613084148143/occurrence.txt", sep='\t',
                  header=0,
                  low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


#### Evaluate data availability

In [4]:
# Per-column count of populated values; empty strings are counted as missing.
n_records = len(occ)
pct_field_name = f'pct_of_records (n={n_records})'

col_summery = occ.replace('', np.nan).count().to_frame(name="total_frequency")
col_summery['col_name'] = col_summery.index
col_summery[pct_field_name] = col_summery['total_frequency'] / n_records

# Most complete fields first, with the column name as the leading field.
column_order = ['col_name', 'total_frequency', pct_field_name]
col_summery = col_summery.sort_values(by=[pct_field_name], ascending=False)[column_order]

# Persist the table (without the index) and show it in the notebook.
col_summery.to_csv("data_availability.csv", index=False)

display(col_summery)

Unnamed: 0,col_name,total_frequency,pct_of_records (n=184306)
gbifID,gbifID,184306,1.0
hasCoordinate,hasCoordinate,184306,1.0
institutionCode,institutionCode,184306,1.0
datasetKey,datasetKey,184306,1.0
taxonomicStatus,taxonomicStatus,184306,1.0
...,...,...,...
earliestAgeOrLowestStage,earliestAgeOrLowestStage,0,0.0
latestEpochOrHighestSeries,latestEpochOrHighestSeries,0,0.0
organismName,organismName,0,0.0
latestPeriodOrHighestSystem,latestPeriodOrHighestSystem,0,0.0


In [5]:
# isolate column names relevant to naming
taxon_relevent_terms = ['taxon', 'name', 'sci', 'citation', 'reference', 'key']

# keep every column whose lower-cased name contains at least one of the terms,
# de-duplicated and sorted alphabetically
taxon_col_names = sorted({
    column
    for column in occ.columns.tolist()
    if any(term in column.lower() for term in taxon_relevent_terms)
})
taxon_col_names

['acceptedNameUsage',
 'acceptedNameUsageID',
 'acceptedScientificName',
 'acceptedTaxonKey',
 'associatedReferences',
 'bibliographicCitation',
 'classKey',
 'datasetKey',
 'datasetName',
 'familyKey',
 'genericName',
 'genusKey',
 'georeferenceProtocol',
 'georeferenceRemarks',
 'georeferenceSources',
 'georeferenceVerificationStatus',
 'georeferencedBy',
 'georeferencedDate',
 'identificationReferences',
 'isReferencedBy',
 'kingdomKey',
 'level0Name',
 'level1Name',
 'level2Name',
 'level3Name',
 'nameAccordingTo',
 'nameAccordingToID',
 'namePublishedIn',
 'namePublishedInID',
 'namePublishedInYear',
 'orderKey',
 'organismName',
 'originalNameUsage',
 'originalNameUsageID',
 'parentNameUsage',
 'parentNameUsageID',
 'phylumKey',
 'references',
 'scientificName',
 'scientificNameID',
 'speciesKey',
 'subgenusKey',
 'taxonConceptID',
 'taxonID',
 'taxonKey',
 'taxonRank',
 'taxonRemarks',
 'taxonomicStatus',
 'typifiedName',
 'verbatimScientificName',
 'verbatimTaxonRank',
 'vernac

In [6]:
# List the distinct values found in the acceptedScientificName column.
occ['acceptedScientificName'].unique()

array(['Peromyscus maniculatus (Wagner, 1845)'], dtype=object)

In [7]:
# List the distinct values found in the scientificName column.
occ['scientificName'].unique()

array(['Peromyscus maniculatus (Wagner, 1845)',
       'Peromyscus maniculatus borealis Mearns, 1911',
       'Peromyscus maniculatus angustus Hall, 1932',
       'Peromyscus maniculatus clementis Mearns, 1896',
       'Peromyscus maniculatus anticostiensis Moulthrop, 1937',
       'Peromyscus maniculatus streatori Nelson & Goldman, 1931',
       'Peromyscus maniculatus anacapae von Bloeker, 1942',
       'Hesperomys gambelii Baird, 1857',
       'Peromyscus maniculatus santacruzae Nelson & Goldman, 1931',
       'Peromyscus maniculatus georgiensis Hall, 1938',
       'Peromyscus maniculatus assimilis Nelson & Goldman, 1931',
       'Peromyscus maniculatus exterus Nelson & Goldman, 1931',
       'Peromyscus maniculatus serratus Davis, 1939',
       'Peromyscus maniculatus hollisteri Osgood, 1909',
       'Peromyscus maniculatus margaritae Osgood, 1909',
       'Peromyscus maniculatus sanctaerosae von Bloeker, 1940',
       'Peromyscus maniculatus magdalenae Osgood, 1909',
       'Perom

In [8]:
# List the distinct values found in the verbatimScientificName column.
occ['verbatimScientificName'].unique()

array(['Peromyscus maniculatus sonoriensis', 'Peromyscus maniculatus',
       'Peromyscus maniculatus gracilis',
       'Peromyscus maniculatus rufinus',
       'Peromyscus maniculatus gambelii',
       'Peromyscus maniculatus abietorum',
       'Peromyscus maniculatus artemisiae',
       'Peromyscus maniculatus nubiterrae',
       'Peromyscus maniculatus blandus',
       'Peromyscus maniculatus nebrascensis (Coues, 1877)',
       'Peromyscus maniculatus saturatus Bangs, 1897',
       'Peromyscus maniculatus maniculatus',
       'Peromyscus maniculatus abietorum Bangs, 1896',
       'Peromyscus maniculatus gambeli', 'Peromyscus maniculatus luteus',
       'Peromyscus maniculatus osgoodi',
       'Peromyscus maniculatus (Wagner, 1845)',
       'Peromyscus maniculatus borealis Mearns, 1911',
       'Peromyscus maniculatus catalinae',
       'Peromyscus maniculatus rubidus',
       'Peromyscus maniculatus coolidgei',
       'Peromyscus maniculatus exiguus',
       'Peromyscus maniculatus 

#### Assess the dynamic properties field for frequency of data

In [9]:
# Collect every non-missing dynamicProperties entry as a plain list for parsing below.
dynamicProps = occ['dynamicProperties'].dropna().tolist()
n_with_props = len(dynamicProps)
n_occ_records = len(occ)
print(f" {n_with_props} of {n_occ_records} records have something in the dynamicProperties field")
print(f" proportion of records with anything in dynamicProperties: {round(n_with_props / n_occ_records, 4)}")

def _parse_structured_properties(text):
    """Return ``text`` parsed as a dict (JSON or Python literal), or ``None`` if it is not one."""
    candidates = [text]
    if '""' in text:
        # CSV-style escaping doubles every quote, e.g. '{""totalLengthInmm"":150}';
        # undoing the doubling is only accepted if the result parses to a dict.
        candidates.append(text.replace('""', '"'))

    for candidate in candidates:
        # json handles null/true/false; literal_eval handles single-quoted Python dicts.
        # Neither executes code, unlike eval().
        for parser in (json.loads, ast.literal_eval):
            try:
                parsed = parser(candidate)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            if isinstance(parsed, dict):
                return parsed
    return None


def _split_delimited_properties(text):
    """Split 'key=value; key: value' style text into a dict of raw key/value strings."""
    parsed = {}
    # distinct key, value pairs are separated by ";"
    for segment in text.split(";"):
        if not segment.strip():
            continue  # a trailing or doubled ";" is not a property
        # split key from value on the first "=", otherwise on the first ":"
        for delimiter in ("=", ":"):
            key, found, val = segment.partition(delimiter)
            if found:
                break
        else:
            # neither delimiter present: keep the whole segment under a catch-all key
            key, val = 'verbatimDynamicProperty', segment
        parsed[key] = val
    return parsed


def dynamicPropertiesFormatter(prop):
    """
    Parse one dynamicProperties string into a dict with unified keys.

    The text is first parsed as a JSON object or Python dict literal (never executed).
    If that fails it is split into pairs on ";" and each pair into key and value on the
    first "=" or, failing that, the first ":". Segments with neither delimiter are stored
    under the key "verbatimDynamicProperty" (lower-cased like every other key).

    Args:
        prop (str): raw dynamicProperties value from one occurrence record.

    Returns:
        dict: lower-cased, whitespace-stripped keys mapped to lower-cased, stripped
        string values. Non-string values are converted with ``str``; ``None``
        (JSON ``null``) becomes an empty string.

    Raises:
        TypeError: if ``prop`` is not a string.
    """
    if not isinstance(prop, str):
        raise TypeError(f"prop must be a str, got {type(prop).__name__}")

    text = prop.strip()
    parsed_properties = _parse_structured_properties(text)
    if parsed_properties is None:
        parsed_properties = _split_delimited_properties(text)

    formatted_results = {}  # container for the formatted results
    for key, val in parsed_properties.items():
        # force keys and values to lower-case strings and strip excess white space
        key = str(key).lower().strip()
        val = "" if val is None else str(val).lower().strip()

        # delimiter splitting of JSON-like text leaves keys formatted similar to:
        # '{""totallengthinmm""', or '{ "massingrams"'
        if key.startswith("{"):  # startswith() is safe on an empty key, key[0] is not
            key = key.lstrip('{"').rstrip('"')
        key = key.lstrip(' "')

        formatted_results[key] = val

    return formatted_results

# parse every dynamicProperties entry into a dict of cleaned keys and values
formatted_props = list(map(dynamicPropertiesFormatter, dynamicProps))

# one flat list holding every key of every parsed record
all_keys = [key for props in formatted_props for key in props]

# tally the keys (most frequent first) and turn the tally into a dataframe
key_counts = Counter(all_keys).most_common()
key_freq = pd.DataFrame.from_records(key_counts, columns=["property_name", "total_frequency"])

# share of the records carrying dynamicProperties in which each key appears
n_prop_records = len(dynamicProps)
pct_field_name = f'pct_of_records (n={n_prop_records})'
key_freq[pct_field_name] = key_freq['total_frequency'] / n_prop_records

# show only property names present in 1% or more of the records with dynamicProperties
is_common = key_freq[pct_field_name] >= 0.01
display(key_freq.loc[is_common])

# the full (unfiltered) table is written to disk
key_freq.to_csv('frequency_of_dynamicProperties.csv', index=False)

 87926 of 184306 records have something in the dynamicProperties field
 proportion of records with anything in dynamicProperties: 0.4771


Unnamed: 0,property_name,total_frequency,pct_of_records (n=87926)
0,sex,75321,0.856641
1,hind foot with claw,31149,0.354264
2,tail length,28738,0.326843
3,weight,26653,0.30313
4,ear from notch,26529,0.30172
5,total length,25297,0.287708
6,reproductive data,16127,0.183416
7,age class,12206,0.138821
8,verbatim collector,7825,0.088995
9,totallengthinmm,4228,0.048086


#### Assess the frequency of occurrences being within any Peromyscus ranges

In [10]:
# read the range-map geopackage into a GeoDataFrame
gdf = geopandas.read_file("MDD_Rodentia_NAm_393species.gpkg")
print(f"gdf shape started with: {len(gdf)} rows.")

# keep only rows whose sciname begins with the genus name Peromyscus
is_peromyscus = gdf['sciname'].str.startswith('Peromyscus')
gdf = gdf[is_peromyscus].copy()
print(f"Peromyscus restricted GDF has: {len(gdf)} rows.")

  for feature in features_lst:


gdf shape started with: 393 rows.
Peromyscus restricted GDF has: 67 rows.


In [11]:
# convert occurrences into a geodataframe of points built from the lon/lat columns
occ_gdf = geopandas.GeoDataFrame(occ, geometry=geopandas.points_from_xy(occ['decimalLongitude'], occ['decimalLatitude']))
occ_gdf = occ_gdf.set_crs("EPSG:4326")

# A single spatial join against all Peromyscus ranges marks the same points as joining
# one species at a time, without repeating the join once per sciname.
# The join result keeps occ_gdf's index labels; a point inside several ranges appears
# several times, hence unique().
# NOTE(review): geopandas >= 0.10 renames `op` to `predicate` — switch the keyword when
# the environment is upgraded.
within_points = geopandas.sjoin(occ_gdf, gdf, op='within').index.unique()

# Select by label with .loc: the previous occ_gdf.index[within_points] treated the labels
# as positions, which is only correct while the index is the default 0..n-1 range.
occ_gdf['within_genus'] = False
occ_gdf.loc[within_points, 'within_genus'] = True

ratio_within_genus = occ_gdf['within_genus'].astype(int).mean()
print(f"ratio of occurrences residing within one of the Peromyscus ranges: {round(ratio_within_genus, 4)}")

ratio of occurrences residing within one of the Peromyscus ranges: 0.9726
