### Combine and clean Trillium pusillum complex occurrence records

In [10]:
import numpy as np
import pandas as pd
import csv

### Load in and Organize the DWC-A formatted dataset

In [11]:
# the initial symbiota dataset from SERNEC
df = pd.read_csv('occurrences/SymbOutput_2020-10-13_101423_DwC-A/occurrences.csv',
                  encoding="iso-8859-1",low_memory=False, quoting=csv.QUOTE_ALL)

# an addendum including BRIT's texanum records
df2 = pd.read_csv('occurrences/BRIT_texanum_SymbOutput_2020-11-02_152303_DwC-A/occurrences.csv',
                  encoding="iso-8859-1",low_memory=False, quoting=csv.QUOTE_ALL)
# combine both
df = df.append(df2,ignore_index=True)
# drop any duplicates (based on the 'id' col)
df.drop_duplicates(subset='id', keep='last',inplace=True)

print(f"dataframe shaped as: {df.shape}")

dataframe shaped as: (425, 88)


#### Clean out cultivated occurrences
Very few records are caught up in this cleaning step

In [12]:
# align "0" to null (i.e., not cultivated)
df.loc[df['cultivationStatus']==0, 'cultivationStatus'] = np.nan
# remove anything with data in cultivated
df = df[df['cultivationStatus'].isna()]

# remove botanical garden occurrences (based on locality)
df = df[~df["locality"].str.lower().str.contains("botanical|garden|cultivate", na=False)]
# remove Pennsylvania occurrences
df = df[~df['stateProvince'].str.lower().str.contains('pennsylvania', na=False)]

print(f"dataframe shaped as: {df.shape}")

dataframe shaped as: (422, 88)


#### Include iNat observations
Combine SERNEC occurences with iNat observations

In [13]:
inat = pd.read_csv("occurrences/iNat_pusillum_observations_109874.csv",encoding='utf-8',
                   low_memory=False, quoting=csv.QUOTE_ALL)

# rename a few iNat columns
# lat / long are being renamed so as NOT to conflict (and therefore overwrite)
# existing geospatial data
iNat_alignment = {"latitude":"LATITUDE",
                 "longitude":"LONGITUDE",
                 "positional_accuracy":"coordinateUncertaintyInMeters",
                 "scientific_name":"scientificName"}
inat = inat.rename(iNat_alignment, axis=1)

# add and populate a new column named "source" to both the SERNEC & iNat dataframes
inat['source'] = "iNat"
df['source'] = 'SERNEC'
# merge the two dataframes
df = pd.concat([df, inat], axis=0, ignore_index=True)

print(f"dataframe shaped as: {df.shape}")

dataframe shaped as: (545, 122)


#### Clean up sci name strings

In [14]:
# first examine the current set of sciNames
print("before sci_name cleaning: ")
display(df['scientificName'].unique().tolist())
print()
def sci_name_cleaner(rowdata):
    sci_name = rowdata['scientificName'].lower()
    # establish a list of substrings to remove
    # this list was derived after examining the results of previous iterations
    filter_words = ['var.', 'michx.', 'buckl.', 'fern.', '(palmer & steyermark) steyermark']
    # replace each substring with empty string (i.e., "")
    for word in filter_words:
        if word in sci_name:
            sci_name = sci_name.replace(word, '')
    
    # basic string cleaning
    sci_name = sci_name.replace('   ', ' ')
    sci_name = sci_name.replace('  ', ' ')
    sci_name = sci_name.strip()
    sci_name = sci_name.capitalize()
    rowdata['scientificName'] = sci_name
    return rowdata

df = df.apply(sci_name_cleaner, axis=1)            

# examine set of sciNames after cleaning
print("after sci_name cleaning: ")
display(df['scientificName'].unique().tolist())

before sci_name cleaning: 


['Trillium pusillum var. virginianum',
 'Trillium pusillum',
 'Trillium pusillum var. ozarkanum',
 'Trillium pusillum var. monticulum',
 'Trillium pusillum var. pusillum',
 'Trillium pusillum var. alabamicum',
 'Trillium pusillum var. texanum',
 'Trillium ozarkanum',
 'TRILLIUM PUSILLUM MICHX.',
 'TRILLIUM PUSILLUM MICHX. var. OZARKANUM (PALMER & STEYERMARK) STEYERMARK',
 'TRILLIUM PUSILLUM MICHX. var. VIRGINIANUM FERN.',
 'TRILLIUM PUSILLUM MICHX. var. PUSILLUM',
 'Trillium pusillum var. monticolum',
 'Trillium texanum',
 'TRILLIUM TEXANUM BUCKL.',
 'Trillium pusillum ozarkanum',
 'Trillium pusillum pusillum',
 'Trillium pusillum texanum',
 'Trillium pusillum virginianum']


after sci_name cleaning: 


['Trillium pusillum virginianum',
 'Trillium pusillum',
 'Trillium pusillum ozarkanum',
 'Trillium pusillum monticulum',
 'Trillium pusillum pusillum',
 'Trillium pusillum alabamicum',
 'Trillium pusillum texanum',
 'Trillium ozarkanum',
 'Trillium pusillum monticolum',
 'Trillium texanum']

#### Clean county names & align coords to administrative centroids

In [15]:
# identify how many occurrences are missing lat/lon data
missing_qty = df.shape[0] - df[~df['decimalLatitude'].isna()].shape[0]
print(f"{missing_qty} of {df.shape[0]} rows are missing geopoints")

472 of 545 rows are missing geopoints


In [16]:
# load in the administrative centroid data

# centroid coordinates reference file
# source: https://data.healthcare.gov/dataset/Geocodes-USA-with-Counties/52wv-g36k/data
counties_ref = pd.read_csv("ref/Geocodes_USA_with_Counties.csv") #centroid coordinates
counties_ref['county'] = counties_ref['county'].str.lower() # convert to lower case for ease of alignment

# simple state name to abbreviation reference csv.
state_codes = pd.read_csv("ref/state_codes.csv") #state codes used for looking up centroids
state_codes['State'] = state_codes['State'].str.lower() # convert to lower case for ease of alignment


def clean_county_str(rowdata):
    # a function used to clean county/state names and attempt to look up a centroid based on them
    try:
        county_str = rowdata['county'].lower().strip()
        cutlist = ['co.', 'county']
        for x in cutlist:
            county_str.replace(x, '')
        county_str = county_str.rstrip()

        state_name = rowdata['stateProvince'].lower().strip()
        st_code = state_codes.loc[state_codes['State'] == state_name, 'Code'].tolist()[0]
        
        # limit the reference dataframe to only administrative entities within the
        # state/county combination.
        coords = counties_ref.loc[(counties_ref['county'] == county_str)
                                  & (counties_ref['state'] == st_code)]

        # since multiple administration levels may exist, accept the centroid as the
        # median lat/lon of all results
        lat = coords['latitude'].median()
        lon = coords['longitude'].median()

        # store the centroid coordinates to the row being processed
        rowdata['decimalLatitude'] = lat
        rowdata['decimalLongitude'] = lon
    except AttributeError as e:
        # AttributeErrors likely due to a NaN value where a string was expected
        # in this case, no changes are made to the decimalLatitude, decimalLongitude fields.
        pass
    return rowdata

# apply the clean_county_str function to the entire df row-by-row
df = df.apply(clean_county_str, axis=1)

#### Simplify & save final output

'LATITUDE' and 'LONGITUDE' were imported from the iNat records. Fill any empty lat/long data with the decimalLatitude, decimalLongitude cols. Then, simplify the output for delivery.

In [17]:
df['LATITUDE'].fillna(df['decimalLatitude'], inplace=True)
df['LONGITUDE'].fillna(df['decimalLongitude'], inplace=True)

# remove any without coords
df = df[df['LATITUDE'].notna()]

# limit the output to columns most likely to be useful downstream
keep_cols = ['LATITUDE',
            'LONGITUDE',
            'occurrenceID',
            'recordId',
            'collectionCode',
            'catalogNumber',
            'source',
            'family',
            'genus',
            'taxonID',
            'taxonRank',
            'taxonRemarks',
            'scientificName',
            'scientificNameAuthorship',
            'identificationQualifier',
            'identificationReferences',
            'identificationRemarks',
            'identifiedBy',
            'dateIdentified',
            'coordinateUncertaintyInMeters',
            'locality',
            'county']

# simplify cols
df = df[keep_cols].copy()
# save output
df.to_csv("pusillum_points.csv")
# check shape
df.shape

(471, 22)