In [1]:
import pandas as pd
import numpy as np
from BK_Tree import BKTree
import re
import string
from dead_processing_utils import *
from file_processing_utils import *

### Load in all the dead records as well as the determination key from the db

In [2]:
folder_path = 'dead_records_data'
input_df = concat_df(folder_path)
print(input_df.columns)

determination_key = pd.read_csv("data/NoAuthorTest.csv")
determination_key['no_author'] = determination_key['no_author'].apply(lambda x: punctStrip(x))

Index(['media handling record: fileName', 'taxon', 'Taxon record: family',
       'objectNumber', 'Taxon record: naturalHistoryCommonName',
       'fieldLocCountry', 'briefDescription', 'fieldCollector',
       'fieldCollectionNumber', 'fieldCollectionDateGroup', 'source',
       'accessionDate', 'Voucher record: Wild',
       'Voucher record: Horticultural', 'reference',
       'referenceGroup_reference', 'Distribution record: depositor_Plant',
       'Distribution record: depositor_Seed', 'notes', 'deadFlag',
       'flowerColor', 'comment'],
      dtype='object')


### Determine the amount of dead card taxon already in the database verbatim 

In [3]:
input_df = TaxonNoAuthor(input_df, 'taxon')
input_df['***no_author'] = input_df['***no_author'].apply(lambda x: punctStrip(x))
input_df['is_cultivar'] = input_df['***no_author'].apply(lambda x: detectCultivar(x))
input_df['***no_author'] = input_df.apply(lambda x: x['***no_author'].split()[0] if x['is_cultivar'] else x['***no_author'], axis=1)

In [4]:
input_df['taxon_missing'] = input_df['***no_author'].apply(lambda x: True if x not in determination_key['no_author'].values else False)
taxon_to_add = input_df[input_df['taxon_missing']]['***no_author'].unique()
taxon_to_add = input_df[input_df['***no_author'].isin(taxon_to_add)]
already_in_db = input_df[~input_df['taxon_missing']]

print("Total taxon: {}".format(len(input_df['***no_author'].unique())))
print("Taxon not in database verbatim: {}".format(len(taxon_to_add)))

Total taxon: 13014
Taxon not in database verbatim: 9169


### Post-processing before spell-check step

In [5]:
taxon_to_add['***no_author_processed'] = taxon_to_add["***no_author"].apply(lambda x: stripSp(x)).apply(lambda x: fixSsp(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  taxon_to_add['***no_author_processed'] = taxon_to_add["***no_author"].apply(lambda x: stripSp(x)).apply(lambda x: fixSsp(x))


### Create a list of all unique taxon_to_add

In [6]:
taxon_to_add = taxon_to_add.drop_duplicates(['***no_author_processed']).reset_index(drop=True)

### Create a best guess taxon key using the Bk Tree

In [7]:
taxaTree = BKTree()
for name in determination_key['no_author']:
    taxaTree.insert(name)

In [8]:
taxon_to_add['recommendation'] = taxon_to_add['***no_author_processed'].apply(lambda x: sortOutput(taxaTree.search(x, 2)))

In [9]:
subtractions = taxon_to_add[taxon_to_add['recommendation'].apply(lambda x: len(x) > 0)]
taxon_to_add = taxon_to_add[taxon_to_add['recommendation'].apply(lambda x: len(x) == 0)]

In [10]:
subtractions['***no_author'] = subtractions['recommendation'].apply(lambda x: x[0][0])
subtractions = subtractions.drop('recommendation', axis=1)

already_in_db = pd.concat([subtractions, already_in_db], axis=0, ignore_index=True)

print("taxon already in database w/ spellcheck: {}".format(len(already_in_db)))
print("taxon to add: {}".format(len(taxon_to_add)))

taxon already in database w/ spellcheck: 13320
taxon to add: 6215


### Create a complete key with both taxon guesses and known taxon already in the db

In [11]:
final_taxon_key = pd.DataFrame()
final_taxon_key['deadcard_taxon'] = already_in_db['taxon']
final_taxon_key = final_taxon_key.drop_duplicates().reset_index(drop=True)
final_taxon_key['no_author_recommendation'] = already_in_db['***no_author']
final_taxon_key['db_taxon'] = final_taxon_key['no_author_recommendation'].apply(lambda x: determination_key[determination_key['no_author'] == x]['0'].values[0])

final_taxon_key.to_csv("output/final_taxon_key.csv", index=False)

### Create a list of all unique taxon that aren't in the database

In [14]:
taxon_to_add = taxon_to_add[['taxon', '***no_author']].drop_duplicates().reset_index(drop=True)
taxon_to_add["***no_author"] = taxon_to_add["***no_author"].apply(lambda x: x.replace(" var ", " var. ").replace(" ssp ", " subsp. ").replace(" cv ", " cv. "))
taxon_to_add.to_csv("output/taxon_to_add.csv", index=False)

print(len(taxon_to_add))

6215
