### Dead Card Processing Notebook

In [1]:
import pandas as pd
import numpy as np
from BK_Tree import BKTree
import re
import string
from DeadProcessing import *

In [2]:
input_path = f"data/DeadRecords_PSE_results_final.csv"
input_df = pd.read_csv(input_path)

determination_key = pd.read_csv("data/NoAuthorTest.csv")
determination_key['no_author'] = determination_key['no_author'].apply(lambda x: punctStrip(x))

In [3]:
input_df = TaxonNoAuthor(input_df, 'taxon')
input_df['***no_author'] = input_df['***no_author'].apply(lambda x: punctStrip(x))
input_df['is_cultivar'] = input_df['***no_author'].apply(lambda x: detectCultivar(x))
input_df['***no_author'] = input_df.apply(lambda x: x['***no_author'].split()[0] if x['is_cultivar'] else x['***no_author'], axis=1)

### Detect cultivars

In [4]:
input_df['cultivar'] = input_df['***no_author'].apply(lambda x: detectCultivar(x))

### Determine which taxon records are already present in the DB

In [5]:
input_df['taxon_missing'] = input_df['***no_author'].apply(lambda x: True if x not in determination_key['no_author'].values else False)
taxon_to_add = input_df[input_df['taxon_missing']]['***no_author'].unique()
taxon_to_add = input_df[input_df['***no_author'].isin(taxon_to_add)]
already_in_db = input_df[~input_df['taxon_missing']]

print("Taxon not in database verbatim: {}".format(len(taxon_to_add))) 

Taxon not in database verbatim: 412


### Use the external BK_Tree rust stuff to find the closest string to a certain precision (account for minor spelling or formatting errors)

Load the tree class

In [6]:
tree = BKTree()
for name in determination_key['no_author']:
    tree.insert(name)

In [7]:
taxon_to_add['recommendation'] = taxon_to_add['***no_author'].apply(lambda x: sortOutput(tree.search(x, 2)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  taxon_to_add['recommendation'] = taxon_to_add['***no_author'].apply(lambda x: sortOutput(tree.search(x, 2)))


In [8]:
subtractions = taxon_to_add[taxon_to_add['recommendation'].apply(lambda x: len(x) > 0)]
taxon_to_add = taxon_to_add[taxon_to_add['recommendation'].apply(lambda x: len(x) == 0)]

In [9]:
subtractions

Unnamed: 0,media handling record: fileName,taxon,Taxon record: family,objectNumber,Taxon record: naturalHistoryCommonName,fieldLocCountry,briefDescription,fieldCollectionNumber,fieldCollector,fieldCollectionDateGroup,...,comment,Voucher record: Horticultural,Voucher record: Wild,binomial_match,additions_match,***no_author,is_cultivar,cultivar,taxon_missing,recommendation
53,DeadRecords_32202|DeadRecords_32203,Pteryxia terebintha Coult. Sc Rose californica...,UMBELLIFERAE,61.1334,,"U.S.A., Calif.",seed,,W. Roderick,,...,"Added from dead card, 2025:\n Permanent: 12: A...",,,Pteryxia terebintha,[],Pteryxia terebintha,False,False,True,"[(Pteryxia terebinthina, 2)]"
63,DeadRecords_32220,Pulcaria odora (L.) Rchb.,COMPOSITAE,76.0588,,Morocco,sd,1107,"Humphries,Jury,Mullins,Richardson",,...,"Added from dead card, 2025:\n Permanent: 705;\...",,,Pulcaria odora,[],Pulcaria odora,False,False,True,"[(Pulicaria odora, 1)]"
67,DeadRecords_32226,Pulsatilla alpine (L.) Schrank,RANUNCULACEAE,76.0146,,Switzerland,sd,,,,...,"Added from dead card, 2025:\n Locality: Bern: ...",,,Pulsatilla alpine,[],Pulsatilla alpine,False,False,True,"[(Pulsatilla alpina, 1)]"
72,DeadRecords_32232|DeadRecords_32233,Punica protopuncia Balf. f.,PUNICACEAE,67.0826,,Socotra Isl,seed,,,,...,"Added from dead card, 2025:\n Locality: Socotr...",,,Punica protopuncia,[],Punica protopuncia,False,False,True,"[(Punica protopunica, 2)]"
80,DeadRecords_32248|DeadRecords_32249,Putterlichia pyracantha (L.) Endl.,CELASTRACEAE,62.1018,,,seed,,,,...,"Added from dead card, 2025:",,,Putterlichia pyracantha,[],Putterlichia pyracantha,False,False,True,"[(Putterlickia pyracantha, 1)]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970,DeadRecords_33906|DeadRecords_33907,Rhododendron yelliotii Warb.,ERICACEAE,67.0372,Vireya section Irroratum,New Guinea,1 plt.,,Dr. Sleumer,,...,"Added from dead card, 2025:\n Preserved: VIII-...",,,Rhododendron yelliotii,[],Rhododendron yelliotii,False,False,True,"[(Rhododendron elliottii, 2)]"
979,DeadRecords_33924|DeadRecords_33925,RhododendroN BARCLAY HELEN FOX,ERICACEAE,33.0707,,hybrid,1 scion,,,,...,"Added from dead card, 2025:\n Permanent: 248;\...",,,RhododendroN BARCLAY HELEN FOX,[],RhododendroN,True,False,True,"[(Rhododendron, 1)]"
1007,DeadRecords_33980|DeadRecords_33981,Rhododendron f ETHEL STOCKER,ERICACEAE,33.0721,,hybrid,X 1 scion,,,,...,"Added from dead card, 2025:\n Preserved: M Np....",,,Rhododendron f,[],Rhododendron f,False,False,True,"[(Rhododendron, 2), (Rhododendron sp, 2)]"
1045,DeadRecords_34056|DeadRecords_34057,Rhododendronxloderi 'PINK DIAMOND,ERICACEAE,68.0152,,Hort.,,,,,...,"Added from dead card, 2025:\n Permanent: 450",,,Rhododendronxloderi 'PINK DIAMOND,[],Rhododendronxloderi,True,False,True,"[(Rhododendron loderi, 1), (Rhododendron Loder..."


In [10]:
subtractions['***no_author'] = subtractions['recommendation'].apply(lambda x: x[0][0])
subtractions = subtractions.drop('recommendation', axis=1)

already_in_db = pd.concat([subtractions, already_in_db], axis=0, ignore_index=True)
already_in_db.to_csv("output/already_in_db.csv", index=False)
taxon_to_add.to_csv("output/taxon_to_add.csv", index=False)

print("taxon already in database w/ spellcheck: {}".format(len(already_in_db)))
print("taxon to add: {}".format(len(taxon_to_add)))

taxon already in database w/ spellcheck: 740
taxon to add: 340
