### Dead Card Processing Notebook

In [1]:
import pandas as pd
import numpy as np
from BK_Tree import BKTree
import re
import string
from DeadProcessing import *

In [2]:
input_path = f"data/DeadRecords_PSE_results_final.csv"
input_df = pd.read_csv(input_path)

determination_key = pd.read_csv("data/NoAuthorTest.csv")
determination_key['no_author'] = determination_key['no_author'].apply(lambda x: x.translate(string.punctuation).strip())

### Determine which taxon records are already present in the DB

In [3]:
input_df = TaxonNoAuthor(input_df, 'taxon')

input_df['taxon_missing'] = input_df['***no_author'].apply(lambda x: True if x not in determination_key['no_author'].values else False)
taxon_to_add = input_df[input_df['taxon_missing']]['***no_author'].unique()
taxon_to_add = input_df[input_df['***no_author'].isin(taxon_to_add)]
already_in_db = input_df[~input_df['taxon_missing']]

print("Taxon not in database verbatim: {}".format(len(taxon_to_add))) 

Taxon not in database verbatim: 502


### Use the external BK_Tree rust stuff to find the closest string to a certain precision (account for minor spelling or formatting errors)

Load the tree class

In [4]:
tree = BKTree()
for name in determination_key['no_author']:
    tree.insert(name)

In [5]:
taxon_to_add['recommendation'] = taxon_to_add['***no_author'].apply(lambda x: sortOutput(tree.search(x, 2)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  taxon_to_add['recommendation'] = taxon_to_add['***no_author'].apply(lambda x: sortOutput(tree.search(x, 2)))


In [6]:
subtractions = taxon_to_add[taxon_to_add['recommendation'].apply(lambda x: len(x) > 0)]
taxon_to_add = taxon_to_add[taxon_to_add['recommendation'].apply(lambda x: len(x) == 0)]

In [7]:
subtractions

Unnamed: 0,media handling record: fileName,taxon,Taxon record: family,objectNumber,Taxon record: naturalHistoryCommonName,fieldLocCountry,briefDescription,fieldCollectionNumber,fieldCollector,fieldCollectionDateGroup,...,deadFlag,flowerColor,comment,Voucher record: Horticultural,Voucher record: Wild,binomial_match,additions_match,***no_author,taxon_missing,recommendation
53,DeadRecords_32202|DeadRecords_32203,Pteryxia terebintha Coult. Sc Rose californica...,UMBELLIFERAE,61.1334,,"U.S.A., Calif.",seed,,W. Roderick,,...,True,,"Added from dead card, 2025:\n Permanent: 12: A...",,,Pteryxia terebintha,[],Pteryxia terebintha,True,"[(Pteryxia terebinthina, 2)]"
63,DeadRecords_32220,Pulcaria odora (L.) Rchb.,COMPOSITAE,76.0588,,Morocco,sd,1107,"Humphries,Jury,Mullins,Richardson",,...,True,,"Added from dead card, 2025:\n Permanent: 705;\...",,,Pulcaria odora,[],Pulcaria odora,True,"[(Pulicaria odora, 1)]"
67,DeadRecords_32226,Pulsatilla alpine (L.) Schrank,RANUNCULACEAE,76.0146,,Switzerland,sd,,,,...,True,,"Added from dead card, 2025:\n Locality: Bern: ...",,,Pulsatilla alpine,[],Pulsatilla alpine,True,"[(Pulsatilla alpina, 1)]"
72,DeadRecords_32232|DeadRecords_32233,Punica protopuncia Balf. f.,PUNICACEAE,67.0826,,Socotra Isl,seed,,,,...,True,,"Added from dead card, 2025:\n Locality: Socotr...",,,Punica protopuncia,[],Punica protopuncia,True,"[(Punica protopunica, 2)]"
80,DeadRecords_32248|DeadRecords_32249,Putterlichia pyracantha (L.) Endl.,CELASTRACEAE,62.1018,,,seed,,,,...,True,,"Added from dead card, 2025:",,,Putterlichia pyracantha,[],Putterlichia pyracantha,True,"[(Putterlickia pyracantha, 1)]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
969,DeadRecords_33904|DeadRecords_33905,Rhododendron yelliotii Warb.,ERICACEAE,66.0195,,New Guinea,1 plt.,,Sleumer,,...,True,,"Added from dead card, 2025:\n Preserved: 10-VI...",,,Rhododendron yelliotii,[],Rhododendron yelliotii,True,"[(Rhododendron elliottii, 2)]"
970,DeadRecords_33906|DeadRecords_33907,Rhododendron yelliotii Warb.,ERICACEAE,67.0372,Vireya section Irroratum,New Guinea,1 plt.,,Dr. Sleumer,,...,True,,"Added from dead card, 2025:\n Preserved: VIII-...",,,Rhododendron yelliotii,[],Rhododendron yelliotii,True,"[(Rhododendron elliottii, 2)]"
1007,DeadRecords_33980|DeadRecords_33981,Rhododendron f ETHEL STOCKER,ERICACEAE,33.0721,,hybrid,X 1 scion,,,,...,True,,"Added from dead card, 2025:\n Preserved: M Np....",,,Rhododendron f,[],Rhododendron f,True,"[(Rhododendron, 2), (Rhododendron sp, 2)]"
1026,DeadRecords_34018|DeadRecords_34019,Rhododendron X loderi,ERICACEAE,32.1766,,hybrid,,,,,...,True,,"Added from dead card, 2025:\n Preserved: M 73;...",,,Rhododendron X loderi,[],Rhododendron X loderi,True,"[(Rhododendron loderi, 2)]"


In [None]:
subtractions['***no_author'] = subtractions['recommendation'].apply(lambda x: x[0][0])
subtractions = subtractions.drop('recommendation', axis=1)

already_in_db = pd.concat([subtractions, already_in_db], axis=0, ignore_index=True)
already_in_db.to_csv("output/already_in_db.csv", index=False)
taxon_to_add.to_csv("output/taxon_to_add.csv", index=False)

print("taxon already in database w/ spellcheck: {}".format(len(already_in_db)))
print("taxon to add: {}".format(len(taxon_to_add)))

          media handling record: fileName  \
53    DeadRecords_32202|DeadRecords_32203   
63                      DeadRecords_32220   
67                      DeadRecords_32226   
72    DeadRecords_32232|DeadRecords_32233   
80    DeadRecords_32248|DeadRecords_32249   
...                                   ...   
969   DeadRecords_33904|DeadRecords_33905   
970   DeadRecords_33906|DeadRecords_33907   
1007  DeadRecords_33980|DeadRecords_33981   
1026  DeadRecords_34018|DeadRecords_34019   
1075  DeadRecords_34116|DeadRecords_34117   

                                                  taxon Taxon record: family  \
53    Pteryxia terebintha Coult. Sc Rose californica...         UMBELLIFERAE   
63                            Pulcaria odora (L.) Rchb.           COMPOSITAE   
67                       Pulsatilla alpine (L.) Schrank        RANUNCULACEAE   
72                          Punica protopuncia Balf. f.           PUNICACEAE   
80                   Putterlichia pyracantha (L.) Endl.    