# Imports

In [85]:
# Standard library imports
import os
import json
import warnings
import pickle
import ast

# Third-party library imports
import transformers
import torch
import torch.nn as nn
import torchvision
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from craft_text_detector import Craft # Need to edit the saving function to prepend 0's
from torchvision import datasets

import taxonerd
from taxonerd import TaxoNERD
import spacy

# Local application/library specific imports
import trocr
import detr

# from importlib import reload
# reload(detr)

# Directories

In [2]:
# Suppress Hugging Face log messages below ERROR level (set SUPPRESS = False to see them).
SUPPRESS = True
if SUPPRESS:
    # `transformers` is imported in the imports cell; the named helper states the
    # intended level explicitly instead of passing a bare numeric level.
    transformers.logging.set_verbosity_error()
# Corrupt-EXIF UserWarnings (matched by the regex below) are not relevant for this
# application, so they are filtered out.
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)

# Location of the input images (passed to detr.run below)
inputdir = '/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/TROCR_Training/goodfiles/'
# Location of the images after label extraction (written by detr.run, read by CRAFT)
# workdir = inputdir
workdir = '/projectnb/sparkgrp/ml-herbarium-grp/summer2023/kabilanm/ml-herbarium/trocr/label-extraction/data/label-extraction-intermediate-files/' # update this to the desired directory on SCC

# Location of the CRAFT segmentations (also the ImageFolder root for TrOCR)
output_dir_craft = '/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/craft_output_files/'
# Location to save all output files (pickles, JSON, text)
save_dir = '/usr4/ds549/kabilanm/saved_results/'
# Directory holding the ground-truth label files (taxon_gt.txt, geography_gt.txt, collector_gt.txt)
workdir2 = '/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/scraped-data/drago_testdata/gt_labels' # update this to the desired directory on SCC

# Corpus files
# NOTE(review): none of the three corpus constants is referenced by the cells visible in
# this notebook -- confirm whether they are still needed.
ALL_SPECIES_FILE = '/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/corpus_taxon/output/possible_species.pkl'
ALL_GENUS_FILE = '/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/corpus_taxon/output/possible_genus.pkl'
# ALL_TAXON_FILE = '/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-new/ml-herbarium/corpus/corpus_taxon/corpus_taxon.txt'
ALL_TAXON_FILE = '/usr4/ds549/kabilanm/ml-herbarium/corpus/corpus_taxon/corpus_taxon.txt'

# Running DETR to extract labels from images

In [6]:
# Use DETR for inference (adapted from Freddie: https://github.com/freddiev4/comp-vision-scripts/blob/main/object-detection/detr.py)
detr_model = 'KabilanM/detr-label-extraction'
# detr.run returns the bounding boxes of the labels identified in the images
# (a dict mapping image file name -> list of 4-tuples, as shown in the output below).
# The bounding boxes are used to rank labels in the downstream task.
label_bboxes = detr.run(inputdir, detr_model, workdir)

# Save the label bounding boxes into a pickle file; the context manager makes sure
# the file is flushed and closed even if pickling fails.
with open(save_dir+"label_boxes.pkl", "wb") as fp:
    pickle.dump(label_bboxes, fp)

INFO:detr-objection-detection:Getting KabilanM/detr-label-extraction pretrained model...
INFO:detr-objection-detection:Setting up object detection pipeline...
INFO:detr-objection-detection:Running object detection pipeline...
INFO:detr-objection-detection:Getting image at path /projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/TROCR_Training/goodfiles/1320398138.jpg...
INFO:detr-objection-detection:Now have image object <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=7319x10319 at 0x14FDC881C5E0>
INFO:root:Masking image...
INFO:root:Saved image to location: /projectnb/sparkgrp/ml-herbarium-grp/summer2023/kabilanm/ml-herbarium/trocr/label-extraction/data/label-extraction-intermediate-files/1320398138.jpg
INFO:detr-objection-detection:Getting image at path /projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/TROCR_Training/goodfiles/1802552799.jpg...
INFO:detr-objection-detection:Now have image object <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=6789x8982 at 0x14FD

{'1320398138.jpg': [(298, 8051, 2377, 10203),
  (2450, 8902, 4535, 9295),
  (2472, 9105, 4481, 10146),
  (4576, 8911, 6979, 10085)],
 '1802552799.jpg': [(3431, 7242, 6168, 8715)],
 '1998322454.jpg': [(2353, 4231, 3603, 4649), (2444, 5128, 3424, 5498)],
 '2236142683.jpg': [(3639, 7069, 6184, 8757)],
 '2848499425.jpg': [(131, 8519, 2441, 9027), (4191, 8007, 6291, 9002)],
 '2446828826.jpg': [(3766, 7080, 5559, 8157)],
 '2284257102.jpg': [(3978, 6097, 6134, 6548), (3531, 7575, 6214, 8871)],
 '2608680770.jpg': [(3664, 7125, 5456, 8150)],
 '2595747531.jpg': [(3238, 7707, 5463, 8506)],
 '3356834058.jpg': [],
 '2859042459.jpg': [(1534, 7410, 3318, 7919), (3359, 7124, 5165, 7978)],
 '3005750161.jpg': [(3971, 5920, 6148, 6535), (3596, 7777, 6204, 8993)],
 '1320488541.jpg': [(172, 4428, 1195, 4824), (2278, 4637, 3528, 5344)],
 '1998358368.jpg': [(111, 8595, 2475, 9680),
  (4530, 8548, 6602, 9255),
  (2565, 8926, 3966, 9680),
  (4396, 8726, 6756, 9649)],
 '1998413329.jpg': [(2495, 4703, 3731, 5276

In [7]:
# Remove the images for which no bounding boxes were found.
# NOTE: pickle.load must only be used on trusted files; this one is written by this notebook.
with open(save_dir+"label_boxes.pkl", "rb") as fp:
    label_bboxes = pickle.load(fp)

print(f"Total number of images: {len(label_bboxes)}")

# Collect the keys first so the dict is not mutated while it is being iterated.
keys_to_remove = [key for key, value in label_bboxes.items() if len(value) == 0]

for key in keys_to_remove:
    label_bboxes.pop(key)

print(f"Number of images with bounding boxes: {len(label_bboxes)}")

Total number of images: 251
Number of images with bounding boxes: 242


In [8]:
# Show the file names of the images dropped above because no bounding boxes were found
keys_to_remove

['3356834058.jpg',
 '1998550976.jpg',
 '3467354375.jpg',
 '3111515383.jpg',
 '2446819762.jpg',
 '3341239321.jpg',
 '1146138679.jpg',
 '1212567865.jpg',
 '1146376618.jpg']

# Running craft and saving the segmented images

In [None]:
# Initialize the CRAFT text detector; crops are written under output_dir_craft.
# The GPU is used when one is available instead of being required unconditionally.
craft = Craft(output_dir = output_dir_craft, 
              export_extra = False, 
              text_threshold = .7, 
              link_threshold = .4, 
              crop_type="poly", 
              low_text = .3, 
              cuda = torch.cuda.is_available())

# Run CRAFT on the label-extracted images to get text bounding boxes
images = []            # image paths with at least one detected text region
corrupted_images = []  # image paths that raised IOError/SyntaxError below
no_segmentations = []  # valid images in which CRAFT found no text
boxes = {}             # image path -> bounding_areas['boxes'] returned by CRAFT

for filename in tqdm(sorted(label_bboxes.keys())):
    image = workdir+filename
    try:
        # verify() only checks file integrity; the context manager closes the file
        # handle afterwards (CRAFT re-opens the image from its path).
        with Image.open(image) as img:
            img.verify() # Check that the image is valid
        bounding_areas = craft.detect_text(image)
        if len(bounding_areas['boxes']): # check that a segmentation was found
            images.append(image)
            boxes[image] = bounding_areas['boxes']
        else:
            no_segmentations.append(image)
    except (IOError, SyntaxError):
        corrupted_images.append(image)

# Save the bounding boxes into a pickle file (closed deterministically)
with open(save_dir+"boxes.pkl", "wb") as fp:
    pickle.dump(boxes, fp)

  polys = np.array(polys)
  polys_as_ratio = np.array(polys_as_ratio)
 75%|███████▍  | 181/242 [02:49<01:10,  1.15s/it]

# Getting all the segmented images into a dataloader, and loading model and processor for trocr

In [32]:
# Delete empty folders, which occur when an image gets no segmentation from CRAFT
root = output_dir_craft
# Skip the first os.walk entry, which is the root directory itself
folders = list(os.walk(root))[1:]
deleted = []
for folder in folders:
    dirpath, dirnames, files = folder
    # os.rmdir only succeeds on a truly empty directory, so require that there are
    # no sub-directories either (otherwise it raises OSError).
    if not dirnames and not files:
        deleted.append(folder)
        os.rmdir(dirpath)

# Set up the TrOCR model and processor
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") # cache_dir
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(device)

# Wrap the model so that all available GPUs can be used.
# NOTE(review): the transcription cell below passes `model`, not `model_gpu` -- confirm
# whether the DataParallel wrapper is actually meant to be used.
model_gpu= nn.DataParallel(model,list(range(torch.cuda.device_count()))).to(device)

# Dataloader over the CRAFT crops (one class folder per source image)
trainset = datasets.ImageFolder(output_dir_craft, transform = processor)
testloader = torch.utils.data.DataLoader(trainset, batch_size=16, shuffle=False)

# For matching words to image: class folder names with the '_crops' marker removed
filenames = [s.replace('_crops', '') for s in trainset.class_to_idx]

# For matching the label index with the image name
word_log_dic = dict(enumerate(filenames))
# For matching the image name with its transcriptions (filled in later)
words_identified = {name: [] for name in filenames}

# Saving the filenames, word_log_dic and words_identified

In [33]:
# Persist the image names as plain text, one name per line
with open(save_dir+'filenames.txt', 'w') as fp:
    fp.writelines("%s\n" % item for item in filenames)
# Persist the label-index -> image-name lookup as JSON
with open(save_dir+'word_log_dic.json', 'w') as fp:
    json.dump(word_log_dic, fp)
# Persist the image-name -> transcriptions container as JSON
with open(save_dir+'words_identified.json', 'w') as fp:
    json.dump(words_identified, fp)

# Running Tr-OCR on the Segmented Images from Craft

In [34]:
# Transcribe every CRAFT crop with TrOCR; three parallel sequences come back
results, confidence, labels = trocr.evaluate_craft_seg(
    model, processor, words_identified, word_log_dic, testloader, device
)
# Collect the outputs in a dataframe (one row per zipped triple) and persist it
df = pd.DataFrame(
    list(zip(results, confidence, labels)),
    columns=['Results', 'Confidence', 'Labels'],
)
df.to_pickle(save_dir+'full_results.pkl')
display(df.head())

Transcribing Image Segments: 100%|██████████| 158/158 [02:34<00:00,  1.02it/s]


Unnamed: 0,Results,Confidence,Labels
0,plants of California,0.000466,0
1,"Calaveras. Ranger Station, Stanislaus Forest,",0.483036,0
2,"Avery, Calaveras County.",0.299946,0
3,01237442,0.120719,0
4,no. 9132,0.391138,0


In [35]:
# First part of the final table: results, TrOCR confidence level, and label
df = pd.read_pickle(save_dir+'full_results.pkl')
# NOTE: pickle.load must only be used on trusted files; this one is written by this notebook.
with open(save_dir+"boxes.pkl", "rb") as fp:
    boxes = pickle.load(fp)
combined_df = trocr.combine_by_label(df)

# Add the image path and all of its bounding boxes.
# NOTE(review): the concat aligns the two frames on their row index, so this presumably
# relies on the label order matching the insertion order of `boxes` -- confirm.
df_dictionary = pd.DataFrame(list(boxes.items()), columns=['Image_Path', 'Bounding_Boxes'])
combined_df = pd.concat([combined_df, df_dictionary], axis=1, join='inner')
display(combined_df.head())

Unnamed: 0,Labels,Transcription,Transcription_Confidence,Image_Path,Bounding_Boxes
0,0,"[plants of California, Calaveras. Ranger Stati...","[0.0004661529092118144, 0.4830363392829895, 0....",/projectnb/sparkgrp/ml-herbarium-grp/summer202...,"[[[4427.475, 7904.178], [5505.9624, 7904.178],..."
1,1,"[Flora Hawaiiansis, Collected by C. N. Forbes ...","[0.040508534759283066, 0.0575096495449543, 9.6...",/projectnb/sparkgrp/ml-herbarium-grp/summer202...,"[[[4610.7188, 7824.25], [5379.172, 7824.25], [..."
2,2,"[Dudley Herbarium of Stanford University, Plan...","[0.31743180751800537, 0.5072717070579529, 0.56...",/projectnb/sparkgrp/ml-herbarium-grp/summer202...,"[[[2165.2695, 4553.7295], [3353.666, 4544.727]..."
3,3,"[Elymius hystrix L., det. J.J.N. Campbell - 20...","[0.009591127745807171, 0.9962418079376221]",/projectnb/sparkgrp/ml-herbarium-grp/summer202...,"[[[4248.603, 6308.9624], [4944.8623, 6308.9624..."
4,4,"[Flora of Washington, D. c., and vicinity, Buc...","[0.8306760191917419, 0.045502372086048126, 0.0...",/projectnb/sparkgrp/ml-herbarium-grp/summer202...,"[[[4121.703, 7768.3623], [5421.0874, 7768.3623..."


In [36]:
# Save the combined dataframe as an intermediate pickle file in save_dir
combined_df.to_pickle(save_dir+'test.pkl')

In [37]:
# Also export the combined dataframe as CSV in the current working directory.
# List/array cells are written as their string form, so readers of this file must
# parse them back (the Transcription column is parsed with ast.literal_eval later on).
combined_df.to_csv("./combined_df.csv")

# Reading in the ground truth files for tested images

In [3]:
# Reading in the ground truth values

def read_ground_truth(path):
    """Parse a ground-truth file into a dict.

    Each line is expected to look like ``<key>: <value>``. The key is the text
    before the first ":" and the value is the text between the first and second
    ": " separators, stripped of surrounding whitespace.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded ground-truth text file.

    Returns
    -------
    dict
        Mapping of key -> value; later duplicates of a key overwrite earlier ones.
        Blank lines are skipped silently; other lines without a ": " separator are
        skipped with a warning.
    """
    truth = {}
    # The context manager closes the file once it has been read
    with open(path, "r", encoding="utf-8") as fp:
        for line in fp:
            if ": " not in line:
                if line.strip():
                    warnings.warn(f"Skipping malformed ground-truth line in {path}: {line!r}")
                continue
            truth[line.split(":")[0]] = line.split(": ")[1].strip()
    return truth

gt_t = workdir2+'/taxon_gt.txt'
Taxon_truth = read_ground_truth(gt_t)

gt_g = workdir2+'/geography_gt.txt'
Geography_truth = read_ground_truth(gt_g)

gt_c = workdir2+'/collector_gt.txt'
Collector_truth = read_ground_truth(gt_c)

# All ground-truth lookups keyed by category
comparison_file = {"Taxon":Taxon_truth,"Countries":Geography_truth,"Collector":Collector_truth}

In [4]:
# Take the first ten taxon ground-truth entries (in insertion order)
Taxon_truth_sample = dict(list(Taxon_truth.items())[:10])

# Show this subset of the taxon ground truth
print(Taxon_truth_sample)

{'1697659851': 'Euphrasia officinalis', '2573258025': 'Bryoerythrophyllum recurvirostrum', '2597666444': 'Carduus tenuiflorus', '1931288980': 'Agoseris parviflora', '1930241969': 'Spiraea canescens', '1929944910': 'Chylismia scapoidea', '1931007576': 'Carex typhina', '1928514234': 'Stachys hispida', '1928658806': 'Solanum donianum', '1931124118': 'Suaeda nigra'}


# Use TaxoNERD to recognize taxons from detected text

In [97]:
# Create the TaxoNERD instance used below for taxon entity recognition (GPU not preferred)
ner = TaxoNERD(prefer_gpu=False) # set prefer_gpu=True if a GPU is accessible

In [147]:
# ! python -m spacy download en_core_web_md
# ! python -m spacy download en_core_web_sm
# ! python -m spacy download en_core_web_trf

# from numpy.linalg import norm

# # Load the spaCy model for date and location recognition
# w2v = spacy.load("en_core_web_md")

# # Convert word to its vector representation
# def word2vec(word):
#     return w2v(word).vector

# # Compute cosine similarity between two word vectors
# def cosdis(v1, v2):
#     v1_norm = norm(v1)
#     v2_norm = norm(v2)
#     if v1_norm > 0 and v2_norm > 0:
#         return np.dot(v1, v2) / (v1_norm * v2_norm)
#     else:
#         return 0.0

In [81]:
# utility functions for finding cosine similarity

def word2vec(word):
    """Build a character-count representation of ``word``.

    Returns a 3-tuple ``(counts, chars, norm)``: a Counter of the characters in
    ``word``, the set of distinct characters, and the Euclidean length of the
    count vector (0.0 for an empty word).
    """
    from collections import Counter
    from math import sqrt

    char_counts = Counter(word)
    # Euclidean norm of the character-count vector
    norm = sqrt(sum(n * n for n in char_counts.values()))
    return char_counts, set(char_counts), norm

def cosdis(v1, v2):
    """Cosine similarity of two ``word2vec`` tuples ``(counts, chars, norm)``.

    Raises ZeroDivisionError when either vector has zero length (empty word).
    """
    # only characters present in both words contribute to the dot product
    shared_chars = v1[1] & v2[1]
    dot_product = sum(v1[0][ch] * v2[0][ch] for ch in shared_chars)
    # normalise by both vector lengths
    return dot_product / v1[2] / v2[2]

In [42]:
# Point the TaxoNERD cache folder at the project directory (per the author's note,
# the taxonerd module code was changed for this), then echo the value to confirm it
os.environ["TAXONERD_CACHE"] = '/projectnb/sparkgrp/ml-herbarium-grp/summer2023/kabilanm/ml-herbarium/trocr'
os.environ.get("TAXONERD_CACHE")

'/projectnb/sparkgrp/ml-herbarium-grp/summer2023/kabilanm/ml-herbarium/trocr'

In [115]:
# ! pip install https://github.com/nleguillarme/taxonerd/releases/download/v1.5.0/en_core_eco_md-1.0.2.tar.gz

# Load the TaxoNERD model together with the entity linker named below
nlp = ner.load(
    model="en_core_eco_md", # en_core_eco_md
    linker="gbif_backbone",
    threshold=0 # the threshold is set to 0 so that results can be collated at various thresholds later
)

INFO:nmslib:Loading index from /projectnb/sparkgrp/ml-herbarium-grp/summer2023/kabilanm/ml-herbarium/trocr/datasets/bdb932a2c23507c7fb54dd2eff1ca3ac71005d9913e22542bb87ff92405076e5.nmslib_index.bin
INFO:nmslib:Loading regular index.
INFO:nmslib:Finished loading index
INFO:nmslib:Set HNSW query-time parameters:
INFO:nmslib:ef(Search)         =20
INFO:nmslib:algoType           =2
INFO:nmslib:Set HNSW query-time parameters:
INFO:nmslib:ef(Search)         =200
INFO:nmslib:algoType           =2


In [123]:
# Load a transformer-based spaCy pipeline; it is used below to extract location (LOC)
# and date (DATE) entities from the transcribed text
nlp_loc = spacy.load("en_core_web_trf")

In [101]:
# Reload the combined results from the CSV written earlier (first CSV column is the index)
combined_df = pd.read_csv("./combined_df.csv", index_col=0)
# number of rows, i.e. images
len(combined_df)

242

In [102]:
# Verify that the dataframe has been read correctly: list-valued columns come back as
# strings (see the quoted Transcription values in the output below)
combined_df.head()

Unnamed: 0,Labels,Transcription,Transcription_Confidence,Image_Path,Bounding_Boxes
0,0,"['plants of California', 'Calaveras. Ranger St...","[0.0004661529092118144, 0.4830363392829895, 0....",/projectnb/sparkgrp/ml-herbarium-grp/summer202...,[[[4427.475 7904.178 ]\n [5505.9624 7904.178...
1,1,"['Flora Hawaiiansis', 'Collected by C. N. Forb...","[0.040508534759283066, 0.0575096495449543, 9.6...",/projectnb/sparkgrp/ml-herbarium-grp/summer202...,[[[4610.7188 7824.25 ]\n [5379.172 7824.25 ...
2,2,"['Dudley Herbarium of Stanford University', 'P...","[0.31743180751800537, 0.5072717070579529, 0.56...",/projectnb/sparkgrp/ml-herbarium-grp/summer202...,[[[2165.2695 4553.7295]\n [3353.666 4544.727...
3,3,"['Elymius hystrix L.', 'det. J.J.N. Campbell -...","[0.009591127745807171, 0.9962418079376221]",/projectnb/sparkgrp/ml-herbarium-grp/summer202...,[[[4248.603 6308.9624]\n [4944.8623 6308.962...
4,4,"['Flora of Washington, D. c., and vicinity', '...","[0.8306760191917419, 0.045502372086048126, 0.0...",/projectnb/sparkgrp/ml-herbarium-grp/summer202...,[[[4121.703 7768.3623]\n [5421.0874 7768.362...


In [None]:
# Use TaxoNERD for entity recognition and linking against the GBIF backbone.
# Produces one entry per dataframe row in both output lists.

taxon_output = []       # best linked taxon name per image ("" when none)
confidence_output = []  # score of that taxon (0.0 when none)

# predict a taxon for the text detected in each image
for _, row in tqdm(combined_df.iterrows(), total=combined_df.shape[0]):
    # The Transcription column holds the string form of a list; parse it back
    segments = ast.literal_eval(row["Transcription"])

    # construct a single string out of all the detected text
    input_text = " ".join(segments)

    # Nothing was transcribed: skip the (expensive) NER call entirely
    if input_text == "":
        taxon_output.append("")
        confidence_output.append(0.0)
        continue

    doc = ner.find_in_text(input_text)

    try:
        # Keep the first linked candidate of every recognised entity.
        # Presumably each candidate is indexable as (id, name, score) -- inferred
        # from the indexing below; confirm against the TaxoNERD output format.
        entities = [entity[0] for entity in doc.entity]
    except AttributeError:
        # `doc` has no `entity` attribute when no entity is detected
        entities = []

    if entities:
        # keep the linked taxon with the highest score
        best = max(entities, key=lambda candidate: candidate[2])
        taxon_output.append(str(best[1]))
        confidence_output.append(float(best[2]))
    else:
        # max() would raise ValueError on an empty list, so handle it explicitly
        taxon_output.append("")
        confidence_output.append(0.0)

In [None]:
# Use the spaCy model to recognize locations and dates in the text.
# Both output lists get exactly one entry (a list of entity strings, possibly
# empty) per dataframe row, so they stay aligned with combined_df.

location_output = []
date_output = []

# extract locations and dates from the text detected in each image
for _, row in tqdm(combined_df.iterrows(), total=combined_df.shape[0]):
    # The Transcription column holds the string form of a list; parse it back
    segments = ast.literal_eval(row["Transcription"])

    # construct a single string out of all the detected text
    input_text = " ".join(segments)

    # Nothing was transcribed: skip the model call and record "no entities"
    if input_text == "":
        location_output.append([])
        date_output.append([])
        continue

    doc_loc = nlp_loc(input_text)

    # Collect the location and date entities recognized in the text.
    # NOTE(review): only the "LOC" label is kept; if countries/cities/states are
    # wanted as well, the "GPE" label may also be needed -- confirm.
    loc_entities = [ent.text for ent in doc_loc.ents if ent.label_ == "LOC"]
    date_entities = [ent.text for ent in doc_loc.ents if ent.label_ == "DATE"]

    # Record the results for this row
    location_output.append(loc_entities)
    date_output.append(date_entities)

# Need to group the locations and dates found based on the label they were found in

In [52]:
# Append the predicted taxon and its confidence score to the dataframe as new columns
# (both lists need one entry per dataframe row)
combined_df["Taxon_Output"] = taxon_output
combined_df["Confidence_Output"] = confidence_output

In [53]:
# Preview the dataframe with the Taxon_Output and Confidence_Output columns
combined_df.head()

Unnamed: 0,Labels,Transcription,Transcription_Confidence,Image_Path,Bounding_Boxes,Taxon_Output,Confidence_Output
0,0,"['plants of California', 'Calaveras. Ranger St...","[0.0004661529092118144, 0.4830363392829895, 0....",/projectnb/sparkgrp/ml-herbarium-grp/summer202...,[[[4427.475 7904.178 ]\n [5505.9624 7904.178...,,0.0
1,1,"['Flora Hawaiiansis', 'Collected by C. N. Forb...","[0.040508534759283066, 0.0575096495449543, 9.6...",/projectnb/sparkgrp/ml-herbarium-grp/summer202...,[[[4610.7188 7824.25 ]\n [5379.172 7824.25 ...,Clermontia persicifolia,1.0
2,2,"['Dudley Herbarium of Stanford University', 'P...","[0.31743180751800537, 0.5072717070579529, 0.56...",/projectnb/sparkgrp/ml-herbarium-grp/summer202...,[[[2165.2695 4553.7295]\n [3353.666 4544.727...,,0.0
3,3,"['Elymius hystrix L.', 'det. J.J.N. Campbell -...","[0.009591127745807171, 0.9962418079376221]",/projectnb/sparkgrp/ml-herbarium-grp/summer202...,[[[4248.603 6308.9624]\n [4944.8623 6308.962...,Elymus hystrix hystrix,0.777422
4,4,"['Flora of Washington, D. c., and vicinity', '...","[0.8306760191917419, 0.045502372086048126, 0.0...",/projectnb/sparkgrp/ml-herbarium-grp/summer202...,[[[4121.703 7768.3623]\n [5421.0874 7768.362...,,0.0


In [82]:
# Score every prediction against the ground truth. This cell works on the in-memory
# `combined_df`, which must already carry the "Taxon_Output" column.

# list to store the computed similarity scores (one per dataframe row)
cosine_sim = []

for _, row in combined_df.iterrows():

    # extract the image name (file name without directory and extension)
    img_name = os.path.splitext(os.path.basename(row["Image_Path"]))[0]
    taxon_predicted = row["Taxon_Output"]
    # A missing prediction can show up as NaN (e.g. after a CSV round trip);
    # treat any non-string value as "no prediction".
    if not isinstance(taxon_predicted, str):
        taxon_predicted = ""
    # Raises KeyError if the image has no ground-truth entry
    taxon_gt = Taxon_truth[img_name]

    # compute the cosine similarity between the ground truth and the predicted taxon
    try:
        sim = cosdis(word2vec(taxon_gt), word2vec(taxon_predicted))
        cosine_sim.append(sim)

    except ZeroDivisionError:
        # an empty string has a zero-length vector; score it as no similarity
        cosine_sim.append(0)

# append the similarity scores to the dataframe and persist the result
combined_df["Cosine_Similarity"] = cosine_sim
combined_df.to_pickle(save_dir+"full_results_with_cossim.pkl")

In [146]:
# Reload the scored results (pd.read_pickle opens and closes the file itself)
combined_df = pd.read_pickle(save_dir+"full_results_with_cossim.pkl")

# a prediction counts as correct when its cosine similarity reaches this value
sim_threshold = 0.8

# confidence thresholds 0.0, 0.1, ..., 1.0; rounded so that accumulated float error
# (e.g. 0.30000000000000004) does not shift a cutoff
confidence_threshold = np.round(np.arange(0, 1.001, 0.1), 1)

# compute the prediction accuracy at each confidence threshold
records = []
for conf_threshold in confidence_threshold:

    # keep only the predictions at or above the current confidence threshold
    temp_df = combined_df[(combined_df["Confidence_Output"] >= conf_threshold)]

    acc_count = int((temp_df["Cosine_Similarity"] >= sim_threshold).sum())
    total = len(temp_df)

    # accuracy is undefined when no prediction passes the threshold
    acc_value = acc_count / total if total else float("nan")

    records.append([conf_threshold, acc_count, total, acc_value])

# build the summary table once instead of growing it row by row
final_taxon_prediction = pd.DataFrame(records,
                                      columns=["Confidence_Threshold", "Num_Taxons_Correct", 
                                               "Num_Taxons_Total", "Taxons_Accuracy_Predicted"])

display(final_taxon_prediction)

Unnamed: 0,Confidence_Threshold,Num_Taxons_Correct,Num_Taxons_Total,Taxons_Accuracy_Predicted
0,0.0,96.0,242.0,0.396694
1,0.1,96.0,168.0,0.571429
2,0.2,96.0,168.0,0.571429
3,0.3,96.0,168.0,0.571429
4,0.4,96.0,168.0,0.571429
5,0.5,95.0,156.0,0.608974
6,0.6,91.0,142.0,0.640845
7,0.7,83.0,126.0,0.65873
8,0.8,67.0,97.0,0.690722
9,0.9,39.0,58.0,0.672414


1. We first obtain the taxon predictions with a confidence score for each taxon.
2. We then compute cosine similarities of the predicted taxons with the ground truth taxons.
3. Then, at each confidence threshold, we count the predicted taxons that have a high cosine similarity with the ground truth. The scores above are computed for a cosine similarity cutoff of `>= 0.8`. This step is needed because the taxons are matched against entries from the `gbif_backbone` database (as part of TaxoNERD), so a predicted taxon might not exactly match the ground truth; the cosine similarity accounts for such near matches.

We can try to use the GBIF database to predict taxons and also experiment with different thresholds for the cosine similarity scores. But, in general, the chosen cosine similarity threshold offers an incremental performance upgrade compared to the last semester's work.