# Imports

In [14]:
#Imports and installs
import transformers
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import transformers
from craft_text_detector import Craft # Need to edit the saving function to prepend 0's
import requests 
import torch
import os, random
from PIL import Image,ImageFilter
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets
from torch.utils.data import Dataset
from tqdm import tqdm
import pandas as pd

import numpy as np
import imghdr
import pickle
from pathlib import Path
import cv2
import torch.nn.functional as F
import multiprocessing
from functools import partial
import json

import matplotlib.pyplot as plt
import warnings
import time

import trocr
import matching
import predictions
import results

from taxonerd import TaxoNERD

# Directories

In [2]:
# Silence the Hugging Face log output: only messages at error level or above are shown
SUPPRESS = True
if SUPPRESS:
    from transformers.utils import logging
    logging.set_verbosity_error()

# This warning is not relevant for this application, so it is filtered out
warnings.filterwarnings(
    action="ignore",
    message="(Possibly )?corrupt EXIF data",
    category=UserWarning,
)

# --- Input / output locations (update these to the desired directories on scc) ---
# Input images
workdir = '/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/TROCR_Training/goodfiles/'
# Where CRAFT writes the segmentations
output_dir_craft = '/projectnb/sparkgrp/kabilanm/goodfilescraft/'
# Where every output file of this notebook is saved
save_dir = '/usr4/ds549/kabilanm/saved_results/'
# Ground truth labels
workdir2 = '/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/scraped-data/drago_testdata/gt_labels'

# --- Corpus files ---
ALL_SPECIES_FILE = '/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/corpus_taxon/output/possible_species.pkl'
ALL_GENUS_FILE = '/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/corpus_taxon/output/possible_genus.pkl'
# Alternative (shared project) location of the taxon corpus:
# ALL_TAXON_FILE = '/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-new/ml-herbarium/corpus/corpus_taxon/corpus_taxon.txt'
ALL_TAXON_FILE = '/usr4/ds549/kabilanm/ml-herbarium/corpus/corpus_taxon/corpus_taxon.txt'

# Running craft and saving the segmented images

In [3]:
# Number of entries in the image directory. Every directory entry is counted,
# not only image files; no sorting or progress bar is needed just to count.
count = len(os.listdir(workdir))

print(count)

100%|██████████| 254/254 [00:00<00:00, 2536555.28it/s]

254





In [4]:
# Initialize the CRAFT text detector. Crops are written below `output_dir_craft`.
# The GPU is only requested when one is actually available, so the cell also
# runs on a CPU-only machine.
craft = Craft(
    output_dir=output_dir_craft,
    export_extra=False,
    text_threshold=.7,
    link_threshold=.4,
    crop_type="poly",
    low_text=.3,
    cuda=torch.cuda.is_available(),
)

# Bookkeeping for the detection pass
images = []            # paths of images with at least one detected box
corrupted_images = []  # paths of images that could not be opened / verified
no_segmentations = []  # paths of valid images for which CRAFT returned no box
boxes = {}             # image path -> boxes returned by CRAFT
file_types = (".jpg", ".jpeg", ".png")

for filename in tqdm(sorted(os.listdir(workdir))):
    # Compare extensions case-insensitively so that e.g. ".JPG" is not silently skipped
    if not filename.lower().endswith(file_types):
        continue

    image = os.path.join(workdir, filename)
    try:
        # The context manager closes the file handle again after the validity check
        with Image.open(image) as img:
            img.verify()
        bounding_areas = craft.detect_text(image)
    except (IOError, SyntaxError):
        corrupted_images.append(image)
        continue

    if len(bounding_areas['boxes']):  # a segmentation was found
        images.append(image)
        boxes[image] = bounding_areas['boxes']
    else:
        no_segmentations.append(image)

  polys = np.array(polys)
  polys_as_ratio = np.array(polys_as_ratio)
100%|██████████| 254/254 [11:06<00:00,  2.62s/it]


# Getting all the segmented images into a dataloader, and loading model and processor for trocr

In [5]:
# Delete empty folders below the CRAFT output directory; these occur when an
# image gets no segmentation from CRAFT.
root = output_dir_craft
# Every (dirpath, dirnames, filenames) triple below the root; the root itself is skipped
folders = list(os.walk(root))[1:]
deleted = []
for folder in folders:
    dirpath, subdirs, files = folder
    # os.rmdir only works on a directory with no files AND no sub-directories;
    # checking both avoids an OSError on folders that merely contain no files.
    if not files and not subdirs:
        os.rmdir(dirpath)
        # Recorded only after the removal actually succeeded
        deleted.append(folder)
        
# TrOCR model and processor, placed on the GPU when one is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
model = model.to(device)

# Wrapper that spreads the model over every visible GPU.
# NOTE(review): the evaluation cell further down passes `model`, not `model_gpu` -
# confirm whether the wrapper is still needed.
model_gpu = nn.DataParallel(model, device_ids=[*range(torch.cuda.device_count())]).to(device)

# Dataset over the CRAFT crop folders (one class per folder) and its loader
trainset = datasets.ImageFolder(root=output_dir_craft, transform=processor)
testloader = DataLoader(trainset, batch_size=16, shuffle=False)

# Class (folder) names without the '_crops' suffix, used to match words to images
filenames = [folder_name.replace('_crops', '') for folder_name in trainset.class_to_idx]

# Position in `filenames` -> image name, for matching the image name with the label
word_log_dic = dict(enumerate(filenames))
# Image name -> (initially empty) list, for matching the image name with the transcriptions
words_identified = {name: [] for name in filenames}

# Saving the filenames, word_log_dic and words_identified

In [6]:
# filenames: plain text, one name per line
with open(save_dir+'filenames.txt', 'w') as names_file:
    names_file.writelines(f"{name}\n" for name in filenames)

# word_log_dic: JSON
with open(save_dir+'word_log_dic.json', 'w') as log_file:
    log_file.write(json.dumps(word_log_dic))

# words_identified: JSON
with open(save_dir+'words_identified.json', 'w') as words_file:
    words_file.write(json.dumps(words_identified))

# Running TrOCR on the segmented images from CRAFT

In [7]:
# Run TrOCR over every crop. The transcriptions are bound to `ocr_results`
# rather than `results`, so the `results` module imported at the top of the
# notebook is not overwritten.
ocr_results, confidence, labels = trocr.evaluate_craft_seg(
    model, processor, words_identified, word_log_dic, testloader, device
)

# One row per crop: transcription, its confidence and the label it belongs to
df = pd.DataFrame(
    list(zip(ocr_results, confidence, labels)),
    columns=['Results', 'Confidence', 'Labels'],
)
df.to_pickle(save_dir+'full_results.pkl')

Transcribing Image Segments: 100%|██████████| 341/341 [04:57<00:00,  1.15it/s]


In [8]:
# First part of the final table: results, TrOCR confidence and label
combined_df = trocr.combine_by_label(df)

# Second part: one row per segmented image with its path and all bounding boxes
df_dictionary = pd.DataFrame(boxes.items(), columns=['Image_Path', 'Bounding_Boxes'])

# NOTE(review): this concat aligns the two frames on their row index, not on the
# image name - confirm that both frames list the images in the same order.
combined_df = pd.concat(
    [combined_df, df_dictionary],
    axis="columns",
    join="inner",
)
display(combined_df.head())

Unnamed: 0,Labels,Transcription,Transcription_Confidence,Image_Path,Bounding_Boxes
0,0,"[Museum d'Histoire naturelle de Paris, Herbier...","[0.8803507685661316, 0.9324854016304016, 0.997...",/projectnb/sparkgrp/ml-herbarium-grp/ml-herbar...,"[[[3858.9062, 280.7068], [5929.0693, 357.37952..."
1,1,"[100s, 100p, 0, 100p., 2nd, 100,, top, 100, 10...","[0.03950851410627365, 0.061212457716464996, 0....",/projectnb/sparkgrp/ml-herbarium-grp/ml-herbar...,"[[[6485.1157, 425.71875], [6612.831, 425.71875..."
2,2,"[0-, 100,, ed state., 1627083, United States n...","[0.039854906499385834, 0.11935094743967056, 0....",/projectnb/sparkgrp/ml-herbarium-grp/ml-herbar...,"[[[6399.1187, 307.38126], [6524.8657, 307.3812..."
3,3,"[us, 8.810, own, copyright reserved, 1685951, ...","[0.361989825963974, 0.2914508879184723, 0.2818...",/projectnb/sparkgrp/ml-herbarium-grp/ml-herbar...,"[[[3286.3594, 168.53125], [3539.1562, 168.5312..."
4,4,"[100, flora of the, Washington Baltimore area,...","[0.05119089409708977, 0.6536547541618347, 0.98...",/projectnb/sparkgrp/ml-herbarium-grp/ml-herbar...,"[[[6408.428, 710.46875], [6536.3125, 710.46875..."


In [11]:
# Save intermediate file. `save_dir` already ends with '/', so no extra
# separator is added (consistent with the other saves in this notebook).
combined_df.to_pickle(save_dir+'test.pkl')

# Use TaxoNERD to recognize taxa in the detected text

In [19]:
# Set prefer_gpu to True if a GPU is accessible
taxonerd = TaxoNERD(prefer_gpu=False)

# Load the recognition model together with the entity linker.
# NOTE(review): `nlp` is not referenced by the cells visible below; later cells
# go through `taxonerd` directly.
nlp = taxonerd.load(model="en_core_eco_biobert", linker="ncbi_taxonomy", threshold=0)

In [32]:
count = 0          # number of images for which a taxon was linked
taxon_output = []  # one entry per row of combined_df; "" when nothing was linked

# TODO: test "gbif_backbone" linker -> more species here
# TODO: use BERT for person and location

# Predict a taxon from the text detected in each image
for transcription in combined_df["Transcription"]:
    # Construct a single string out of all the detected text
    input_text = " ".join(transcription)
    doc = taxonerd.find_in_text(input_text)

    try:
        # First candidate of the first detected entity. `.iloc[0]` is positional,
        # so it works whatever the row labels of `doc` are.
        # Assumes each candidate is indexable as (id, name, score) - TODO confirm
        # against the TaxoNERD output format.
        top_candidate = doc.entity.iloc[0][0]
        linked_name, link_confidence = top_candidate[1], top_candidate[2]
    except (AttributeError, KeyError, IndexError, TypeError):
        # No entity column, no rows, or an empty / missing candidate list:
        # keep the output aligned with combined_df by appending an empty string
        taxon_output.append("")
        continue

    print("Linked name: ", linked_name, " -> Confidence: ", link_confidence)
    count += 1
    taxon_output.append(str(linked_name))

Linked name:  Ferraria  -> Confidence:  0.6746945977210999
Linked name:  Chlopsis fierasfer  -> Confidence:  0.448846697807312
Linked name:  Clermontia persicifolia  -> Confidence:  0.6184449195861816
Linked name:  Elymus hystrix L.  -> Confidence:  0.7790964841842651
Linked name:  Bucculatrix clavenae  -> Confidence:  0.38107770681381226
Linked name:  Hohenbergia antillana Mez  -> Confidence:  0.9376682043075562
Linked name:  Nectria austroamericana  -> Confidence:  0.6761608719825745
Linked name:  Abbottella abbotti  -> Confidence:  0.6212729811668396
Linked name:  Hysterium herbarum  -> Confidence:  0.5404318571090698
Linked name:  Plasmodium hispaniolae  -> Confidence:  0.7043249011039734
Linked name:  Argythamnia  -> Confidence:  0.6870838403701782
Linked name:  Tripsacum maizar  -> Confidence:  0.7003204226493835
Linked name:  Drymaria cordata  -> Confidence:  0.5393434166908264
Linked name:  Phyllanthus acidus  -> Confidence:  0.7807743549346924
Linked name:  Asclepias subvertic

In [33]:
# Images with a linked taxon, then the total number of processed rows
print(count, len(taxon_output), sep="\n")

222
249


In [44]:
# Attach the linked taxon of every row as a new column
combined_df["Taxon_Output"] = taxon_output

# Preview: image path next to the taxon predicted for it
combined_df.loc[:, ["Image_Path", "Taxon_Output"]].head()

Unnamed: 0,Image_Path,Taxon_Output
0,/projectnb/sparkgrp/ml-herbarium-grp/ml-herbar...,Ferraria
1,/projectnb/sparkgrp/ml-herbarium-grp/ml-herbar...,Chlopsis fierasfer
2,/projectnb/sparkgrp/ml-herbarium-grp/ml-herbar...,Clermontia persicifolia
3,/projectnb/sparkgrp/ml-herbarium-grp/ml-herbar...,
4,/projectnb/sparkgrp/ml-herbarium-grp/ml-herbar...,Elymus hystrix L.


In [37]:
# Show the first ten predictions; an empty string means no taxon was linked
print(taxon_output[:10])

['Ferraria', 'Chlopsis fierasfer', 'Clermontia persicifolia', '', 'Elymus hystrix L.', 'Bucculatrix clavenae', 'Hohenbergia antillana Mez', 'Nectria austroamericana', 'Abbottella abbotti', '']


# Reading in the ground truth files for tested images

In [35]:
# Reading in the ground truth values

def _read_ground_truth(path):
    """Parse a ground-truth file into a dict.

    Each line is split at its first ':'; the text before it becomes the key and
    the stripped remainder becomes the value. Lines without a ':' (for example
    blank lines) are skipped. The file is closed once it has been read.
    """
    truth = {}
    with open(path) as gt_file:
        for line in gt_file:
            key, separator, value = line.partition(":")
            if not separator:
                continue
            truth[key] = value.strip()
    return truth


gt_t = workdir2+'/taxon_gt.txt'
Taxon_truth = _read_ground_truth(gt_t)

gt_g = workdir2+'/geography_gt.txt'
Geography_truth = _read_ground_truth(gt_g)

gt_c = workdir2+'/collector_gt.txt'
Collector_truth = _read_ground_truth(gt_c)

# All ground-truth dictionaries, keyed by the kind of label they hold
comparison_file = {"Taxon":Taxon_truth,"Countries":Geography_truth,"Collector":Collector_truth}

In [73]:
# First ten entries of the taxon ground truth, in file order
Taxon_truth_sample = dict(list(Taxon_truth.items())[:10])

# View the subset
print(Taxon_truth_sample)

{'1697659851': 'Euphrasia officinalis', '2573258025': 'Bryoerythrophyllum recurvirostrum', '2597666444': 'Carduus tenuiflorus', '1931288980': 'Agoseris parviflora', '1930241969': 'Spiraea canescens', '1929944910': 'Chylismia scapoidea', '1931007576': 'Carex typhina', '1928514234': 'Stachys hispida', '1928658806': 'Solanum donianum', '1931124118': 'Suaeda nigra'}


In [53]:
# utility functions for finding cosine similarity

def word2vec(word):
    """Turn a string into a character-count vector.

    Returns a 3-tuple:
      * a Counter mapping each character of `word` to its number of occurrences,
      * the set of distinct characters,
      * the Euclidean length of the count vector (0.0 for an empty string).

    Only character counts are kept, so the order of the characters in `word`
    does not influence the result.
    """
    from collections import Counter
    from math import sqrt

    char_counts = Counter(word)
    distinct_chars = set(char_counts.keys())
    squared_counts = [n * n for n in char_counts.values()]
    norm = sqrt(sum(squared_counts))

    return char_counts, distinct_chars, norm

def cosdis(v1, v2):
    """Cosine similarity of two vectors in the (counts, characters, length) form
    produced by `word2vec`.

    Raises ZeroDivisionError when either vector has length 0.
    """
    counts_a, chars_a, norm_a = v1[0], v1[1], v1[2]
    counts_b, chars_b, norm_b = v2[0], v2[1], v2[2]

    # Only characters present in both words contribute to the dot product
    shared = chars_a.intersection(chars_b)
    dot = sum(counts_a[ch] * counts_b[ch] for ch in shared)

    return dot / norm_a / norm_b

# Checking Accuracy Against Taxon Truth File

In [78]:
# Similarity thresholds at which the predictions are scored
sim_threshold = np.arange(0.1, 1, 0.1)

# Cosine similarity between the predicted taxon and the ground truth, per image
cosine_sim = []

for image_path, taxon_predicted in zip(combined_df["Image_Path"], combined_df["Taxon_Output"]):
    # Image name = file name without its extension. Path.stem handles extensions
    # of any length, e.g. ".jpeg" as well as ".jpg" / ".png".
    img_name = Path(image_path).stem
    taxon_gt = Taxon_truth[img_name]

    try:
        sim = cosdis(word2vec(taxon_gt), word2vec(taxon_predicted))
    except ZeroDivisionError:
        # An empty prediction (or empty ground truth) has a zero-length vector;
        # it is scored as similarity 0
        sim = 0
    cosine_sim.append(sim)

# Append the similarity scores to the dataframe
combined_df["Cosine_Similarity"] = cosine_sim

# For each threshold: how many predictions exceed it, and which share of all
# images that is. The rows are collected first and the frame is built once.
n_images = combined_df.shape[0]
threshold_rows = []
for threshold in sim_threshold:
    acc_count = (combined_df["Cosine_Similarity"] > threshold).sum()
    acc_val = acc_count/n_images
    threshold_rows.append({
        "Threshold": threshold,
        "Taxons_Predicted": acc_count,
        "Taxon_Accuracy_Predicted": acc_val,
    })

final_taxon_prediction = pd.DataFrame(
    threshold_rows,
    columns=["Threshold", "Taxons_Predicted", "Taxon_Accuracy_Predicted"],
)

final_taxon_prediction

Unnamed: 0,Threshold,Taxons_Predicted,Taxon_Accuracy_Predicted
0,0.1,222.0,0.891566
1,0.2,221.0,0.88755
2,0.3,219.0,0.879518
3,0.4,208.0,0.835341
4,0.5,192.0,0.771084
5,0.6,167.0,0.670683
6,0.7,131.0,0.526104
7,0.8,95.0,0.381526
8,0.9,66.0,0.26506
