In [9]:
import os
import shutil
from collections import defaultdict
from tqdm import tqdm
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from torchvision.transforms import transforms
from torchvision.transforms.functional import pad

from utils.utils import *
from craft_text_detector import Craft

from transformers import (TrOCRProcessor, 
                        TrOCRForCausalLM, 
                        VisionEncoderDecoderModel)

In [6]:
# --in-dir
# --out-dir
# --model-path

In [None]:
workdir = "/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/TROCR_Training/goodfiles/"
output_dir_craft = "data/Doc_Classification/input"

# initialize the CRAFT model
craft = Craft(output_dir = output_dir_craft, 
              export_extra = False, 
              text_threshold = .7, 
              link_threshold = .4, 
              crop_type="poly", 
              low_text = .3, 
              cuda = True)

# CRAFT on images to get bounding boxes
images = []
corrupted_images = []
no_segmentations = []
boxes = {}
count= 0
img_name = []
box = []
file_types = (".jpg", ".jpeg",".png")
    
for filename in tqdm(sorted(os.listdir(workdir))):
    if filename.endswith(file_types):
        image = workdir+filename
        try:
            img = Image.open(image) 
            img.verify() # Check that the image is valid
            bounding_areas = craft.detect_text(image)
            if len(bounding_areas['boxes']): #check that a segmentation was found
                images.append(image)
                boxes[image] = bounding_areas['boxes']
                
            else:
                no_segmentations.append(image)
        except (IOError, SyntaxError) as e:
            corrupted_images.append(image)

In [34]:
# preprocess = transforms.Compose([
#     transforms.Resize([100, ]),
#     transforms.Lambda(
#         lambda img: pad(img, padding=(0, 0, max(0, 200 - img.width), max(0, 100 - img.height)), 
#                                               fill=(255, 255, 255))),
#     transforms.CenterCrop((100, 200)),
#     PartialErosion(iterations=2),
#     transforms.ToTensor(),
#     transforms.Grayscale(num_output_channels=3),
#     transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
# ])

In [7]:
# Move the model to the device (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
device

device(type='cuda')

In [14]:
# # vgg_from_scratch_w_erosion_v2.pth

# best_model = models.densenet121(weights="DenseNet121_Weights.IMAGENET1K_V1")
# # Modify the model for binary classification
# num_ftrs = best_model.classifier.in_features
# best_model.classifier = nn.Linear(num_ftrs, 1)

# # best_model = VGG16Binary(input_shape=(3, 100, 200), num_classes=1)  # Create an instance of the model

# best_model.load_state_dict(torch.load('model/densenet_10_5_layers_unlocked.pth'))
# best_model = best_model.to(device)

processor = TrOCRProcessor.from_pretrained('microsoft/trocr-large-stage1')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-large-stage1')

# Freeze TrOCR layers
for param in model.parameters():
    param.requires_grad = False

# Define our classifier
classifier = nn.Sequential(
    
    nn.Conv2d(1, 16, kernel_size=1, stride=1),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(2, 2),
    nn.Conv2d(16, 32, kernel_size=1, stride=1),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(2, 2),
    nn.Conv2d(32, 32, kernel_size=1, stride=1),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(2, 2),
    nn.Flatten(),
    nn.Linear(32 * (577 // 8) * (1024 // 8), 512),
    nn.ReLU(inplace=True),
    nn.Linear(512, 512),
    nn.ReLU(inplace=True),
    nn.Dropout(0.2),
    nn.Linear(512, 256),
    nn.ReLU(inplace=True),
    nn.Linear(256, 1)
)

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-large-stage1 and are newly initialized: ['encoder.pooler.dense.weight', 'encoder.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
classifier = torch.nn.DataParallel(classifier, [0]) # list(range(torch.cuda.device_count()))
classifier.load_state_dict(torch.load("model/TrOCR_L_enc_feature_extraction_w_classifier.pth"))

<All keys matched successfully>

In [17]:
model.to(device)
classifier.to(device)

DataParallel(
  (module): DataParallel(
    (module): Sequential(
      (0): Conv2d(1, 16, kernel_size=(1, 1), stride=(1, 1))
      (1): ReLU(inplace=True)
      (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (3): Conv2d(16, 32, kernel_size=(1, 1), stride=(1, 1))
      (4): ReLU(inplace=True)
      (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (6): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
      (7): ReLU(inplace=True)
      (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (9): Flatten(start_dim=1, end_dim=-1)
      (10): Linear(in_features=294912, out_features=512, bias=True)
      (11): ReLU(inplace=True)
      (12): Linear(in_features=512, out_features=512, bias=True)
      (13): ReLU(inplace=True)
      (14): Dropout(p=0.2, inplace=False)
      (15): Linear(in_features=512, out_features=256, bias=True)
      (16): ReLU(inplace=True)
      (17): Linear(in_featu

In [18]:
score_sum_dict = defaultdict(lambda: [0, 0]) # file_name: (hw_confidence, typed_confidence)
score_len_dict = defaultdict(lambda: [0, 0]) # file_name: (hw_count, typed_count)

In [21]:
craft_output_dir = "/projectnb/sparkgrp/kabilanm/goodfilescraft/"

for dir_ in os.listdir(craft_output_dir):
    for file in os.listdir(os.path.join(craft_output_dir, dir_)):
        
        key = dir_.split("_")[0]
        
        img = Image.open(craft_output_dir+dir_+"/"+file)
        
        pixel_values = processor(images=img, return_tensors="pt").pixel_values.to(device)
        encoder_outputs = model.encoder(pixel_values)
        
        image_representation = encoder_outputs.last_hidden_state

        classifier.eval()
        with torch.no_grad():
            classifier_output = classifier(image_representation.unsqueeze(1))
            
            pred_confidence = torch.sigmoid(classifier_output)
            predicted = torch.round(pred_confidence)
            
            if(predicted == 0):
                score_sum_dict[key][0] += 1-pred_confidence
                score_len_dict[key][0] += 1
            if(predicted == 1):
                score_sum_dict[key][1] += pred_confidence
                score_len_dict[key][1] += 1

In [22]:
score_sum_dict = dict(score_sum_dict)
score_len_dict = dict(score_len_dict)

In [23]:
score_avg_dict = defaultdict(lambda: [0, 0])

In [24]:
hw_score, typed_score = 0, 0

for sum_, len_ in zip(score_sum_dict.items(), score_len_dict.items()):
    if(len_[1][0] == 0):
        hw_score = 0
    elif(len_[1][1] == 0):
        typed_score = 0
    else:
        hw_score = sum_[1][0]/len_[1][0]
        typed_score = sum_[1][1]/len_[1][1]
    score_avg_dict[sum_[0]] = [hw_score, typed_score]

In [25]:
score_avg_dict = dict(score_avg_dict)
score_avg_dict

{'2452230713': [tensor([[0.9621]], device='cuda:0'),
  tensor([[0.8421]], device='cuda:0')],
 '2900445104': [tensor([[0.9422]], device='cuda:0'),
  tensor([[0.8795]], device='cuda:0')],
 '2575039168': [tensor([[1.]], device='cuda:0'),
  tensor([[0.8792]], device='cuda:0')],
 '2236018163': [tensor([[0.9824]], device='cuda:0'),
  tensor([[0.9793]], device='cuda:0')],
 '2235956175': [tensor([[0.9052]], device='cuda:0'),
  tensor([[0.9870]], device='cuda:0')],
 '2426921679': [tensor([[0.8984]], device='cuda:0'),
  tensor([[0.9489]], device='cuda:0')],
 '2859042459': [tensor([[0.8701]], device='cuda:0'),
  tensor([[0.9536]], device='cuda:0')],
 '1563240212': [tensor([[0.9861]], device='cuda:0'),
  tensor([[0.9476]], device='cuda:0')],
 '3467354375': [tensor([[1.0000]], device='cuda:0'),
  tensor([[0.9131]], device='cuda:0')],
 '1998387136': [tensor([[1.0000]], device='cuda:0'),
  tensor([[0.9457]], device='cuda:0')],
 '2265485412': [tensor([[0.9993]], device='cuda:0'),
  tensor([[0.9157]], 

In [26]:
input_dir = "/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/TROCR_Training/goodfiles/"
output_dir = "data/Doc_Classification/output/"

for file_name, avg_scores in score_avg_dict.items():
#     print(input_dir+file_name+".jpg", scores)
#     print(file_name, avg_scores)
    
    source_file = input_dir+file_name+".jpg"
    
    # Copy the file using shutil.copy2 to the corresponding directory
    # based on the average prediction score
    if(avg_scores[0] >= avg_scores[1]):
        # print("handwritten")
        shutil.copy2(source_file, os.path.join(output_dir, "handwritten"))
        
    # add some bias here
    if(avg_scores[0] < avg_scores[1]):
        # print("typed")
        shutil.copy2(source_file, os.path.join(output_dir, "typed"))

In [27]:
! ls data/Doc_Classification/output/handwritten/ | wc -l
! ls data/Doc_Classification/output/typed/ | wc -l

167
82


In [48]:
# ! rm -rf data/Doc_Classification/output/handwritten/*
# ! rm -rf data/Doc_Classification/output/typed/*