In [33]:
import os
import shutil
from collections import defaultdict
from tqdm import tqdm
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from torchvision.transforms import transforms
from torchvision.transforms.functional import pad

from utils.utils import *
from craft_text_detector import Craft

In [1]:
# --in-dir
# --out-dir
# --model-path

In [None]:
workdir = "/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/TROCR_Training/goodfiles/"
output_dir_craft = "data/Doc_Classification/input"

# initialize the CRAFT model
craft = Craft(output_dir = output_dir_craft, 
              export_extra = False, 
              text_threshold = .7, 
              link_threshold = .4, 
              crop_type="poly", 
              low_text = .3, 
              cuda = True)

# CRAFT on images to get bounding boxes
images = []
corrupted_images = []
no_segmentations = []
boxes = {}
count= 0
img_name = []
box = []
file_types = (".jpg", ".jpeg",".png")
    
for filename in tqdm(sorted(os.listdir(workdir))):
    if filename.endswith(file_types):
        image = workdir+filename
        try:
            img = Image.open(image) 
            img.verify() # Check that the image is valid
            bounding_areas = craft.detect_text(image)
            if len(bounding_areas['boxes']): #check that a segmentation was found
                images.append(image)
                boxes[image] = bounding_areas['boxes']
                
            else:
                no_segmentations.append(image)
        except (IOError, SyntaxError) as e:
            corrupted_images.append(image)

In [34]:
preprocess = transforms.Compose([
    transforms.Resize([100, ]),
    transforms.Lambda(
        lambda img: pad(img, padding=(0, 0, max(0, 200 - img.width), max(0, 100 - img.height)), 
                                              fill=(255, 255, 255))),
    transforms.CenterCrop((100, 200)),
    PartialErosion(iterations=2),
    transforms.ToTensor(),
    transforms.Grayscale(num_output_channels=3),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

In [35]:
# Move the model to the device (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
device

device(type='cuda')

In [36]:
# vgg_from_scratch_w_erosion_v2.pth

best_model = models.densenet121(weights="DenseNet121_Weights.IMAGENET1K_V1")
# Modify the model for binary classification
num_ftrs = best_model.classifier.in_features
best_model.classifier = nn.Linear(num_ftrs, 1)

# best_model = VGG16Binary(input_shape=(3, 100, 200), num_classes=1)  # Create an instance of the model

best_model.load_state_dict(torch.load('model/densenet_10_5_layers_unlocked.pth'))
best_model = best_model.to(device)

In [37]:
score_sum_dict = defaultdict(lambda: [0, 0]) # file_name: (hw_confidence, typed_confidence)
score_len_dict = defaultdict(lambda: [0, 0]) # file_name: (hw_count, typed_count)

In [38]:
craft_output_dir = "/projectnb/sparkgrp/kabilanm/goodfilescraft/"

for dir_ in os.listdir(craft_output_dir):
    for file in os.listdir(os.path.join(craft_output_dir, dir_)):
#         print(dir_+"/"+file)
        
        key = dir_.split("_")[0]
        
        img = Image.open(craft_output_dir+dir_+"/"+file)
        input_tensor = preprocess(img)

        # Expand dimensions
        input_tensor = torch.unsqueeze(input_tensor, 0)
        input_tensor = input_tensor.to('cuda')

        best_model.eval()
        with torch.no_grad():
            outputs = best_model(input_tensor)
            
            pred_confidence = torch.sigmoid(outputs)
            predicted = torch.round(pred_confidence)
            
            if(predicted == 0):
                score_sum_dict[key][0] += 1-pred_confidence
                score_len_dict[key][0] += 1
            if(predicted == 1):
                score_sum_dict[key][1] += pred_confidence
                score_len_dict[key][1] += 1

In [39]:
score_sum_dict = dict(score_sum_dict)
score_len_dict = dict(score_len_dict)

In [40]:
score_avg_dict = defaultdict(lambda: [0, 0])

In [41]:
hw_score, typed_score = 0, 0

for sum_, len_ in zip(score_sum_dict.items(), score_len_dict.items()):
    if(len_[1][0] == 0):
        hw_score = 0
    elif(len_[1][1] == 0):
        typed_score = 0
    else:
        hw_score = sum_[1][0]/len_[1][0]
        typed_score = sum_[1][1]/len_[1][1]
    score_avg_dict[sum_[0]] = [hw_score, typed_score]

In [42]:
score_avg_dict = dict(score_avg_dict)
score_avg_dict

{'2452230713': [tensor([[0.9985]], device='cuda:0'),
  tensor([[0.9889]], device='cuda:0')],
 '2900445104': [tensor([[0.9697]], device='cuda:0'),
  tensor([[0.9347]], device='cuda:0')],
 '2575039168': [tensor([[1.0000]], device='cuda:0'),
  tensor([[0.9626]], device='cuda:0')],
 '2236018163': [tensor([[0.9718]], device='cuda:0'),
  tensor([[0.9971]], device='cuda:0')],
 '2235956175': [tensor([[0.9574]], device='cuda:0'),
  tensor([[0.9632]], device='cuda:0')],
 '2426921679': [0, tensor([[0.9632]], device='cuda:0')],
 '2859042459': [tensor([[0.9911]], device='cuda:0'),
  tensor([[0.9875]], device='cuda:0')],
 '1563240212': [tensor([[0.9300]], device='cuda:0'),
  tensor([[0.9940]], device='cuda:0')],
 '3467354375': [tensor([[0.9843]], device='cuda:0'),
  tensor([[0.8970]], device='cuda:0')],
 '1998387136': [tensor([[0.9995]], device='cuda:0'),
  tensor([[0.9951]], device='cuda:0')],
 '2265485412': [tensor([[0.9456]], device='cuda:0'),
  tensor([[0.9668]], device='cuda:0')],
 '1998758107'

In [46]:
input_dir = "/projectnb/sparkgrp/ml-herbarium-grp/ml-herbarium-data/TROCR_Training/goodfiles/"
output_dir = "data/Doc_Classification/output/"

for file_name, avg_scores in score_avg_dict.items():
#     print(input_dir+file_name+".jpg", scores)
#     print(file_name, avg_scores)
    
    source_file = input_dir+file_name+".jpg"
    
    # Copy the file using shutil.copy2 to the corresponding directory
    # based on the average prediction score
    if(avg_scores[0] >= avg_scores[1]):
        # print("handwritten")
        shutil.copy2(source_file, os.path.join(output_dir, "handwritten"))
        
    # add some bias here
    if(avg_scores[0] < avg_scores[1]):
        # print("typed")
        shutil.copy2(source_file, os.path.join(output_dir, "typed"))

handwritten
handwritten
handwritten
typed
typed
typed
handwritten
typed
handwritten
handwritten
typed
typed
typed
typed
typed
handwritten
handwritten
handwritten
typed
handwritten
handwritten
typed
typed
handwritten
typed
handwritten
typed
typed
handwritten
handwritten
typed
typed
typed
handwritten
typed
typed
typed
typed
handwritten
typed
typed
typed
typed
typed
handwritten
typed
handwritten
handwritten
typed
typed
handwritten
handwritten
typed
handwritten
typed
typed
handwritten
typed
typed
typed
typed
handwritten
typed
handwritten
typed
handwritten
handwritten
typed
typed
typed
typed
handwritten
typed
handwritten
handwritten
typed
typed
handwritten
typed
typed
typed
typed
typed
handwritten
typed
typed
typed
typed
typed
typed
typed
typed
typed
typed
typed
handwritten
handwritten
handwritten
handwritten
handwritten
typed
typed
handwritten
typed
typed
typed
typed
typed
typed
typed
typed
handwritten
typed
typed
typed
typed
handwritten
typed
typed
handwritten
handwritten
handwritten
type

In [47]:
! ls data/Doc_Classification/output/handwritten/ | wc -l
! ls data/Doc_Classification/output/typed/ | wc -l

79
170


In [48]:
! rm -rf data/Doc_Classification/output/handwritten/*
! rm -rf data/Doc_Classification/output/typed/*