In [128]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
import pytesseract
from sklearn.feature_extraction.text import CountVectorizer

In [125]:
##############
# file paths #
##############

# All image folders sit under the same hydrated-tweets root, one
# sub-directory per folder name (5G, Non, Other, Test). Paths are relative
# to the notebook's working directory and keep their trailing slash.
_hydrated_root = "../dataset/tweets/hydrated/"

path_fiveg_images = _hydrated_root + "5G/images/"
path_nocon_images = _hydrated_root + "Non/images/"
path_other_images = _hydrated_root + "Other/images/"
path_test_images = _hydrated_root + "Test/images/"

In [3]:
#######################
# cv helper functions #
#######################

# from https://nanonets.com/blog/ocr-with-tesseract/
# see also https://github.com/bloomberg/scatteract/blob/master/tesseract.py

def thresholding(image):
    """Binarise ``image`` using Otsu's automatically chosen threshold.

    ``cv2.threshold`` hands back a ``(threshold_used, binary_image)`` pair;
    only the binary image is returned to the caller.
    """
    otsu_flags = cv2.THRESH_BINARY | cv2.THRESH_OTSU
    _, binary = cv2.threshold(image, 0, 255, otsu_flags)
    return binary

def canny(image):
    """Run the Canny edge detector on ``image``.

    The two hysteresis thresholds are fixed at 100 (lower) and 200 (upper).
    """
    lower, upper = 100, 200
    edges = cv2.Canny(image, lower, upper)
    return edges

def deskew(image):
    """Rotate ``image`` so that its non-zero pixels are axis-aligned.

    The skew is estimated from the minimum-area rectangle enclosing every
    pixel whose value is greater than zero, then undone by rotating about
    the image centre.

    Args:
        image: array indexed as ``image[row, col]``; the caller in this
            notebook passes a single-channel grayscale image.

    Returns:
        The rotated image, same width and height as the input. If no pixel
        is greater than zero there is nothing to fit, and an unrotated copy
        is returned.
    """
    # np.where yields platform-sized integer indices (int64 on common
    # platforms); minAreaRect expects an int32 or float32 point set, so cast.
    coords = np.column_stack(np.where(image > 0)).astype(np.float32)
    if coords.shape[0] == 0:
        # blank image: no foreground to estimate a skew from
        return image.copy()

    angle = cv2.minAreaRect(coords)[-1]

    # Fold the reported angle into the smallest correcting rotation.
    # Older OpenCV builds report the angle in [-90, 0); newer ones
    # (reportedly from 4.5.1) report (0, 90] for the same rectangle, i.e.
    # shifted by +90. Both conventions are handled below.
    if angle < -45:
        angle = -(90 + angle)
    elif angle >= 45:
        angle = 90 - angle
    else:
        angle = -angle

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    # replicate border pixels so the rotation does not introduce black corners
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

In [98]:
#############################
# create english dictionary #
#############################

# from https://inventwithpython.com/hacking/chapter12.html

dictionary = list()

# the word list is stored one entry per line; splitting on '\n' keeps
# whatever follows the final newline (usually an empty string) as an entry
with open('./output/dictionary.txt', 'r') as word_file:
    dictionary += word_file.read().split('\n')

# add special terms
special_terms = ['5G','COVID','CORONAVIRUS','COVID19','COVID-19','19']
dictionary += special_terms

In [None]:
########################
# get text from images #
########################

# filename -> space-separated dictionary terms found in that image
img_text = dict()

# config options:
#  oem 3 - use best available engine
#  psm 12 - greedy grab sparse text
custom_config = r'--oem 3 --psm 12'

# tokenizer used with the nlp classifier
tokenize = CountVectorizer().build_tokenizer()

# `dictionary` is a list, so `x in dictionary` is a linear scan per token;
# build the lookup set once, outside the loop
dictionary_words = set(dictionary)

# output image text to csv; the context manager closes the file even if
# an OCR call raises part-way through the directory
with open('image_terms_test.csv', 'w') as f:

    # headers
    f.write('filename,terms\n')

    # sorted so the rows come out in the same order on every run
    for filename in sorted(os.listdir(path_test_images)):
        path = os.path.join(path_test_images, filename)
        if not os.path.isfile(path):
            continue

        orig_img = cv2.imread(path)
        if orig_img is None:
            # cv2.imread signals an unreadable / non-image file by returning
            # None instead of raising
            print(f'skipping {path}: not a readable image')
            continue

        # we want the best processing possible per image, so we try
        # multiple types of processing and pool the text of every pass
        ocr_passes = []

        # pass 1: the image as loaded. OpenCV loads BGR while pytesseract
        # assumes RGB, so convert first. (The original code passed `img`
        # here, which is not yet assigned for the current file.)
        rgb_img = cv2.cvtColor(orig_img, cv2.COLOR_BGR2RGB)
        ocr_passes.append(pytesseract.image_to_string(rgb_img, config=custom_config))

        # pass 2: grayscale
        img = cv2.cvtColor(orig_img, cv2.COLOR_BGR2GRAY)
        ocr_passes.append(pytesseract.image_to_string(img, config=custom_config))

        # pass 3: deskewed grayscale. Best effort: on failure keep the
        # un-deskewed image for the next pass, but report why it was skipped
        try:
            img = deskew(img)
            ocr_passes.append(pytesseract.image_to_string(img, config=custom_config))
        except Exception as exc:
            print(f'deskew pass skipped for {path}: {exc}')

        # pass 4: Otsu-binarised
        img = thresholding(img)
        ocr_passes.append(pytesseract.image_to_string(img, config=custom_config))

        # join with a newline so the last word of one pass cannot fuse with
        # the first word of the next; upper-case to match the special terms
        # added to the dictionary (the word list is presumably upper-case too)
        text = '\n'.join(ocr_passes).upper()
        tokens = tokenize(text)

        # creating a set of words, will lose repeats
        english_words = {x.lower() for x in tokens if x in dictionary_words}

        # set iteration order is not stable between runs; sort for a
        # reproducible csv
        terms = ' '.join(sorted(english_words))

        # keep the result in memory too, so it can be inspected in the
        # notebook without re-reading the csv
        img_text[filename] = terms

        f.write(f'{filename},{terms}\n')