# Label Detection

## Axes Detection
This code gives a best estimate of the x and y axis (horizontal and vertical axes) for the plot/chart.

In [1]:
import cv2, imutils, re, xlsxwriter
import matplotlib.pyplot as plt
import numpy as np
import pytesseract
from pathlib import Path
from matplotlib import rcParams
from pytesseract import Output

In [2]:
# Input folder: the chart images the notebook is run on
img_dir = "test/sample"

# Output folder: directory to save the output images
save_dir = "out"

### Method to separate horizontal and vertical lines.

In [3]:
def segmentLines(lines, threshold = 10):
    """Split detected line entries into near-horizontal and near-vertical groups.

    Each entry of ``lines`` is an iterable of ``(x1, y1, x2, y2)`` segments.
    For every segment, the entry is added to the vertical group when its
    x-extent is below ``threshold``; otherwise it is added to the horizontal
    group when its y-extent is below ``threshold``. Segments that satisfy
    neither test are dropped.

    Returns:
        (hlines, vlines): two lists holding the original entries.
    """
    horizontal, vertical = [], []

    for entry in lines:
        for x_start, y_start, x_end, y_end in entry:
            x_extent = abs(x_end - x_start)
            y_extent = abs(y_end - y_start)

            # The vertical test takes precedence, so a very short segment
            # (small in both directions) counts as vertical only.
            if x_extent < threshold:
                vertical.append(entry)
                continue

            if y_extent < threshold:
                horizontal.append(entry)

    return horizontal, vertical

### Function to detect x-axis and y-axis for a plot

In [4]:
def detectAxes(filepath):
    """Estimate the x-axis and y-axis line segments of a chart image.

    The image is binarised, edges are extracted with Canny and long straight
    segments are found with the probabilistic Hough transform. Among the
    near-horizontal segments the one whose first endpoint has the largest
    y value is taken as the x-axis; among the near-vertical segments the one
    whose first endpoint has the smallest x value is taken as the y-axis.

    Args:
        filepath: path of the image file, or None.

    Returns:
        (xaxis, yaxis): the two chosen entries exactly as produced by
        cv2.HoughLinesP (each wraps one ``x1, y1, x2, y2`` segment), or
        (None, None) when the path is None, the image cannot be read, or no
        suitable pair of lines is found.
    """
    if filepath is None:
        return None, None

    image = cv2.imread(filepath)

    # cv2.imread reports a missing or undecodable file by returning None
    if image is None:
        return None, None

    height, width = image.shape[:2]

    # Convert the image to a blurred, adaptively thresholded grayscale image
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    gray = cv2.GaussianBlur(gray, (3, 3), 0)
    gray = cv2.adaptiveThreshold(gray,
                                 maxValue = 250,
                                 adaptiveMethod = cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                 thresholdType = cv2.THRESH_BINARY,
                                 blockSize = 11,
                                 C = 10)

    # Canny edge detection followed by a search for long line segments;
    # a segment must span at least half of the shorter image side.
    edges = cv2.Canny(gray, 0, 250, apertureSize = 3)
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180,
                            threshold = 200,
                            maxLineGap = 20,
                            minLineLength = min(height, width) // 2)

    # No lines detected
    if lines is None:
        return None, None

    # Segment the lines into horizontal and vertical
    h_lines, v_lines = segmentLines(lines, 10)

    # Without at least one line of each kind there are no axes to report.
    # May be pie charts or not a plot, check further later.
    if len(h_lines) == 0 or len(v_lines) == 0:
        return None, None

    try:
        # max/min return the first extreme entry, like list.index(max(...))
        xaxis = max(h_lines, key = lambda entry: entry[0][1])
        yaxis = min(v_lines, key = lambda entry: entry[0][0])
    except IndexError:
        # Defensive: an entry without a complete first segment
        return None, None

    return xaxis, yaxis

### Here we try to get text from the image using pytesseract

In [5]:
def getTextFromImage(filepath, xaxis, yaxis, grayscale=False):
    """Run OCR over a whole chart image and return the recognised words.

    Dark pixels (HSV value <= 130) are isolated, the mask is inverted to give
    dark glyphs on a white background, and the result is passed to Tesseract
    with ``--psm 11``.

    Args:
        filepath: path of the image file.
        xaxis, yaxis, grayscale: accepted for interface compatibility; they
            are not used by this function.

    Returns:
        A de-duplicated list of ``(text, (x, y, w, h))`` pairs, one per OCR
        box with a non-negative confidence. The order is unspecified. An
        empty list is returned when the image cannot be read.
    """
    image = cv2.imread(filepath)

    # cv2.imread returns None for a missing or undecodable file
    if image is None:
        return []

    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    # define range of black color in HSV
    lower_val = np.array([0, 0, 0])
    upper_val = np.array([179, 255, 130])

    # Threshold the HSV image to get only black colors
    mask = cv2.inRange(hsv, lower_val, upper_val)

    # invert the mask to get black letters on white background
    ocr_input = cv2.bitwise_not(mask)

    d = pytesseract.image_to_data(ocr_input, config = "-l eng --oem 1 --psm 11", output_type = Output.DICT)

    # A set removes duplicate (text, box) pairs as they are collected
    image_text = set()

    for i, word in enumerate(d['text']):
        # Pick only the non-negative confidence boxes. float() is used since
        # the confidence may be reported as an int, a float or a decimal
        # string, and int() rejects strings such as '96.5'.
        if float(d['conf'][i]) >= 0:
            box = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
            image_text.add((word, box))

    return list(image_text)

In [6]:
# Quick check: OCR a single sample figure and display the (text, box) pairs.
# getTextFromImage does not use its axis or grayscale arguments, so
# placeholder values are passed here.
getTextFromImage("test/sample/1-s2.0-S0016236119323397-main-Figure1-1.png", None, None, grayscale=True)

[('1', (462, 83, 5, 9)),
 ('0.8', (37, 263, 20, 12)),
 ('product', (430, 0, 51, 15)),
 ('COU', (255, 4, 29, 8)),
 ('+', (92, 212, 16, 9)),
 ('AL', (314, 92, 22, 10)),
 ('Is.', (296, 174, 19, 9)),
 ('50.6', (29, 9, 28, 12)),
 ('RS', (325, 283, 18, 12)),
 ('10.0', (31, 213, 26, 11)),
 ('20.0', (29, 162, 28, 11)),
 ('3', (0, 99, 12, 11)),
 ('40.0', (29, 60, 28, 12)),
 ('30.0', (29, 111, 28, 12)),
 ('cs', (118, 283, 19, 12)),
 ('3LI', (248, 91, 20, 10)),
 ('Solid', (169, 0, 34, 12)),
 ('L', (153, 67, 6, 10)),
 ('36.8', (453, 61, 23, 15)),
 ('Iss', (399, 170, 21, 10)),
 ('400', (144, 44, 24, 14)),
 ('PLOGUCT', (290, 0, 51, 15)),
 ('34.0', (350, 78, 24, 10)),
 ('ss', (428, 283, 17, 12)),
 ('I', (405, 184, 7, 9)),
 ('243', (414, 122, 23, 10)),
 ('199', (230, 150, 22, 9)),
 ('Aqueous', (366, 1, 58, 14)),
 ('197', (333, 151, 22, 9)),
 ('I', (115, 80, 7, 19)),
 ('M4', (211, 73, 22, 10)),
 ('PS', (223, 283, 16, 12)),
 ('Gi', (228, 1, 10, 11)),
 ('Ins', (127, 167, 21, 9)),
 ('lt', (192, 176, 21, 1

### This function is to calculate the probable x-labels, y-labels and legend text

The logic is as follows:

1. X-labels

    a. Consider only the text boxes that lie below the x-axis (and to the right of the y-axis).
    
    b. Sweep a horizontal line from the x-axis down to the bottom of the image, and record the position at which it intersects the maximum number of text boxes.
    
    c. The text boxes intersected at that position are taken as the x-labels.



2. Y-labels:

    a. Consider only the text boxes that lie to the left of the y-axis (and above the x-axis).
    
    b. Sweep a vertical line from the left edge of the image to the y-axis position, and record the position at which it intersects the maximum number of text boxes.
    
    c. The text boxes intersected at that position are taken as the y-labels.


    
3. Legend text:
    
    a. Consider only the text boxes in the remaining region (to the right of the y-axis and above the x-axis).
    
    b. Keep only the text boxes whose text is non-numeric.
    
    c. Run the sweeping-line algorithm twice: once in the x-direction and once in the y-direction.
    
    d. The sweep position with the maximum number of intersections gives the legend text.

In [7]:
def _sweepMaxIntersections(positions, candidates, intersects, best_count = 0, best_hits = None):
    """Move a sweep line over ``positions`` and keep the largest hit set.

    Args:
        positions: iterable of sweep-line coordinates, visited in order.
        candidates: list of ``(text, (x, y, w, h))`` pairs.
        intersects: predicate ``intersects(position, rect)``.
        best_count, best_hits: running best from a previous sweep, so that
            several sweeps can compete for one overall maximum.

    Returns:
        ``(best_count, best_hits)``. A position replaces the running best
        only when it intersects strictly more boxes, so the earliest maximum
        wins.
    """
    if best_hits is None:
        best_hits = []

    for position in positions:
        hits = [item for item in candidates if intersects(position, item[1])]
        if len(hits) > best_count:
            best_count, best_hits = len(hits), hits

    return best_count, best_hits


def getProbableLabels(image, image_text, xaxis, yaxis):
    """Pick the probable x-labels, y-labels and legend text from OCR boxes.

    Every text box is assigned to a region relative to the two axes, and a
    sweep line then selects, per region, the position that crosses the most
    boxes. The x-label boxes and the merged legend boxes are drawn onto
    ``image`` in place, and the number of merged legend boxes is printed.

    Args:
        image: image array to annotate; only its first two dimensions
            (height, width) are read.
        image_text: iterable of ``(text, (x, y, w, h))`` pairs.
        xaxis: ``(x1, y1, x2, y2)`` of the x-axis segment.
        yaxis: ``(x1, y1, x2, y2)`` of the y-axis segment.

    Returns:
        ``(image, x_labels, y_labels, legends)`` where the last three are
        lists of strings; x_labels are ordered by their bounding box.
    """
    height, width = image.shape[:2]

    # Unpacked once up front: the sweeps below need these values even when
    # image_text is empty.
    (x1, y1, x2, y2) = xaxis
    (x11, y11, x22, y22) = yaxis

    y_candidates = []
    x_candidates = []
    legend_candidates = []

    for text, (textx, texty, w, h) in image_text:
        text = text.strip()
        box = (textx, texty, w, h)

        # Sign of the 2-D cross product between each axis direction and the
        # vector from the axis start point to the box's top-left corner.
        # NOTE(review): the meaning of +1/-1 depends on the order of the axis
        # endpoints; the region names below follow the original comments.
        side_of_xaxis = np.sign((x2 - x1) * (texty - y1) - (y2 - y1) * (textx - x1))
        side_of_yaxis = np.sign((x22 - x11) * (texty - y11) - (y22 - y11) * (textx - x11))

        # To the left of y-axis and top of x-axis
        if side_of_xaxis == -1 and side_of_yaxis == -1:
            y_candidates.append((text, box))

        # To the right of y-axis and bottom of x-axis
        elif side_of_xaxis == 1 and side_of_yaxis == 1:
            x_candidates.append((text, box))

        # Top of x-axis and to the right of y-axis
        elif side_of_xaxis == -1 and side_of_yaxis == 1:

            # Consider non-numeric only for legends
            if not re.findall(r'\b[\d\.\d]+\b', text):
                legend_candidates.append((text, box))

    # y-labels: vertical sweep line from the left edge up to the y-axis
    _, y_hits = _sweepMaxIntersections(range(x11), y_candidates, lineIntersectsRectX)
    y_labels = [text for text, _ in y_hits]

    # x-labels: horizontal sweep line from the x-axis down to the bottom
    _, x_hits = _sweepMaxIntersections(range(y1, height), x_candidates, lineIntersectsRectY)

    # Order by bounding box tuple, i.e. by x coordinate first
    x_hits = sorted(x_hits, key = lambda item: item[1])

    x_labels = []
    for text, (textx, texty, w, h) in x_hits:
        x_labels.append(text)
        cv2.rectangle(image, (textx, texty), (textx + w, texty + h), (255, 0, 0), 2)

    # Legend text: search both top to bottom and left to right; the two
    # sweeps compete for a single overall maximum.
    best_count, legend_hits = _sweepMaxIntersections(range(y1), legend_candidates, lineIntersectsRectY)
    _, legend_hits = _sweepMaxIntersections(range(x11, width), legend_candidates, lineIntersectsRectX,
                                            best_count, legend_hits)

    legends = [text for text, _ in legend_hits]
    legendBoxes = mergeRects([box for _, box in legend_hits])

    for (textx, texty, w, h) in legendBoxes:
        cv2.rectangle(image, (textx, texty), (textx + w, texty + h), (255, 0, 255), 2)

    print("number of clusters : ", len(legendBoxes))

    return image, x_labels, y_labels, legends

In [8]:
def lineIntersectsRectX(candx, rect):
    """Return True when the vertical line ``x = candx`` crosses ``rect``.

    ``rect`` is ``(x, y, w, h)``; both the left and right edges count as
    intersecting.
    """
    left, _, box_width, _ = rect
    return bool(left <= candx <= left + box_width)
    
def lineIntersectsRectY(candy, rect):
    """Return True when the horizontal line ``y = candy`` crosses ``rect``.

    ``rect`` is ``(x, y, w, h)``; both the top and bottom edges count as
    intersecting.
    """
    _, top, _, box_height = rect
    return bool(top <= candy <= top + box_height)

In [9]:
def getTextFromImageArray(image, mode):
    """Run OCR on an in-memory image and return the recognised words.

    Args:
        image: image array handed to Tesseract.
        mode: ``'y-text'`` first transposes and horizontally flips the image
            (a 90-degree clockwise rotation) and reads free text;
            ``'y-labels'`` reads the image as is, restricted to digits and
            the decimal point.

    Returns:
        A de-duplicated list of ``(text, (x, y, w, h))`` pairs, one per OCR
        box with a non-negative confidence. The order is unspecified. For
        ``'y-text'`` the boxes are in the coordinates of the rotated image.

    Raises:
        ValueError: if ``mode`` is not one of the two supported values.
    """
    configs = {
        'y-text': "-l eng --oem 1 --psm 6",
        'y-labels': "-l eng --oem 1 --psm 6 -c tessedit_char_whitelist=.0123456789",
    }

    # Fail with a clear message instead of an unbound-variable error further down
    if mode not in configs:
        raise ValueError("unknown mode %r, expected 'y-text' or 'y-labels'" % (mode,))

    if mode == 'y-text':
        image = cv2.transpose(image)
        image = cv2.flip(image, flipCode = 1)

    d = pytesseract.image_to_data(image, config = configs[mode], output_type = Output.DICT)

    # A set removes duplicate (text, box) pairs as they are collected
    image_text = set()

    for i, word in enumerate(d['text']):
        # Pick only the non-negative confidence boxes. float() is used since
        # the confidence may be reported as an int, a float or a decimal
        # string, and int() rejects strings such as '96.5'.
        if float(d['conf'][i]) >= 0:
            box = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
            image_text.add((word, box))

    return list(image_text)

In [10]:
def maskImageForwardPass(filepath, start_idx):
    """Keep only the first run of non-blank columns, scanning left to right.

    A column is "blank" when none of its grayscale pixels is darker than 200.
    Starting at ``start_idx`` the scan skips blank columns, then walks across
    the first run of non-blank columns. Columns ``1:start`` and ``end:width``
    are painted white.

    Args:
        filepath: path of the image file; the image is read from this path
            rather than from notebook-level variables.
        start_idx: column at which the scan starts.

    Returns:
        ``(gray, start_idx, end_idx)`` where ``gray`` is the masked grayscale
        image and the run occupies columns ``start_idx`` to ``end_idx - 1``.
        None is returned when the file name does not end in .png/.jpg/.jpeg
        or the image cannot be read.
    """
    if not str(filepath).endswith(('.png', '.jpg', '.jpeg')):
        return None

    image = cv2.imread(filepath)

    # cv2.imread returns None for a missing or undecodable file
    if image is None:
        return None

    width = image.shape[1]
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Skip the blank margin; "< width" keeps the index inside the image even
    # when every remaining column is blank.
    while start_idx < width and not np.any(gray[:, start_idx] < 200):
        start_idx += 1

    # Walk across the first run of columns that contain dark pixels
    end_idx = start_idx
    while end_idx < width and np.any(gray[:, end_idx] < 200):
        end_idx += 1

    gray[:, 1:start_idx] = 255
    gray[:, end_idx:width] = 255

    return gray, start_idx, end_idx

In [11]:
def maskImageBackwardPass(filepath, start_idx, end_idx):
    """Keep only the columns between ``start_idx`` and a right-hand limit.

    Starting at ``end_idx`` the scan moves left while the column contains a
    grayscale pixel darker than 200, and stops at the first blank column or
    on reaching ``start_idx``. Columns ``1:start_idx`` and ``end_idx:width``
    are then painted white.

    Args:
        filepath: path of the image file; the image is read from this path
            rather than from notebook-level variables.
        start_idx: left limit of the region to keep.
        end_idx: column at which the leftward scan starts.

    Returns:
        The masked grayscale image, or None when the file name does not end
        in .png/.jpg/.jpeg or the image cannot be read.
    """
    if not str(filepath).endswith(('.png', '.jpg', '.jpeg')):
        return None

    image = cv2.imread(filepath)

    # cv2.imread returns None for a missing or undecodable file
    if image is None:
        return None

    width = image.shape[1]
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Move left past columns that still contain dark pixels
    while end_idx > start_idx and np.any(gray[:, end_idx] < 200):
        end_idx -= 1

    gray[:, 1:start_idx] = 255
    gray[:, end_idx:width] = 255

    return gray

## Writing to Excel workbook

In [12]:
def addToExcel(dataname, data, row, sheet = None):
    """Write one labelled row: ``dataname`` in column 0, then each data item.

    Args:
        dataname: label written to column 0 of ``row``.
        data: iterable of values written to columns 1, 2, ... of ``row``.
        row: zero-based row index.
        sheet: object with a ``write(row, col, value)`` method. When omitted,
            the notebook-level ``worksheet`` variable is used, as before.
    """
    target = worksheet if sheet is None else sheet

    target.write(row, 0, dataname)
    for col, content in enumerate(data, start = 1):
        target.write(row, col, content)

In [13]:
def nearbyRectangle(current, candidate, threshold):
    """Tell whether two ``(x, y, w, h)`` rectangles are vertically close.

    Only the y-extents are compared. The result is True when the candidate
    sits above or below the current rectangle with a gap of at most
    ``threshold``, or when the two y-ranges overlap; otherwise False.
    """
    _, cur_top, _, cur_height = current
    _, cand_top, _, cand_height = candidate

    cur_bottom = cur_top + cur_height
    cand_bottom = cand_top + cand_height

    # Candidate entirely above / below, within the allowed gap
    close_above = cand_bottom <= cur_top <= cand_bottom + threshold
    close_below = cur_bottom <= cand_top <= cur_bottom + threshold

    # Candidate range covers the current top edge, or starts inside the
    # current range (the latter also covers full containment)
    covers_top = cand_top <= cur_top <= cand_bottom
    starts_inside = cur_top <= cand_top <= cur_bottom

    return bool(close_above or close_below or covers_top or starts_inside)

In [14]:
def mergeRects(contours, xThr = 10, yThr = 5):
    """Greedily merge neighbouring ``(x, y, w, h)`` rectangles into clusters.

    Rectangles are sorted by their left edge. Each not-yet-merged rectangle
    starts a cluster that absorbs the following rectangles for as long as the
    next one starts within ``xThr`` of the cluster's right edge and
    ``nearbyRectangle`` accepts it with ``yThr``. The first rectangle that
    fails either test ends the cluster.

    Args:
        contours: iterable of ``(x, y, w, h)`` rectangles; it is not modified.
        xThr: largest horizontal gap allowed between a cluster and the next
            rectangle.
        yThr: vertical threshold handed to ``nearbyRectangle``.

    Returns:
        A list of ``[x, y, w, h]`` lists, one per cluster, ordered by x.
    """
    # Sort bounding rects by x coordinate (stable, so ties keep input order)
    rects = sorted(contours, key = lambda rect: rect[0])
    rectsUsed = [False] * len(rects)

    # Array of accepted rects
    acceptedRects = []

    for supIdx, supVal in enumerate(rects):
        if rectsUsed[supIdx]:
            continue

        # Initialize current rect
        currxMin = supVal[0]
        currxMax = supVal[0] + supVal[2]
        curryMin = supVal[1]
        curryMax = supVal[1] + supVal[3]

        # This bounding rect is used
        rectsUsed[supIdx] = True

        # Look at the rects that follow in x order
        for subIdx in range(supIdx + 1, len(rects)):
            subVal = rects[subIdx]

            # Initialize merge candidate
            candxMin = subVal[0]
            candxMax = subVal[0] + subVal[2]
            candyMin = subVal[1]
            candyMax = subVal[1] + subVal[3]

            # Too far to the right: later rects start even further right
            if candxMin > currxMax + xThr:
                break

            # NOTE(review): a vertically distant candidate ends the cluster
            # instead of being skipped; kept as in the original.
            if not nearbyRectangle((candxMin, candyMin, candxMax - candxMin, candyMax - candyMin),
                                   (currxMin, curryMin, currxMax - currxMin, curryMax - curryMin), yThr):
                break

            # Grow the cluster. max() keeps the right edge from moving left
            # when the candidate ends before the current right edge.
            currxMax = max(currxMax, candxMax)
            curryMin = min(curryMin, candyMin)
            curryMax = max(curryMax, candyMax)

            # Merge candidate (bounding rect) is used
            rectsUsed[subIdx] = True

        # No more merge candidates possible, accept current rect
        acceptedRects.append([currxMin, curryMin, currxMax - currxMin, curryMax - curryMin])

    return acceptedRects

In [15]:
import json

# Load the JSON file into urldata. Later in the notebook it is indexed by
# image file name and the value is written to the "caption" row of that
# figure's worksheet.
with open('urldata.json') as handler:
    urldata = json.load(handler)

In [None]:
# Run the whole pipeline over every image in img_dir: detect the axes, OCR the
# y-axis title, the y tick labels, the x-labels and the legend text, print
# the results, and store them (one worksheet per figure) in FigureData.xlsx
# together with an annotated diagnostic figure.
workbook = xlsxwriter.Workbook('FigureData.xlsx')

# Diagnostic figures are written into save_dir instead of the working directory
output_dir = Path(save_dir)
output_dir.mkdir(parents = True, exist_ok = True)

rcParams['figure.figsize'] = 15, 4

try:
    # Sorted so that the worksheet order is the same on every run
    for path in sorted(Path(img_dir).iterdir()):
        if not path.name.endswith(('.png', '.jpg', '.jpeg')):
            continue

        filepath = img_dir + "/" + path.name
        image = cv2.imread(filepath)

        # Skip files that OpenCV cannot decode
        if image is None:
            continue

        xaxis, yaxis = detectAxes(filepath)
        if xaxis is None or yaxis is None:
            continue

        # detectAxes returns each axis as a wrapped segment; flatten it to a
        # tuple of plain ints (x1, y1, x2, y2)
        xaxis = tuple(int(value) for value in xaxis[0])
        yaxis = tuple(int(value) for value in yaxis[0])

        fig, ax = plt.subplots(1, 3)

        # y-axis title: first band of non-blank columns from the left,
        # OCR'd after rotation; words ordered by x coordinate
        gray, start_idx, end_idx = maskImageForwardPass(filepath, 1)
        y_text_list = getTextFromImageArray(gray, 'y-text')
        y_text_list.sort(key = lambda item: item[1][0])
        y_text = [text for text, _ in y_text_list]

        ax[0].imshow(gray, aspect = 'auto')

        # y tick labels: columns between that band and the y-axis;
        # ordered by y coordinate (top to bottom)
        gray = maskImageBackwardPass(filepath, end_idx, yaxis[0])
        y_labels_list = getTextFromImageArray(gray, 'y-labels')
        y_labels_list.sort(key = lambda item: item[1][1])
        y_labels = [text for text, _ in y_labels_list]

        image_text = getTextFromImage(filepath, xaxis, yaxis, grayscale=False)
        image, x_labels, _, legends = getProbableLabels(image, image_text, xaxis, yaxis)

        # Write to Excel; a figure without an entry in urldata gets an empty
        # caption instead of aborting the whole run
        worksheet = workbook.add_worksheet()
        addToExcel("caption", [urldata.get(path.name, "")], 0)
        addToExcel("x-labels", x_labels, 1)
        addToExcel("y-text", y_text, 2)
        addToExcel("y-labels", y_labels, 3)
        addToExcel("legends", legends, 4)

        # Print the output here!
        print("file name    :  ", path.name)
        print("x-labels     :  ", x_labels)
        print("y-text       :  ", y_text)
        print("y-labels     :  ", y_labels)
        print("legends      :  ", legends, end = "\n\n")

        # Annotate: x-axis and y-axis lines plus the y tick label boxes
        cv2.line(image, xaxis[:2], xaxis[2:], (0, 0, 255), 2)
        cv2.line(image, yaxis[:2], yaxis[2:], (0, 255, 0), 2)

        for text, (textx, texty, w, h) in y_labels_list:
            cv2.rectangle(image, (textx, texty), (textx + w, texty + h), (255, 0, 255), 2)

        ax[1].imshow(gray, aspect = 'auto')
        ax[2].imshow(image, aspect = 'auto')

        # Save the figure, release it, and insert the saved file in the sheet.
        # The file must still exist when the workbook is closed.
        figure_path = str(output_dir / path.name)
        fig.savefig(figure_path)
        plt.close(fig)
        worksheet.insert_image('B8', figure_path)

finally:
    # Close the excel workbook, even if a figure failed part-way
    workbook.close()

number of clusters :  5
file name    :   1-s2.0-S0016236119323397-main-Figure1-1.png
x-labels     :   ['cs', 'PS', 'RS', 'ss']
y-text       :   ['‘Yields', "(wt'/o)"]
y-labels     :   ['50.0', '40.0', '30.0', '20.0', '10.0', '0.0']
legends      :   ['product', 'COU', 'Solid', 'PLOGUCT', 'Aqueous', 'Gi', 'Bio-oll']

