In [88]:
# This notebook implements segmentation-based metrics from prior research.

from PIL import Image
import base64
import cv2
import csv
from io import StringIO
from io import BytesIO
import numpy as np
from skimage import util, color
import math

from skimage import img_as_ubyte
from skimage.filters import rank
from skimage.morphology import  disk
from skimage.feature import canny
import scipy.spatial as spatial


In [89]:
# Screenshots to analyse, one entry per page in the form '<index>*<domain>'.
# main() splits each entry on '*' and opens 'webpages/<domain>.png'.
# Entries that are commented out are not processed.
infiles = [
#'999*ted.com',
'0*theboneandjointcenter.com',
#'1*mozilla.org',
'2*two-n.com',
#'3*segmentfault.com',
#'4*makeitok.org',
'5*disney.co.jp',
'6*wikipedia.org',
'7*news.yahoo.co.jp',
'8*huxiu.com',
#'9*fangdd.com',
'10*cheshi.com',
'11*humblebundle.com',
'12*theatlantic.com',
#'13*superprof.fr',
'14*microsoft.com',
'15*opera.com',
'16*labinthewild.org',
#'17*nounplus.net',
'18*richyli.com',
'19*pxtoem.com', 
#'20*goofy.photo',
'21*javadrive.jp',
#'22*jcodecraeer.com',
#'23*hdpfans.com',
'24*jiqimao.tv',
'25*clamav.net',
'26*bootcdn.cn',
'27*runoob.com',
'28*tensorfly.cn',
'29*journaldugeek.com', 
'30*matetranslate.com',
'31*kameisyouten.ocnk.net',
'32*gingerweb.jp',
'33*cp.pocky.jp',
'34*aladdin-aic.com',
#'35*ho-ginza.net',
#'36*pandayori.com',
#'37*sekimoto.dental',
'38*kokage-m.com',
#'39*coming-saji.com',
#'40*bluewood.bitter.jp',
'41*steakland.jp',
'42*showroomprive.com',
'43*imas-cg.net',
'44*filetender.com',
'45*hexo.io',
'46*yinwang.org',
'47*blog.yitianshijie.net',
'48*yatani.jp',
'49*qiita.com',
'50*52nlp.cn',
'51*guidetojapanese.org',
'52*olderadults.mobi',
'53*blog.whatsapp.com',
#'54*stratechery.com',
'55*lomake.fi',
#'56*theclinic.cl',
'57*bgmaimuna.com',
'58*0dt.net',
'59*web.ics.purdue.edu',
'60*canon-foundation.jp',
#'61*docs.opencv.org',
#'62*interaction-design.org',
#'63*jlpt.jp',
'64*blog.sciencenet.cn',
'65*pantone.com',
#'66*sdtech.co.jp',
'67*news.livedoor.com',
'68*gmo.jp',
'69*tokai-tv.com', 
'70*life-is-tech.com',
'71*bloomberg.co.jp',
'72*cerezo.jp',
'73*tech.nikkeibp.co.jp',
'74*jp.techcrunch.com',
'75*capcom.co.jp',
#'76*kenkun-jinja.org',
'77*sankei.com',
'78*tech-jp.co.jp',
'79*tech-camp.in',
'80*hasegawa-heart.com',
'81*techplay.jp',
#'82*ubejinja.or.jp',
#'83*19lou.com',
#'84*jiankang.com',
'85*cps.com.cn',
#'86*hea.cn',
#'87*jfc.or.jp',
'88*trafst.jp',
'89*infoq.com',
#'90*secure.j-bus.co.jp',
'91*jcr.incites.thomsonreuters.com',
'92*dna.fr',
'93*macg.co',
'94*cvpr2018.thecvf.com',
'95*chi2019.acm.org',
'96*swellnet.com',
'97*klex.ru',
#'98*kudago.com',
'99*theborneopost.com'
]

In [90]:
def read_b64_img(b64):
    """Decode a base64-encoded image file into an OpenCV colour array.

    Args:
        b64: Base64 text (``bytes`` or ASCII ``str``) of an encoded image
            file such as a PNG.

    Returns:
        Whatever ``cv2.imdecode`` yields for the decoded bytes when called
        with flag ``1`` (3-channel colour decoding).
    """
    raw_bytes = base64.b64decode(b64)
    # np.frombuffer is the supported replacement for the deprecated binary
    # mode of np.fromstring; it views the bytes without copying them.
    encoded = np.frombuffer(raw_bytes, dtype=np.uint8)
    return cv2.imdecode(encoded, 1)

class BBox(object):
    """Axis-aligned rectangle described by two opposite corners."""

    def __init__(self, x1, y1, x2, y2):
        # Store the corners normalised: (x1, y1) ends up as the upper-left
        # corner and (x2, y2) as the lower-right one, whatever order the
        # caller supplied them in.
        self.x1, self.x2 = sorted((x1, x2))
        self.y1, self.y2 = sorted((y1, y2))

    def taxicab_diagonal(self):
        """Return width plus height, i.e. the L1 length of the diagonal."""
        width = self.x2 - self.x1
        return width + self.y2 - self.y1

    def overlaps(self, other):
        """Return True if this box and ``other`` intersect or touch."""
        return (
            self.x1 <= other.x2
            and self.x2 >= other.x1
            and self.y1 <= other.y2
            and self.y2 >= other.y1
        )

def makeClassFromCont(contours):
    """Convert contours into BBox objects, skipping very small ones.

    Each contour is reduced to its bounding rectangle; rectangles whose
    width or height is below 5 are discarded.
    """
    rects = (cv2.boundingRect(contour) for contour in contours)
    return [
        BBox(left, top, left + width, top + height)
        for left, top, width, height in rects
        if width >= 5 and height >= 5
    ]

def remove_overlaps(contours):
    """Merge overlapping contour bounding boxes and return the resulting boxes.

    The contours are first turned into BBox objects by makeClassFromCont.
    Every box that overlaps a nearby box is grown, together with that
    neighbour, to the union of the two rectangles.

    Returns:
        A set of the BBox objects held in the corner map after merging, or
        the integer 0 when the KD-tree cannot be built (presumably when no
        box survives the size filter and there are no corners - TODO confirm).

    NOTE(review): BBox defines no __eq__/__hash__, so the returned set appears
    to be de-duplicated by object identity only; two boxes merged to the same
    coordinates would both be present - confirm whether that is intended.
    """
    bboxes = makeClassFromCont(contours)
    
    corners = []    # every upper-left and lower-right corner, fed to the KD-tree
    ulcorners = []  # upper-left corners only; one query is issued per box

    # dict mapping corners to Bboxes.
    # If two boxes share a corner coordinate, the later box replaces the
    # earlier one under that key.
    bbox_map = {}

    for bbox in bboxes:
        ul = (bbox.x1, bbox.y1)
        lr = (bbox.x2, bbox.y2)
        bbox_map[ul] = bbox
        bbox_map[lr] = bbox
        ulcorners.append(ul)
        corners.append(ul)
        corners.append(lr)        

    try:
        tree = spatial.KDTree(np.asarray(corners))
    except Exception:
        # Any failure to build the tree is reported with the sentinel 0
        # instead of a set.
        return 0
    for corner in ulcorners:
        bbox = bbox_map[corner]
        # Find all corners within an L1 (p = 1) radius equal to this box's
        # width + height, measured from its original upper-left corner.
        indices = tree.query_ball_point(corner, bbox_map[corner].taxicab_diagonal(), p = 1)
        for near_corner in tree.data[indices]:
            # Keys are the corner coordinates recorded before any merging, so
            # the lookup still works after boxes have been expanded.
            near_bbox = bbox_map[tuple(near_corner)]
            if bbox != near_bbox and bbox.overlaps(near_bbox):
                # Expand both the bboxes to the union of the two rectangles
                bbox.x1 = near_bbox.x1 = min(bbox.x1, near_bbox.x1)
                bbox.y1 = near_bbox.y1 = min(bbox.y1, near_bbox.y1) 
                bbox.x2 = near_bbox.x2 = max(bbox.x2, near_bbox.x2)
                bbox.y2 = near_bbox.y2 = max(bbox.y2, near_bbox.y2) 
    return set(bbox_map.values())

In [91]:
def get_elements(b64, detailed=True, preview=True):
    """Segment a base64-encoded screenshot into rectangular elements.

    Args:
        b64: Base64-encoded image, decoded with read_b64_img.
        detailed: When True use the contours segment() returns for the
            horizontal blur pass; otherwise use the contours from the
            subsequent vertical pass.
        preview: When True, add a base64 PNG of the image with every kept
            element outlined in red under the "preview" key.

    Returns:
        A dict with key "elements": a list of dicts holding "id" (index of
        the contour), "tag" (always ""), "x_position", "y_position", "width",
        "height" and "b64" (base64 PNG of the cropped region). Contours whose
        bounding box has width <= 15 or height <= 10 are skipped.
    """
    source_img = read_b64_img(b64)
    annotated = np.copy(source_img)

    vertical_contours, horizontal_contours = segment(source_img, h_blur=13, v_blur=9)

    # The coarser (non-detailed) view is drawn with a thicker outline.
    if detailed:
        contours_all, thickness = horizontal_contours, 1
    else:
        contours_all, thickness = vertical_contours, 2

    elements = []
    for idx, contour in enumerate(contours_all):
        x, y, w, h = cv2.boundingRect(contour)

        # Ignore regions too small to be a meaningful element.
        if w <= 15 or h <= 10:
            continue

        annotated = cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 0, 255), thickness)

        crop_png = cv2.imencode(".png", source_img[y:y + h, x:x + w])[1]
        elements.append({
            "id": idx,
            "tag": "",
            "x_position": x,
            "y_position": y,
            "width": w,
            "height": h,
            "b64": base64.b64encode(crop_png)
        })

    result = {"elements": elements}

    if preview:
        preview_png = cv2.imencode(".png", annotated)[1]
        result["preview"] = base64.b64encode(preview_png)

    return result

def _external_contours(image):
    """Return the external contours of ``image`` on any OpenCV version."""
    # OpenCV 3.x returns (image, contours, hierarchy) while 2.x and 4.x return
    # (contours, hierarchy); the contours are second-to-last in both layouts.
    return cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]


def _line_kernel(size, horizontal):
    """Build a size x size averaging kernel with a single non-zero row or column."""
    kernel = np.zeros((size, size))
    middle = size // 2
    if horizontal:
        kernel[middle, :] = 1.0
    else:
        kernel[:, middle] = 1.0
    return kernel / size


def segment(img_bgr, h_blur=13, v_blur=9):
    """Find candidate element contours in a BGR screenshot.

    The image is converted to grayscale and binarised with an inverted Otsu
    threshold. The bounding rectangle of every external contour of that mask
    is drawn on a blank canvas, which is then blurred along one axis so that
    neighbouring rectangles fuse into larger regions.

    Args:
        img_bgr: Colour image as accepted by ``cv2.cvtColor`` with
            ``COLOR_BGR2GRAY``.
        h_blur: Side length of the horizontal line-averaging kernel.
        v_blur: Side length of the vertical line-averaging kernel.

    Returns:
        ``(contours_all_v, contours_all_h)``: the external contours found
        after the horizontal blur alone (``contours_all_h``) and after the
        vertical blur applied on top of it (``contours_all_v``).
    """
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # The per-channel gradient, denoised-gradient, Sobel and Canny contour sets
    # used to be computed here, but they only fed a disabled remove_overlaps
    # step and never influenced the result, so that work is no longer done.

    canvas = np.zeros_like(gray)
    for contour in _external_contours(thresh):
        x, y, w, h = cv2.boundingRect(contour)
        canvas = cv2.rectangle(canvas, (x, y), (x + w, y + h), (255, 255, 255), 1)

    # Horizontal pass: smear the outlines sideways.
    canvas = cv2.filter2D(canvas, -1, _line_kernel(h_blur, horizontal=True))
    contours_all_h = _external_contours(canvas)

    # Vertical pass: runs on the already horizontally blurred canvas.
    canvas = cv2.filter2D(canvas, -1, _line_kernel(v_blur, horizontal=False))
    contours_all_v = _external_contours(canvas)

    return contours_all_v, contours_all_h


In [92]:
class TestSegmentation(object):
    def setUp(self, b64):
        self.elements = get_elements(b64, detailed=True)
        
        #self.elements = segmentation.get_elements(test_data.load_test_data(), detailed=False)
        
    def test_returns_list(self):
        self.assertTrue(isinstance(self.elements, list))

    def test_list_contains_dicts(self):
        for element in self.elements: 
            self.assertTrue(isinstance(element, dict))

    def test_elements_are_valid(self):
        for element in self.elements:
            self.assertTrue('id' in element)
            self.assertTrue('tag' in element)
            self.assertTrue('x_position' in element)
            self.assertTrue('y_position' in element)
            self.assertTrue('width' in element)
            self.assertTrue('height' in element)

            self.assertTrue(isinstance(element['id'], int))
            self.assertTrue(isinstance(element['tag'], basestring))
            self.assertTrue(isinstance(element['x_position'], int))
            self.assertTrue(isinstance(element['y_position'], int))
            self.assertTrue(isinstance(element['width'], int))
            self.assertTrue(isinstance(element['height'], int))

In [93]:
def white_space(b64, elements):
    """Return the share of the image area not covered by element boxes.

    Args:
        b64: Base64-encoded image; only its pixel dimensions are used.
        elements: Iterable of dicts with 'width' and 'height' entries.

    Returns:
        ``(image_area - summed_element_area) / image_area`` as a float.
        Element areas are simply added up, so overlapping elements are
        counted more than once.
    """
    img = Image.open(BytesIO(base64.b64decode(b64)))
    width, height = img.size
    total_area = width * height

    covered_area = sum(ele['width'] * ele['height'] for ele in elements)

    return (total_area - covered_area) / float(total_area)

In [95]:
def grid_quality(b64, seg_elements):
    """Count the distinct alignment lines formed by the element edges.

    Every element contributes its left and right x coordinates and its top
    and bottom y coordinates. The score is the number of distinct x
    coordinates plus the number of distinct y coordinates.

    Args:
        b64: Not used by the computation.
        seg_elements: Sequence of dicts with "x_position", "y_position",
            "width" and "height" entries.

    Returns:
        Total number of distinct vertical and horizontal alignment lines.
    """
    x_lines = set()
    y_lines = set()

    for element in seg_elements:
        left = element["x_position"]
        top = element["y_position"]
        # Both the starting edge and the opposite edge count as a line.
        x_lines.update((left, left + element["width"]))
        y_lines.update((top, top + element["height"]))

    return len(x_lines) + len(y_lines)


In [98]:
white_space_list = []
grid_quality_list = []

def main():
    """Score every screenshot listed in ``infiles`` and write the results.

    For each '<index>*<title>' entry the image 'webpages/<title>.png' is read,
    segmented with get_elements and scored with grid_quality. Each
    ``[title, score]`` row is printed, collected in the module-level
    ``grid_quality_list`` and finally written to 'grid_quality.csv'.
    """
    # Empty the shared list first so that calling main() again does not
    # append a second copy of every row to the list and the CSV.
    del grid_quality_list[:]

    for entry in infiles:
        _, title = entry.split('*', 1)
        with open('webpages/' + title + '.png', 'rb') as f:
            b64 = base64.b64encode(f.read())

        # Only the element list is needed, so skip encoding the preview image.
        elements = get_elements(b64, detailed=True, preview=False)['elements']
        result = grid_quality(b64, elements)

        thisItem = [title, result]
        print(thisItem)

        grid_quality_list.append(thisItem)

    # newline='' hands line-ending control to the csv module, which otherwise
    # produces blank rows between records on Windows.
    with open('grid_quality.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'grid_quality'])
        writer.writerows(grid_quality_list)

In [99]:
# Run the batch: prints one [title, score] row per page and writes grid_quality.csv
main()

  This is separate from the ipykernel package so we can avoid doing imports until


['theboneandjointcenter.com', 71]
['two-n.com', 141]
['disney.co.jp', 94]
['wikipedia.org', 141]
['news.yahoo.co.jp', 182]
['huxiu.com', 159]
['cheshi.com', 251]
['humblebundle.com', 53]
['theatlantic.com', 122]
['microsoft.com', 55]
['opera.com', 75]
['labinthewild.org', 146]
['richyli.com', 98]
['pxtoem.com', 8]
['javadrive.jp', 128]
['jiqimao.tv', 167]
['clamav.net', 54]
['bootcdn.cn', 54]
['runoob.com', 145]
['tensorfly.cn', 57]
['journaldugeek.com', 108]
['matetranslate.com', 42]
['kameisyouten.ocnk.net', 113]
['gingerweb.jp', 59]
['cp.pocky.jp', 68]
['aladdin-aic.com', 115]
['kokage-m.com', 154]
['steakland.jp', 4]
['showroomprive.com', 71]
['imas-cg.net', 56]
['filetender.com', 100]
['hexo.io', 50]
['yinwang.org', 56]
['blog.yitianshijie.net', 98]
['yatani.jp', 98]
['qiita.com', 62]
['52nlp.cn', 126]
['guidetojapanese.org', 61]
['olderadults.mobi', 95]
['blog.whatsapp.com', 58]
['lomake.fi', 65]
['bgmaimuna.com', 129]
['0dt.net', 103]
['web.ics.purdue.edu', 17]
['canon-foundatio