In [3]:
# This note is the implementation of past research regarding QuadTree metrics

from skimage import util, color
import numpy as np
import base64
from PIL import Image
from io import BytesIO
import math
import cv2
import csv

In [4]:
infiles = [
#'999*ted.com',
'0*theboneandjointcenter.com',
#'1*mozilla.org',
'2*two-n.com',
#'3*segmentfault.com',
#'4*makeitok.org',
'5*disney.co.jp',
'6*wikipedia.org',
'7*news.yahoo.co.jp',
'8*huxiu.com',
#'9*fangdd.com',
'10*cheshi.com',
'11*humblebundle.com',
'12*theatlantic.com',
#'13*superprof.fr',
'14*microsoft.com',
'15*opera.com',
'16*labinthewild.org',
#'17*nounplus.net',
'18*richyli.com',
'19*pxtoem.com', 
#'20*goofy.photo',
'21*javadrive.jp',
#'22*jcodecraeer.com',
#'23*hdpfans.com',
'24*jiqimao.tv',
'25*clamav.net',
'26*bootcdn.cn',
'27*runoob.com',
'28*tensorfly.cn',
'29*journaldugeek.com', 
'30*matetranslate.com',
'31*kameisyouten.ocnk.net',
'32*gingerweb.jp',
'33*cp.pocky.jp',
'34*aladdin-aic.com',
#'35*ho-ginza.net',
#'36*pandayori.com',
#'37*sekimoto.dental',
'38*kokage-m.com',
#'39*coming-saji.com',
#'40*bluewood.bitter.jp',
'41*steakland.jp',
'42*showroomprive.com',
'43*imas-cg.net',
'44*filetender.com',
'45*hexo.io',
'46*yinwang.org',
'47*blog.yitianshijie.net',
'48*yatani.jp',
'49*qiita.com',
'50*52nlp.cn',
'51*guidetojapanese.org',
'52*olderadults.mobi',
'53*blog.whatsapp.com',
#'54*stratechery.com',
'55*lomake.fi',
#'56*theclinic.cl',
'57*bgmaimuna.com',
'58*0dt.net',
'59*web.ics.purdue.edu',
'60*canon-foundation.jp',
#'61*docs.opencv.org',
#'62*interaction-design.org',
#'63*jlpt.jp',
'64*blog.sciencenet.cn',
'65*pantone.com',
#'66*sdtech.co.jp',
'67*news.livedoor.com',
'68*gmo.jp',
'69*tokai-tv.com', 
'70*life-is-tech.com',
'71*bloomberg.co.jp',
'72*cerezo.jp',
'73*tech.nikkeibp.co.jp',
'74*jp.techcrunch.com',
'75*capcom.co.jp',
#'76*kenkun-jinja.org',
'77*sankei.com',
'78*tech-jp.co.jp',
'79*tech-camp.in',
'80*hasegawa-heart.com',
'81*techplay.jp',
#'82*ubejinja.or.jp',
#'83*19lou.com',
#'84*jiankang.com',
'85*cps.com.cn',
#'86*hea.cn',
#'87*jfc.or.jp',
'88*trafst.jp',
'89*infoq.com',
#'90*secure.j-bus.co.jp',
'91*jcr.incites.thomsonreuters.com',
'92*dna.fr',
'93*macg.co',
'94*cvpr2018.thecvf.com',
'95*chi2019.acm.org',
'96*swellnet.com',
'97*klex.ru',
#'98*kudago.com',
'99*theborneopost.com'
]

In [5]:
def equilibrium(leaves, width, height):
    area = []
    dx = []
    dy = []
    for leaf in leaves:
        area.append(float(leaf[2]) * float(leaf[3]))
        dx.append(abs(float(leaf[0] + leaf[2] / 2) - float(width) / 2))
        dy.append(abs(float(leaf[1] + leaf[3] / 2) - float(height) / 2))

    sum_x = 0.0
    sum_y = 0.0
    for n in range(len(dx)):
        sum_x += area[n] * dx[n]
        sum_y += area[n] * dx[n]
    EM_x = (2 * sum_x) / (int(width) * len(leaves) * sum(area))
    EM_y = (2 * sum_y) / (int(height) * len(leaves) * sum(area))

    EM = 1 - float(abs(EM_x) + abs(EM_y)) / 2

    return EM

In [6]:
## -------------- QT functions ---------------##
def balance(leaves, width, height):
    top = []
    right = []
    left = []
    bottom = []

    for leaf in leaves:
        if leaf[0] > width / 2:
            right.append(leaf)
        else:
            left.append(leaf)
        if leaf[1] > height / 2:
            bottom.append(leaf)
        else:
            top.append(leaf)

    w_left = 0.0
    w_top = 0.0
    w_bottom = 0.0
    w_right = 0.0
    center = [width / 2, height / 2]

    for leaf in top:
        area = leaf[2] * leaf[3]
        mid_point_leaf = [leaf[0] + leaf[2] / 2, leaf[1] + leaf[3] / 2]
        distance = abs(mid_point_leaf[0] - center[1])
        score = distance * area
        w_top += score
    for leaf in bottom:
        area = leaf[2] * leaf[3]
        mid_point_leaf = [leaf[0] + leaf[2] / 2, leaf[1] + leaf[3] / 2]
        distance = abs(mid_point_leaf[0] - center[1])
        score = distance * area
        w_bottom += score
    for leaf in left:
        area = leaf[2] * leaf[3]
        mid_point_leaf = [leaf[0] + leaf[2] / 2, leaf[1] + leaf[3] / 2]
        distance = abs(mid_point_leaf[1] - center[0])
        score = distance * area
        w_left += score
    for leaf in right:
        area = leaf[2] * leaf[3]
        mid_point_leaf = [leaf[0] + leaf[2] / 2, leaf[1] + leaf[3] / 2]
        distance = abs(mid_point_leaf[1] - center[0])
        score = distance * area
        w_right += score

    IB_left_right = (w_left - w_right) / max(abs(w_right), abs(w_right))
    IB_top_bottom = (w_top - w_bottom) / max(abs(w_top), abs(w_bottom))
    BM = 1 - float(abs(IB_top_bottom) + abs(IB_left_right)) / 2

    return BM

In [7]:
def symmetry(leaves, width, height):
    UL_leaves = []
    UR_leaves = []
    LL_leaves = []
    LR_leaves = []

    for leaf in leaves:
        if leaf[0] > width / 2 and leaf[1] < height / 2:
            UR_leaves.append(leaf)
        elif leaf[0] <= width / 2 and leaf[1] < height / 2:
            UL_leaves.append(leaf)
        elif leaf[0] > width / 2 and leaf[1] >= height / 2:
            LR_leaves.append(leaf)
        elif leaf[0] <= width / 2 and leaf[1] >= height / 2:
            LL_leaves.append(leaf)

    X_j = []
    Y_j = []
    H_j = []
    B_j = []
    T_j = []
    R_j = []

    all_leaves = [UL_leaves, UR_leaves, LL_leaves, LR_leaves]
    x_center = width / 2
    y_center = height / 2
    
    # With j being respectively: UL;UR,LL;LR
    for j in all_leaves:
        X_score = 0
        Y_score = 0
        H_score = 0
        B_score = 0
        T_score = 0
        R_score = 0
        for leaf in j:
            x_leaf = leaf[0] + leaf[2] / 2
            X_score += abs(x_leaf - x_center)
            y_leaf = leaf[1] + leaf[3] / 2
            Y_score += abs(y_leaf - y_center)
            H_score += leaf[3]
            B_score += leaf[2]
            T_score += abs(y_leaf - y_center) / abs(x_leaf - x_center)
            R_score += (((x_leaf - x_center) ** 2) + ((y_leaf - y_center) ** 2)) ** 0.5

        X_j.append(X_score)
        Y_j.append(Y_score)
        H_j.append(H_score)
        B_j.append(B_score)
        T_j.append(T_score)
        R_j.append(R_score)

    # Normalize
    X_j[:] = [x / max(X_j) for x in X_j]
    Y_j[:] = [y / max(Y_j) for y in Y_j]
    H_j[:] = [h / max(H_j) for h in H_j]
    B_j[:] = [b / max(B_j) for b in B_j]
    T_j[:] = [r / max(R_j) for r in R_j]
    R_j[:] = [t / max(T_j) for t in T_j]

    SYM_ver = (abs(X_j[0] - X_j[1]) + abs(X_j[2] - X_j[3]) + abs(Y_j[0] - Y_j[1]) + abs(Y_j[2] - Y_j[3]) + abs(
        H_j[0] - H_j[1]) + abs(H_j[2] - H_j[3]) + abs(B_j[0] - B_j[1]) + abs(B_j[2] - B_j[3]) + abs(
        T_j[0] - T_j[1]) + abs(T_j[2] - T_j[3]) + abs(R_j[0] - R_j[1]) + abs(R_j[2] - R_j[3])) / 12

    SYM_hor = (abs(X_j[0] - X_j[2]) + abs(X_j[1] - X_j[3]) + abs(Y_j[0] - Y_j[2]) + abs(Y_j[1] - Y_j[3]) + abs(
        H_j[0] - H_j[2]) + abs(H_j[1] - H_j[3]) + abs(B_j[0] - B_j[2]) + abs(B_j[1] - B_j[3]) + abs(
        T_j[0] - T_j[2]) + abs(T_j[1] - T_j[3]) + abs(R_j[0] - R_j[2]) + abs(R_j[1] - R_j[3])) / 12

    SYM_rot = (abs(X_j[0] - X_j[3]) + abs(X_j[1] - X_j[2]) + abs(Y_j[0] - Y_j[3]) + abs(Y_j[1] - Y_j[2]) + abs(
        H_j[0] - H_j[3]) + abs(H_j[1] - H_j[2]) + abs(B_j[0] - B_j[3]) + abs(B_j[1] - B_j[2]) + abs(
        T_j[0] - T_j[3]) + abs(T_j[1] - T_j[2]) + abs(R_j[0] - R_j[3]) + abs(R_j[1] - R_j[2])) / 12

    SYM = 1 - (abs(SYM_ver) + abs(SYM_hor) + abs(SYM_rot)) / 3

    return SYM

In [8]:
##-------------------Quadtree Functions---------------------##
# Currently RGB entropy is calculated and intensity.
# The papers also refer to textons. This is not implemented as of yet:
# Representing and Recognizing the Visual Appearance of Materials using Three-dimensional Textons
# THOMAS LEUNG AND JITENDRA MALIK, International Journal of Computer Vision 43(1), 29-44, 2001
def intensity_entropy(inp):
    img = color.rgb2lab(inp)
    l_bins = 20
    L = []
    img = img.reshape(-1, 3)
    img = [tuple(l) for l in img]
    for pixel in img:
        L.append(pixel[0])

    p, x = np.histogram(L, bins=l_bins, range=(0, 100), normed=True)
    p.ravel()
    p = p * 100.
    p = p + 0.000000000001
    p_log = [math.log(y) for y in p]
    p_result = p * p_log
    result = np.sum(p_result)

    return result

In [9]:
# The uncertainty of colour in a leaf, given the leaf. Based on the shannon entropy
def color_entropy(inp):
    inp = inp / 255.
    img = color.rgb2hsv(inp)
    h_bins = 30
    s_bins = 32
    H = []
    S = []
    img = img.reshape(-1, 3)
    img = [tuple(l) for l in img]
    for pixel in img:
        H.append(pixel[0] * 360.)
        S.append(pixel[1] * 100.)

    h, x = np.histogram(H, bins=h_bins, range=(0, 360), density=True)
    s, y = np.histogram(S, bins=s_bins, range=(0, 100), density=True)

    h = h.ravel()
    h = h * 100.
    h = h + 0.000000000001
    h_log = [math.log(y) for y in h]
    h_result = h * h_log

    s = s.ravel()
    s = s * 100.
    s = s + 0.000000000001
    s_log = [math.log(y) for y in s]
    s_result = s * s_log
    result = abs(np.sum(h_result) + np.sum(s_result)) / 2

    return result

In [10]:
# Recursion
def quadtree(leaf, res_leaf, cor_size, i):
    ent_color = color_entropy(leaf)
    ent_int = intensity_entropy(leaf)
    height, width, depth = leaf.shape
    color_thres = 55 # Some threshold that seems okay, this seems quite heavily website dependend. (eg google vs alibaba)
    intensity_thresh = 70 # This is totally based on nothing. So somebody should figure this out (well the entropy in general)
    # It is partially based on the paper, however it seems to be working different)

    # If entropy fullfulls requirements or the website has not been divided in enough leaves and there is still room for division:
    if (ent_color < color_thres or ent_int > intensity_thresh or i < 2) and height / 2 > 8 and width / 2 > 8:
        i += 1
        # Divide the leaf in 4 new leaves
        new_leaf = [leaf[0:int(height / 2), 0:int(width / 2)], leaf[int(height / 2):height, 0:int(width / 2)],
                    leaf[0:int(height / 2), int(width / 2):width], leaf[int(height / 2):height, int(width / 2):width]]

        # Coordinates and size of each leaf
        new_cor_size = [(cor_size[0] + 0, cor_size[1] + 0, width / 2, height / 2),
                        (cor_size[0] + 0, cor_size[1] + height / 2, width / 2, height / 2),
                        (cor_size[0] + width / 2, cor_size[1] + 0, width / 2, height / 2),
                        (cor_size[0] + width / 2, cor_size[1] + height / 2, width / 2, height / 2)
                        ]
        for x in range(len(new_leaf)):
            # Run recursively
            quadtree(new_leaf[x], res_leaf, new_cor_size[x], i)
    else:
        # If not, append the coordinates and size
        res_leaf.append(cor_size)
    return

In [11]:
def execute():
    value_list = []
    for i in infiles:
        no, title = i.split('*')
        with open('webpagesJ/'+ title +'.jpg', 'rb') as f:
            b64 = f.read()
        # img = base64.b64encode(im)
        # b64 = base64.b64decode(b64)
        b64 = BytesIO(b64)
        img = Image.open(b64)
        img = img.resize((1920,900))
#         img = img.convert("RGB") 
        img = np.array(img)
        img = util.img_as_ubyte(img)

        res_leaf = []
        cor_size = (0, 0, img.shape[1], img.shape[0])
        quadtree(img, res_leaf, cor_size, 0)
        #    fig, ax = plt.subplots(1)
        #    for rect in res_leaf:
        #        rect = patches.Rectangle((rect[0], rect[1]), rect[2], rect[3], linewidth=0.1, edgecolor='b', facecolor='none')
        #        ax.add_patch(rect)

        #   ax.imshow(img)
        #   plt.show()
        b = balance(res_leaf, img.shape[1], img.shape[0])
        s = symmetry(res_leaf, img.shape[1], img.shape[0])
        e = equilibrium(res_leaf, img.shape[1], img.shape[0])
        n = len(res_leaf)
        quad_value = [title, b, s, e, n]
        print(quad_value)
        value_list.append(quad_value)
        
    with open('quad_value_1.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'balance', 'equilibrium', 'symmetry', 'nodes'])
        writer.writerows(value_list)

In [12]:
execute()

  from ipykernel import kernelapp as app


['theboneandjointcenter.com', 0.7750123075181643, 0.7851156967720234, 0.9995460809270914, 1726]
['two-n.com', 0.6922080491207202, 0.6785771906594285, 0.9991973494771053, 976]
['disney.co.jp', 0.7820291899083391, 0.7625454482321217, 0.9994873867793853, 1528]
['wikipedia.org', -0.3332708599022265, 0.5164257648764361, 0.9900850768654578, 79]
['news.yahoo.co.jp', 0.6547901512089485, 0.5297835592311071, 0.9992566203994234, 1054]
['huxiu.com', 0.4731843458566731, 0.46562742578439753, 0.9991692240301876, 943]
['cheshi.com', 0.7821000391889188, 0.8391007248065141, 0.9995421721041498, 1711]
['humblebundle.com', 0.9168405490454274, 0.9014683243987648, 0.999764171396102, 3322]
['theatlantic.com', 0.5308945753811878, 0.7114435798214322, 0.9994436474565122, 1408]
['microsoft.com', 0.8209056945876312, 0.6656634466410847, 0.9995211836578116, 1636]
['opera.com', 0.43200478913265195, 0.4714518169952858, 0.9987525740696844, 628]
['labinthewild.org', 0.8186997762053496, 0.8733814382933845, 0.999488482824