In [16]:
from operator import itemgetter
from pathlib import Path
from functools import reduce
from fastcore.foundation import *
from datetime import datetime

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from fastcore.utils import parallel
from sklearn.model_selection import train_test_split

from pygame import Rect

# Notebook-wide matplotlib defaults: 16x9 figures drawn with the dark theme.
plt.rcParams["figure.figsize"] = 16, 9
plt.style.use("dark_background")

In [17]:
# Root data folder: holds the input CSV and receives the snapshots and exports.
dataset_path = Path("../data")

# Locations inside the RICO folder, expressed relative to the data root.
rico_path = dataset_path / "rico"
annotations_path = rico_path / "rico_annotations"
screenshots_path = rico_path / "combined"

In [18]:
def get_pt1(box):
    """Return the top-left corner ``(xmin, ymin)`` of a mapping-like annotation record."""
    return box["xmin"], box["ymin"]


def get_pt2(box):
    """Return the bottom-right corner ``(xmax, ymax)`` of a mapping-like annotation record."""
    return box["xmax"], box["ymax"]

In [19]:
# Annotation schema: column name -> dtype used when casting the loaded CSV.
# Declared as one mapping so names and dtypes cannot drift out of step.
# The builtin ``int`` replaces ``np.int``: ``np.int`` was only an alias of the
# builtin, was deprecated in NumPy 1.20 and removed in NumPy 1.24.
column_dtype_map = {
    "filename": "string",
    "image_width": int,
    "image_height": int,
    "category": "string",
    "xmin": int,
    "ymin": int,
    "xmax": int,
    "ymax": int,
    "width": int,
    "height": int,
    "area": int,
    "level": int,
    "is_parent": bool,
    "text": "string",
}

# Ordered views of the schema; ``column_names`` is also used to build the
# synthetic annotation records further down the notebook.
column_names = list(column_dtype_map)
column_dtypes = list(column_dtype_map.values())

In [20]:
# Maps a version label to the feather file written for it by snapshot().
version_history = {}


def snapshot(df, version):
    """Write ``df`` to a timestamped feather file and remember it under ``version``.

    Side effect: the index of ``df`` is reset *in place* (``drop=True``) before
    writing, so the caller's frame has a fresh 0..n-1 index afterwards.
    The file is created in ``dataset_path``.
    """
    stamp = datetime.now().strftime("%Y_%m_%d__%H_%M_%S")
    df.reset_index(drop=True, inplace=True)
    target = dataset_path / f'{stamp}_snapshot_after_{version}.ft'
    df.to_feather(target)
    version_history[version] = target


def load_snapshot(version):
    """Read back the frame stored by ``snapshot`` for ``version``.

    Raises:
        Exception: when no snapshot was recorded under ``version`` in this session.
    """
    if version not in version_history:
        raise Exception('Version not found')
    return pd.read_feather(version_history[version])

In [21]:
class ChainedAssignment:

    """ Context manager that temporarily overrides pandas' ``mode.chained_assignment`` option.

    Usage:

        with ChainedAssignment():
             blah                    # chained-assignment check switched off (None)

        with ChainedAssignment('raise'):
             run my code and figure out which line causes the error!

    Accepted values are None, 'warn' and 'raise'. The option value found on
    entry is restored on exit, including when the body raises.
    """

    def __init__(self, chained = None):
        acceptable = [ None, 'warn','raise']
        # Explicit raise instead of `assert`, so the check still runs under `python -O`;
        # the exception type stays AssertionError for existing callers.
        if chained not in acceptable:
            raise AssertionError("chained must be in " + str(acceptable))
        self.swcw = chained

    def __enter__( self ):
        # Remember the current setting so __exit__ can put it back.
        self.saved_swcw = pd.options.mode.chained_assignment
        pd.options.mode.chained_assignment = self.swcw
        return self

    def __exit__(self, *args):
        # Returns None, so exceptions raised inside the block propagate.
        pd.options.mode.chained_assignment = self.saved_swcw

In [22]:
def draw_bndboxes(a_df, return_img=False):
    """Draw the bounding boxes of one screenshot's annotations on a white canvas.

    Args:
        a_df: annotation rows providing ``xmin``/``ymin``/``xmax``/``ymax`` and
            ``category``. The canvas size comes from the first row's
            ``image_width``/``image_height`` columns; if those columns are
            missing, the last two columns are used (the original convention).
        return_img: when True, also return the rendered image.

    Returns:
        The ``(height, width, 3)`` uint8 image if ``return_img`` is True, else None.
        The image is shown with ``plt.imshow`` in both cases.
    """
    if "image_width" in a_df.columns and "image_height" in a_df.columns:
        first_row = a_df.iloc[0]
        w, h = first_row["image_width"], first_row["image_height"]
    else:
        # Fallback: assume the frame ends with (image_width, image_height).
        w, h = a_df.iloc[0, [-2, -1]].values
    w, h = int(w), int(h)

    # Allocate the white canvas directly as uint8 (no full-size float64 temporary).
    image = np.full((h, w, 3), 255, dtype=np.uint8)

    for i, annotation in a_df.iterrows():
        # Plain Python ints: rows from iterrows() carry numpy scalars.
        pt1 = tuple(int(v) for v in get_pt1(annotation))
        pt2 = tuple(int(v) for v in get_pt2(annotation))
        x, y = pt1
        # Label = row index followed by the first three letters of the category.
        category = str(i) + annotation.category[:3]
        cv2.rectangle(image, pt1, pt2, (250, 20, 30), 2)
        cv2.putText(image, category, (x, y - 10), 0, 1, (250, 20, 30), 2, cv2.LINE_AA, False)

    plt.imshow(image)

    if return_img:
        return image

    return None

In [23]:
def draw_bndboxes_with_rects(a_df, rects, return_img=False):
    """Draw one screenshot's annotation boxes plus extra rectangles on a white canvas.

    Args:
        a_df: annotation rows providing ``xmin``/``ymin``/``xmax``/``ymax`` and
            ``category``. The canvas size comes from the first row's
            ``image_width``/``image_height`` columns; if those columns are
            missing, the last two columns are used (the original convention).
        rects: iterable of rectangle objects exposing ``left``, ``top``,
            ``right`` and ``bottom``; drawn in a second colour.
        return_img: when True, also return the rendered image.

    Returns:
        The ``(height, width, 3)`` uint8 image if ``return_img`` is True, else None.
        The image is shown with ``plt.imshow`` in both cases.
    """
    if "image_width" in a_df.columns and "image_height" in a_df.columns:
        first_row = a_df.iloc[0]
        w, h = first_row["image_width"], first_row["image_height"]
    else:
        # Fallback: assume the frame ends with (image_width, image_height).
        w, h = a_df.iloc[0, [-2, -1]].values
    w, h = int(w), int(h)

    # Allocate the white canvas directly as uint8 (no full-size float64 temporary).
    image = np.full((h, w, 3), 255, dtype=np.uint8)

    for i, annotation in a_df.iterrows():
        # Plain Python ints: rows from iterrows() carry numpy scalars.
        pt1 = tuple(int(v) for v in get_pt1(annotation))
        pt2 = tuple(int(v) for v in get_pt2(annotation))
        x, y = pt1
        # Label = row index followed by the first three letters of the category.
        category = str(i) + annotation.category[:3]
        cv2.rectangle(image, pt1, pt2, (250, 20, 30), 2)
        cv2.putText(image, category, (x, y - 10), 0, 1, (250, 20, 30), 2, cv2.LINE_AA, False)

    # Extra rectangles (e.g. candidate positions) in a contrasting colour.
    for r in rects:
        cv2.rectangle(image, (r.left, r.top), (r.right, r.bottom), (20, 30, 250), 2)

    plt.imshow(image)

    if return_img:
        return image

    return None

# Cleanup

In [24]:
# rico_annotation_full.csv stores the information of every UI element on each screen.
rico_annotations_df = pd.read_csv(dataset_path / "rico_annotation_full.csv", encoding='utf-8',low_memory=False)

In [25]:
rico_annotations_df.text = rico_annotations_df.text.fillna("-")     # fill missing text with the "-" placeholder

rico_annotations_df.dropna(inplace=True)    # drop rows that still have a missing value in any column

In [26]:
# Remove elements whose area is zero or negative (area <= 0).
to_remove = rico_annotations_df.query("area <= 0")

rico_annotations_df.drop(index=to_remove.index, inplace=True)

In [27]:
# Cast every column to the dtype declared in column_dtype_map.
rico_annotations_df = rico_annotations_df.astype(column_dtype_map)

In [28]:
# The root sizes recorded in the RICO annotations are wrong, so every row is
# given the same fixed screen size of 1440 x 2560.

rico_annotations_df.image_width = 1440
rico_annotations_df.image_height = 2560

In [29]:
# Number of elements per RICO category after cleanup.
rico_annotations_df.category.value_counts()

Text                 454606
Image                217281
Icon                 178974
List Item            152107
Text Button          138787
Toolbar               34853
Web View              30775
Input                 21408
Card                  16889
Advertisement         13333
Background Image       6785
Drawer                 6642
Radio Button           5419
Checkbox               4241
Multi-Tab              4185
Pager Indicator        4143
Modal                  3959
On/Off Switch          2103
Slider                 2016
Map View               1511
Button Bar              721
Video                   562
Bottom Navigation       523
Number Stepper          427
Date Picker             291
Name: category, dtype: Int64

In [30]:
# Work on a copy so rico_annotations_df keeps the cleaned RICO data.
annotations_df = rico_annotations_df.copy()

# RICO to UISketch

## Remove redundant items

In [31]:
# (rows, columns) before the unwanted categories are removed.
annotations_df.shape

(1302541, 14)

In [32]:
# RICO categories that are dropped from the dataset in the next cell.
categories_to_remove = [
    "Advertisement",
    "Background Image",
    "Bottom Navigation",
    "Button Bar",
    "Date Picker",
    "List Item",
    "Map View",
    "Multi-Tab",
    "Pager Indicator",
    "Toolbar",
    "Web View",
]

In [33]:
# Keep only the rows whose category is not in categories_to_remove.
# .copy() gives an independent frame, so the later `.loc[...] = ...` assignments
# on annotations_df do not act on a filtered selection of the previous frame
# (which is what triggers pandas' SettingWithCopyWarning).
annotations_df = annotations_df[~annotations_df.category.isin(categories_to_remove)].copy()

# after removing items
annotations_df.shape

(1053314, 14)

In [34]:
# Element count per remaining RICO category.
annotations_df.category.value_counts()

Text              454606
Image             217281
Icon              178974
Text Button       138787
Input              21408
Card               16889
Drawer              6642
Radio Button        5419
Checkbox            4241
Modal               3959
On/Off Switch       2103
Slider              2016
Video                562
Number Stepper       427
Name: category, dtype: Int64

In [35]:
# Save the dataframe in binary (feather) form.
# Note: snapshot() also resets the index of annotations_df in place.
snapshot(annotations_df, "1")

## Map RICO elements to UISketch elements

In [36]:
# RICO category -> UISketch class. Categories without an entry (e.g. "Input")
# keep their name, because the lookup is done with .get(x, x).
rico_uisketch_map = {
    # everything picture-like becomes "image"
    "Background Image": "image",
    "Icon": "image",
    "Image": "image",
    "Video": "image",
    # containers and overlays
    "Card": "card",
    "Drawer": "menu",
    "Modal": "alert",
    # text
    "Text": "label",
    "Text Button": "button",
    # stateful controls start in a single state; half are flipped afterwards
    "Checkbox": "checkbox_checked",
    "On/Off Switch": "switch_disabled",
    "Radio Button": "radio_button_checked",
    "Slider": "slider",
    # Set number stepper as input and split input later
    "Number Stepper": "Input",
}

In [37]:
# Translate every RICO category to its UISketch class; categories without a
# mapping entry are left unchanged (.get(x, x)).
annotations_df.loc[:, ["category"]] = annotations_df.category.apply(lambda x: rico_uisketch_map.get(x, x))

### Equally distribute UI element states

#### CheckBox

In [38]:
# Flip a random half of the checkboxes to the unchecked state.
# random_state makes the split reproducible (same seed as the train/test split).
to_modify = annotations_df.query("category == 'checkbox_checked'").sample(frac=0.5, random_state=42)

condition = annotations_df.index.isin(to_modify.index)
# Assign the category column only: DataFrame.replace on whole rows would also
# rewrite any other column (e.g. text) holding the same string.
annotations_df.loc[condition, "category"] = "checkbox_unchecked"

#### Switch

In [39]:
# Flip a random half of the switches to the enabled state.
# random_state makes the split reproducible (same seed as the train/test split).
to_modify = annotations_df.query("category == 'switch_disabled'").sample(frac=0.5, random_state=42)

condition = annotations_df.index.isin(to_modify.index)
# Assign the category column only: DataFrame.replace on whole rows would also
# rewrite any other column (e.g. text) holding the same string.
annotations_df.loc[condition, "category"] = "switch_enabled"

#### Radio button

In [40]:
# Flip a random half of the radio buttons to the unchecked state.
# random_state makes the split reproducible (same seed as the train/test split).
to_modify = annotations_df.query("category == 'radio_button_checked'").sample(frac=0.5, random_state=42)

condition = annotations_df.index.isin(to_modify.index)
# Assign the category column only: DataFrame.replace on whole rows would also
# rewrite any other column (e.g. text) holding the same string.
annotations_df.loc[condition, "category"] = "radio_button_unchecked"

In [41]:
# Class distribution after mapping and after splitting the stateful controls.
annotations_df.category.value_counts()

label                     454606
image                     396817
button                    138787
Input                      21835
card                       16889
menu                        6642
alert                       3959
radio_button_unchecked      2710
radio_button_checked        2709
checkbox_checked            2121
checkbox_unchecked          2120
slider                      2016
switch_enabled              1052
switch_disabled             1051
Name: category, dtype: int64

### Modify UI elements based on the area they occupy

#### Input

In [42]:
# Input elements taller than the 75th height percentile (the tallest quarter)
# are relabelled as text_area.
q3 = annotations_df.query("category == 'Input'").height.quantile(0.75)

to_modify = annotations_df.query(f"category == 'Input' & height > {q3}")

condition = annotations_df.index.isin(to_modify.index)
# Assign the category column only: DataFrame.replace on whole rows would also
# turn a text value of exactly "Input" into "text_area".
annotations_df.loc[condition, "category"] = "text_area"

In [43]:
# Of the remaining Input elements, a random half becomes dropdown_menu and the
# other half text_field. random_state makes the split reproducible.
to_modify = annotations_df.query("category == 'Input'").sample(frac=0.5, random_state=42)

condition = annotations_df.index.isin(to_modify.index)
# Inputs that were not sampled; computed before any relabelling happens.
remaining_input = (annotations_df.category == "Input").to_numpy(dtype=bool) & ~condition

# Assign the category column only. The previous whole-row
# `.loc[~condition].replace("Input", "text_field")` touched every other row of
# the frame and rewrote any column (e.g. text) whose value was exactly "Input".
annotations_df.loc[condition, "category"] = "dropdown_menu"
annotations_df.loc[remaining_input, "category"] = "text_field"

#### Label

In [44]:
# Labels taller than the 75th height percentile (the tallest quarter) that are
# also narrower than they are tall are relabelled as chip.
q3 = annotations_df.query("category == 'label'").height.quantile(0.75)


to_modify = annotations_df.query(f"category == 'label' & height > {q3} & width < height")

condition = annotations_df.index.isin(to_modify.index)
# Assign the category column only: DataFrame.replace on whole rows would also
# turn a text value of exactly "label" into "chip".
annotations_df.loc[condition, "category"] = "chip"

In [45]:
# Class distribution after splitting Input and label.
annotations_df.category.value_counts()

label                     442664
image                     396817
button                    138787
card                       16889
chip                       11942
text_field                  8672
dropdown_menu               8672
menu                        6642
text_area                   4491
alert                       3959
radio_button_unchecked      2710
radio_button_checked        2709
checkbox_checked            2121
checkbox_unchecked          2120
slider                      2016
switch_enabled              1052
switch_disabled             1051
Name: category, dtype: int64

In [46]:
# Checkpoint after the category mapping; snapshot() also resets the index in place.
snapshot(annotations_df, "2")

# Drop all outlier screenshots

In [47]:
# Number of annotated elements per screenshot (one entry per filename).
category_count_per_file = annotations_df.groupby("filename")["category"].count()

In [48]:
# Quartiles and inter-quartile range of the per-screenshot element counts.
q1 = category_count_per_file.quantile(.25)
q3 = category_count_per_file.quantile(.75)
iqr = q3-q1
q1, q3, iqr

(6.0, 23.0, 17.0)

In [49]:
# 1.5 * IQR fences; counts outside [outlier_min, outlier_max] are treated as outliers.
outlier_min = q1 - 1.5 * iqr
outlier_max = q3 + 1.5 * iqr
outlier_min, outlier_max

(-19.5, 48.5)

In [50]:
# Screenshots whose element count lies within the fences (between() includes both ends).
category_count_per_file_without_outliers = category_count_per_file[category_count_per_file.between(outlier_min, outlier_max)]

In [51]:
# Keep only the annotations that belong to the non-outlier screenshots.
annotations_df = annotations_df[annotations_df.filename.isin(category_count_per_file_without_outliers.index)]

In [52]:
# Display the remaining annotations.
annotations_df

Unnamed: 0,category,xmin,ymin,xmax,ymax,width,height,area,level,is_parent,text,filename,image_width,image_height
0,label,14,86,1426,167,1412,81,114372,2,False,Santa Scanner,22161.jpg,1440,2560
1,image,35,1849,385,2199,350,350,122500,2,False,-,22161.jpg,1440,2560
2,image,1055,1849,1405,2199,350,350,122500,2,False,-,22161.jpg,1440,2560
3,image,1125,1464,1405,1744,280,280,78400,2,False,-,22161.jpg,1440,2560
4,image,385,1989,1055,2199,670,210,140700,2,False,-,22161.jpg,1440,2560
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1053309,slider,215,2280,1316,2392,1101,112,123312,2,False,-,30576.jpg,1440,2560
1053310,label,1316,2305,1405,2367,89,62,5518,2,False,0:09,30576.jpg,1440,2560
1053311,label,70,154,1370,838,1300,684,889200,2,False,Pattern recorded!,25154.jpg,1440,2560
1053312,button,140,2070,699,2210,559,140,78260,2,False,Cancel,25154.jpg,1440,2560


## Drop all outlier elements from each category by area (aspect ratio is handled in the next section)

In [53]:
# Per category, find the elements whose area lies outside the 1.5 * IQR fences.
# The labels are collected first and dropped once after the loop, so the frame
# is not modified while its own groupby is still being iterated.
area_outlier_index = []
for category, df in annotations_df.groupby('category'):
    q1 = df.area.quantile(0.25)
    q3 = df.area.quantile(0.75)
    iqr = q3 - q1
    outlier_min = q1 - 1.5 * iqr
    outlier_max = q3 + 1.5 * iqr
    outliers_df = df[~df.area.between(outlier_min, outlier_max)]
    print(category, "-- Outliers to be dropped --" , len(outliers_df))
    area_outlier_index.extend(outliers_df.index)

# drop() without inplace returns a new, independent frame.
annotations_df = annotations_df.drop(index=area_outlier_index)

alert -- Outliers to be dropped -- 0
button -- Outliers to be dropped -- 3235
card -- Outliers to be dropped -- 1007
checkbox_checked -- Outliers to be dropped -- 709
checkbox_unchecked -- Outliers to be dropped -- 658
chip -- Outliers to be dropped -- 1586
dropdown_menu -- Outliers to be dropped -- 0
image -- Outliers to be dropped -- 54635
label -- Outliers to be dropped -- 25349
menu -- Outliers to be dropped -- 27
radio_button_checked -- Outliers to be dropped -- 193
radio_button_unchecked -- Outliers to be dropped -- 173
slider -- Outliers to be dropped -- 90
switch_disabled -- Outliers to be dropped -- 188
switch_enabled -- Outliers to be dropped -- 190
text_area -- Outliers to be dropped -- 384
text_field -- Outliers to be dropped -- 0


In [54]:
# (rows, columns) after dropping the area outliers.
annotations_df.shape

(813380, 14)

## Remove all outlier elements of aspect_ratio (except image)

In [55]:
# Per category (images excepted), find the elements whose width/height ratio
# lies outside the 1.5 * IQR fences. The labels are collected first and dropped
# once after the loop, so the frame is not modified while its own groupby is
# still being iterated.
ratio_outlier_index = []
for category, df in annotations_df.groupby('category'):
    if category == "image":
        continue
    # Column-wise division instead of a row-by-row apply.
    aspect_ratios = df["width"] / df["height"]
    q1 = aspect_ratios.quantile(0.25)
    q3 = aspect_ratios.quantile(0.75)
    iqr = q3 - q1
    outlier_min = q1 - 1.5 * iqr
    outlier_max = q3 + 1.5 * iqr
    outliers_df = df[~aspect_ratios.between(outlier_min, outlier_max)]
    print(category, "-- Outliers to be dropped --" , len(outliers_df))
    ratio_outlier_index.extend(outliers_df.index)

# drop() without inplace returns a new, independent frame.
annotations_df = annotations_df.drop(index=ratio_outlier_index)

alert -- Outliers to be dropped -- 13
button -- Outliers to be dropped -- 3777
card -- Outliers to be dropped -- 646
checkbox_checked -- Outliers to be dropped -- 0
checkbox_unchecked -- Outliers to be dropped -- 0
chip -- Outliers to be dropped -- 0
dropdown_menu -- Outliers to be dropped -- 71
label -- Outliers to be dropped -- 10550
menu -- Outliers to be dropped -- 1239
radio_button_checked -- Outliers to be dropped -- 267
radio_button_unchecked -- Outliers to be dropped -- 301
slider -- Outliers to be dropped -- 60
switch_disabled -- Outliers to be dropped -- 48
switch_enabled -- Outliers to be dropped -- 10
text_area -- Outliers to be dropped -- 0
text_field -- Outliers to be dropped -- 76


In [56]:
# (rows, columns) after dropping the aspect-ratio outliers.
annotations_df.shape


(796322, 14)

In [57]:
# Checkpoint after outlier removal. snapshot() also resets the index in place,
# so from here on the index labels run 0..n-1 and equal the row positions.
snapshot(annotations_df, "3")

# Remove Overlapping UI elements

In [58]:
# Append a `to_keep` column as the last column, set to True for every row.
annotations_df.insert(len(annotations_df.columns), 'to_keep', True)

In [59]:
def select_non_overlapping(df):
    """Mark overlapping boxes of one screenshot for removal in the global ``annotations_df``.

    Boxes are visited in row order. A box that is still kept clears the
    ``to_keep`` flag of every other box in ``df`` that it collides with; boxes
    already cleared are skipped and so do not clear anything themselves.

    Args:
        df: the rows of a single screenshot (a group of ``annotations_df``),
            providing ``xmin``, ``ymin``, ``width`` and ``height``. Its index
            labels must be labels of ``annotations_df``.

    Returns:
        None. The result is written to ``annotations_df["to_keep"]``.
    """
    rects = {idx: Rect(r.xmin, r.ymin, r.width, r.height) for idx, r in df.iterrows()}

    for idx, rect in rects.items():
        # Label-based access: correct for any index, not only when the index
        # labels happen to equal the row positions.
        if not annotations_df.loc[idx, "to_keep"]:
            continue

        others = {key: value for key, value in rects.items() if key != idx}

        # Second argument 1 -> collide against the dict values; the result is a
        # list of (key, value) pairs.
        for other_idx, _ in rect.collidedictall(others, 1):
            annotations_df.loc[other_idx, "to_keep"] = False

In [60]:
# Resolve overlaps screenshot by screenshot; the chained-assignment check is
# switched off while select_non_overlapping writes into annotations_df.
with ChainedAssignment():
    for i, df in annotations_df.groupby('filename'):
        select_non_overlapping(df)

In [61]:
# Drop every element that was flagged as overlapping.
to_remove = annotations_df.query("to_keep == False")

annotations_df.drop(index=to_remove.index, inplace=True)

In [62]:
# The helper column has served its purpose.
annotations_df.drop(columns=["to_keep"], inplace=True)

In [63]:
# Checkpoint after overlap removal; snapshot() also resets the index in place.
snapshot(annotations_df, "4")

# Find positions for other elements

In [64]:
# Remove elements that reach or cross the right edge of the screen.
to_remove = annotations_df.query("xmax >= image_width")

annotations_df.drop(index=to_remove.index, inplace=True)

In [65]:
# Remove elements that reach or cross the bottom edge of the screen.
to_remove = annotations_df.query("ymax >= image_height")

annotations_df.drop(index=to_remove.index, inplace=True)

## Tooltip positions

In [66]:
# The whole 1440 x 2560 screen; tooltip slots must lie fully inside it.
frame_rect = Rect(0, 0, 1440, 2560)


def find_tooltip_pos(data):
    """Find free tooltip slots directly below or above the elements of one screenshot.

    Args:
        data: tuple ``(filename, df, width, height)`` where ``df`` holds the
            screenshot's annotations (``xmin``, ``ymin``, ``width``, ``height``)
            and ``width``/``height`` is the size of the tooltip slot.

    Returns:
        ``(filename, candidates)`` with ``candidates`` mapping an element's index
        label to the ``Rect`` of a slot that lies inside the frame, touches no
        existing element and is not knocked out by another accepted slot.
    """
    filename, df, width, height = data
    offset = 20  # gap between the element and its tooltip slot

    rects = {idx: Rect(r.xmin, r.ymin, r.width, r.height) for idx, r in df.iterrows()}

    candidates = {}
    for key, box in rects.items():
        below = Rect(box.left, box.top + box.height + offset, width, height)
        above = Rect(box.left, box.top - height - offset, width, height)

        # Order matters: when both slots are usable, the one above replaces the
        # one below for this element.
        for slot in (below, above):
            if frame_rect.contains(slot) and not slot.collidedictall(rects, 1):
                candidates[key] = slot

    # Greedy de-duplication in insertion order: a surviving slot knocks out
    # every other candidate it collides with.
    rejected = set()
    for key, slot in candidates.items():
        if key in rejected:
            continue

        others = {k: v for k, v in candidates.items() if k != key}
        rejected.update(hit_key for hit_key, _ in slot.collidedictall(others, 1))

    candidates = {k: v for k, v in candidates.items() if k not in rejected}

    return filename, candidates

In [67]:
# One work item per screenshot: (filename, its annotations, slot width 450, slot height 300).
params = [(filename, a_df, 450, 300) for filename, a_df in annotations_df.groupby("filename")]

# Run find_tooltip_pos over all work items with fastcore's parallel().
tooltip_f_pos = parallel(find_tooltip_pos, params)

# Keep only the screenshots with at least one slot (non-empty candidates dict).
tooltip_f_pos = tooltip_f_pos.filter(lambda x: x[1] )

### Take a sample of tooltips and merge it to annotations

In [68]:
# Shuffle and keep at most 3500 screenshots (the shuffle is not seeded).
sample_tooltip_f_pos = tooltip_f_pos.shuffle()[:3500]
# Total number of tooltip slots across the sampled screenshots.
tooltip_possible_count = reduce(lambda acc, x: acc + len(x[1]), sample_tooltip_f_pos, 0)

In [69]:
def rect_to_annotation(filename, category, r, min_w, max_w, min_h, max_h):
    """Build an annotation record anchored at the top-left corner of ``r``.

    The width is drawn from ``[min_w, max_w)`` and the height from
    ``[min_h, max_h)`` with ``np.random.randint``. The record is a dict keyed
    by ``column_names``, with a fixed 1440 x 2560 image size, level 1,
    ``is_parent`` False and an empty text.
    """
    # Width first, then height, so the random draws happen in a fixed order.
    w = np.random.randint(min_w, max_w)
    h = np.random.randint(min_h, max_h)
    left, top = r.left, r.top
    values = (
        filename, 1440, 2560, category,
        left, top, left + w, top + h,
        w, h, w * h,
        1, False, '',
    )
    return dict(zip(column_names, values))

# Collect one synthetic 'tooltip' annotation per free slot of the sampled screenshots.
tooltip_df_data = L()

for filename, rects in sample_tooltip_f_pos:
    for r in rects.values():
        # Tooltip size: width in [350, 450), height in [200, 300).
        annotation = rect_to_annotation(filename, 'tooltip', r, 350, 450, 200, 300)
        tooltip_df_data.append(annotation)

tooltip_df = pd.DataFrame(tooltip_df_data)

# Append the tooltips; ignore_index=True renumbers the combined frame.
annotations_df = pd.concat([annotations_df, tooltip_df], ignore_index=True)

In [70]:
# Checkpoint after adding tooltips.
snapshot(annotations_df, "5_1")

## FAB

In [71]:
def find_free_spots(data):
    """Return the filename if ``rect`` touches none of the screenshot's elements.

    Args:
        data: tuple ``(rect, filename, a_df)``; ``a_df`` holds the screenshot's
            annotations (``xmin``, ``ymin``, ``width``, ``height``).

    Returns:
        ``filename`` when the region is free, otherwise None.
    """
    rect, filename, a_df = data
    occupied = [Rect(row.xmin, row.ymin, row.width, row.height) for _, row in a_df.iterrows()]
    # collidelist returns -1 when nothing in the list collides with rect.
    region_is_free = rect.collidelist(occupied) == -1
    return filename if region_is_free else None

In [72]:
# Region reserved for a floating action button: 300 x 300 with its top-left corner at (1100, 2020).
fab_rect = Rect(1100, 2020, 300, 300)

# One work item per screenshot.
params = [(fab_rect, filename, a_df) for filename, a_df in annotations_df.groupby("filename")]

fab_filenames = parallel(find_free_spots, params)

# find_free_spots returns None for screenshots where the region is occupied.
fab_filenames = fab_filenames.filter(lambda x: x is not None)

In [73]:
# Shuffle and keep at most 3500 screenshots (the shuffle is not seeded).
sample_fab_filenames = fab_filenames.shuffle()[:3500]

In [74]:
def data_to_annotation(filename, category, r, min_x, max_x, min_y, max_y, min_w, max_w, min_h, max_h, max_attempts=100_000):
    """Build an annotation record for a random box that lies fully inside ``r``.

    Position and size are drawn with ``np.random.randint`` (upper bounds
    exclusive) and redrawn until the box fits into ``r``.

    Args:
        filename: screenshot the record belongs to.
        category: class name stored in the record.
        r: rectangle that must contain the generated box.
        min_x, max_x, min_y, max_y: range of the top-left corner.
        min_w, max_w, min_h, max_h: range of the box size.
        max_attempts: upper bound on the number of draws, so ranges that can
            never fit inside ``r`` fail instead of looping forever.

    Returns:
        dict keyed by ``column_names`` with a fixed 1440 x 2560 image size,
        level 1, ``is_parent`` False and an empty text.

    Raises:
        ValueError: if no fitting box was drawn within ``max_attempts`` draws.
    """
    for _ in range(max_attempts):
        x = np.random.randint(min_x, max_x)
        y = np.random.randint(min_y, max_y)
        w = np.random.randint(min_w, max_w)
        h = np.random.randint(min_h, max_h)

        if r.contains(Rect(x, y, w, h)):
            break
    else:
        raise ValueError(
            f"no box inside {r} found after {max_attempts} attempts for category {category!r}"
        )

    data = [filename, 1440, 2560, category, x, y, x+w, y+h, w, h, w*h , 1, False, '']
    annotation = dict(zip(column_names, data))
    return annotation

# One synthetic 'floating_action_button' per sampled screenshot, placed inside fab_rect.
fab_df_data = L()

for filename in sample_fab_filenames:
    # Corner x in [1100, 1150), y in [2020, 2070); width and height in [250, 300).
    annotation = data_to_annotation(filename, 'floating_action_button', fab_rect, 1100, 1150, 2020, 2070, 250, 300, 250, 300)
    fab_df_data.append(annotation)
    
fab_df = pd.DataFrame(fab_df_data)

# Append the buttons; ignore_index=True renumbers the combined frame.
annotations_df = pd.concat([annotations_df, fab_df], ignore_index=True)

In [75]:
# Checkpoint after adding floating action buttons.
snapshot(annotations_df, "5_2")

## Data Table & Grid List

In [76]:
# Region reserved for a data table / grid list: 1400 x 1680 with its top-left corner at (20, 340).
dt_rect = Rect(20, 340, 1400, 1680)

# One work item per screenshot.
params = [(dt_rect, filename, a_df) for filename, a_df in annotations_df.groupby("filename")]

dt_gl_filenames = parallel(find_free_spots, params)

# find_free_spots returns None for screenshots where the region is occupied.
dt_gl_filenames = dt_gl_filenames.filter(lambda x: x is not None)

In [77]:
# Shuffle and keep at most 7000 screenshots (the shuffle is not seeded).
sample_dt_gl_filenames = dt_gl_filenames.shuffle()[:7000]

In [78]:
# First 3500 screenshots get a data table, the rest a grid list
# (the second part is shorter if fewer than 7000 screenshots qualified).
dt_filenames, gl_filenames = sample_dt_gl_filenames[:3500], sample_dt_gl_filenames[3500:]

In [79]:
# One synthetic 'data_table' per selected screenshot, placed inside dt_rect.
dt_df_data = L()

for filename in dt_filenames:
    # Corner x in [20, 60), y in [340, 440); width in [1300, 1400), height in [1580, 1680).
    annotation = data_to_annotation(filename, 'data_table', dt_rect, 20, 60, 340, 440, 1300, 1400, 1580, 1680)
    dt_df_data.append(annotation)

dt_df = pd.DataFrame(dt_df_data)

# Append the data tables; ignore_index=True renumbers the combined frame.
annotations_df = pd.concat([annotations_df, dt_df], ignore_index=True)

In [80]:
# One synthetic 'grid_list' per selected screenshot, placed inside dt_rect.
gl_df_data = L()

for filename in gl_filenames:
    # Corner x in [20, 60), y in [340, 440); width in [1300, 1400), height in [1580, 1680).
    annotation = data_to_annotation(filename, 'grid_list', dt_rect, 20, 60, 340, 440, 1300, 1400, 1580, 1680)
    gl_df_data.append(annotation)

gl_df = pd.DataFrame(gl_df_data)

# Append the grid lists; ignore_index=True renumbers the combined frame.
annotations_df = pd.concat([annotations_df, gl_df], ignore_index=True)

In [81]:
# Checkpoint after adding all synthetic elements.
snapshot(annotations_df, "5")

In [82]:
# Final class distribution, synthetic elements included.
annotations_df.category.value_counts()

label                     224867
image                     190828
button                     72400
tooltip                     7584
card                        7464
text_field                  6149
dropdown_menu               6067
chip                        4893
menu                        3654
floating_action_button      3500
grid_list                   3500
data_table                  3500
alert                       3481
text_area                   2698
radio_button_checked        1628
radio_button_unchecked      1611
checkbox_unchecked          1124
slider                      1115
checkbox_checked            1075
switch_disabled              457
switch_enabled               435
Name: category, dtype: int64

In [83]:
# Export all annotations (written before the train/valid/test column is added).
annotations_df.to_csv(dataset_path/"SynZ_ready_annotations_full.csv", index=False)

In [84]:
# Distinct screenshots; the split below is made per screenshot, not per element.
filenames = annotations_df.filename.unique()

In [85]:
# Hold out 10% of the screenshots for testing, then 10% of the remainder for
# validation; both splits use a fixed seed.
train_val_filenames, test_filenames = train_test_split(filenames, test_size=0.1, random_state=42)
train_filenames, val_filenames = train_test_split(train_val_filenames, test_size=0.1, random_state=42)

In [86]:
# Number of screenshots in the train / validation / test splits.
len(train_filenames), len(val_filenames), len(test_filenames)

(47352, 5262, 5846)

In [87]:
# Add the split column as the second column; every row starts as "train".
annotations_df.insert(1, "train_test_valid", "train")

In [88]:
# Elements of validation screenshots.
annotations_df.loc[annotations_df.filename.isin(val_filenames), "train_test_valid"] = "valid"

In [89]:
# Elements of test screenshots.
annotations_df.loc[annotations_df.filename.isin(test_filenames), "train_test_valid"] = "test"

In [90]:
# Number of elements (rows) per split.
annotations_df.train_test_valid.value_counts()

train    442424
test      54149
valid     51457
Name: train_test_valid, dtype: int64

In [91]:
# Export the annotations together with their train/valid/test assignment.
annotations_df.to_csv(dataset_path/"SynZ_ready_annotations.csv", index=False)