In [1]:
import os
import sys
import json
import pandas as pd
import math

# Make the shared Utils folder (two levels above the current working directory)
# importable, so that the `utils` import below resolves
utils_path = os.path.abspath(
    os.path.join(os.getcwd(), "../../Utils/")
)
sys.path.extend([utils_path])

# Import utility functions from the utils module
from utils import (
    verb_ratio,
    is_near_gray,
    find_nearest_text_node,
    color_difference,
    collect_text_nodes,
    count_all_descendants,
    count_chars_to_end,
    get_center_of_weight
)

# Normalization state read and written by extract_features through `global`
# declarations; every value starts unset and is filled in lazily on first use.
body_width = body_height = num_nodes = num_chars = None


In [2]:
# Maximum width (exclusive) at which an INPUT node is re-labelled as a checkbox/radio.
_TOGGLE_MAX_WIDTH = 50


def _rgb255(color):
    """Convert a color dict with 0-1 float "r"/"g"/"b" channels to an (r, g, b) tuple of 0-255 ints."""
    return (
        int(color.get("r", 0) * 255),
        int(color.get("g", 0) * 255),
        int(color.get("b", 0) * 255),
    )


def _visible_paints(paints):
    """Keep only paints that carry a truthy "color" whose alpha ("a", default 1) is above 0."""
    return [
        paint for paint in paints
        if paint and (color := paint.get("color")) and color.get("a", 1) > 0
    ]


def _resolve_input_tag(tag, node_data):
    """Map a narrow INPUT node to CHECKBOX (RECTANGLE) or RADIO (ELLIPSE); return any other tag unchanged."""
    if tag == "INPUT" and node_data.get("width", 0) < _TOGGLE_MAX_WIDTH:
        # A missing figma type is treated as RECTANGLE, i.e. as a checkbox.
        shape = node_data.get("type", "RECTANGLE")
        if shape == "RECTANGLE":
            return "CHECKBOX"
        if shape == "ELLIPSE":
            return "RADIO"
    return tag


def _has_gray_solid_fill(node_data):
    """Return 1 when the first visible SOLID fill of the node is near gray (per is_near_gray), else 0."""
    for fill in _visible_paints(node_data.get("fills", [])):
        if fill.get("type") == "SOLID":
            # Only the first visible SOLID fill is inspected.
            return 1 if is_near_gray(*_rgb255(fill["color"])) else 0
    return 0


def extract_features(node, depth = 0, parent_tag = None, sibling_count = 0,
                    parent_tag_html = None, prev_sibling_tag = None, parent_height = 0,
                    parent_bg_color = None, text_nodes = None):
    """Recursively extract features from a figma node

    Args:
        node : The current node (dict with "tag", "node" and optionally "children").
               Its "tag" entry is overwritten in place with the resolved tag.
        depth : Current depth in the node tree
        parent_tag : figma type of the parent node
        sibling_count : Number of siblings
        parent_tag_html : HTML tag of the parent node
        prev_sibling_tag : HTML tag of the previous sibling
        parent_height : Height of the parent node => (used for normalization)
        parent_bg_color : Background color of the parent => (used for color difference calculation)
        text_nodes : List of text node coordinates => (collected on the root call only then propagated)

    Returns:
        List of feature vectors (one dict per node, parent first, then its subtree
        in child order). An empty list is returned when processing this node
        raises; the error is printed rather than propagated.

    Steps:
        - On a root call (depth 0, no text_nodes passed) clears the module-level
          normalization state, so every tree is normalized by its own root and
          never by values cached from a previously processed tree
        - Collects text nodes on the first pass only
        - Handles special cases for INPUT tags
        - Extract features
    """
    global body_width
    global body_height
    global num_nodes
    global num_chars

    try:
        # Root call: forget normalization values cached from an earlier tree.
        # Without this, every tree after the first would be scaled by the first
        # tree's body size and node/character counts.
        if depth == 0 and text_nodes is None:
            body_width = body_height = num_nodes = num_chars = None

        # First pass only: Collect text nodes
        if text_nodes is None:
            text_nodes = collect_text_nodes(node)

        features = []

        # Determine tag with special handling for INPUT nodes => (For old dataset checkbox and radio were INPUT)
        node_data = node.get("node", {})
        tag = _resolve_input_tag(node.get("tag", ""), node_data)
        # Persist the resolved tag: the parent reads it back as prev_sibling_tag.
        node["tag"] = tag
        node_type = str(node_data.get("type", ""))

        # Extract text-related features
        text = node_data.get("characters", "")
        text_length = len(text)
        word_count = len(text.split()) if text else 0
        contains_number = any(ch.isdigit() for ch in text)
        contains_special_chars = any(not ch.isalnum() and not ch.isspace() for ch in text)

        # Extract structural features
        children = node.get("children", [])
        num_direct_children = len(children)
        is_leaf = 1 if num_direct_children == 0 else 0

        # Calculate node area and initialize global dimensions
        # (the first node with a non-zero size defines the body dimensions)
        node_width = node_data.get("width", 0)
        node_height = node_data.get("height", 0)
        if not body_width:
            body_width = node_width
        if not body_height:
            body_height = node_height
        node_area = node_width * node_height

        has_placeholder = 0
        is_verb = 0

        # Extract child information => (tag of each of the first three children and its share of this node's area)
        child_tags = [None, None, None]
        child_percents = [0.0, 0.0, 0.0]
        for idx, child in enumerate(children[:3]):
            child_tag = child.get("tag", "")
            child_data = child.get("node", {})
            if child_data.get("type", "") == "TEXT":
                # The first TEXT child with a non-zero verb ratio decides is_verb.
                if is_verb == 0:
                    is_verb = verb_ratio(child_data.get("characters", ""))
                # Placeholder detection only looks at the first child.
                if idx == 0:
                    has_placeholder = _has_gray_solid_fill(child_data)
            child_tags[idx] = _resolve_input_tag(child_tag, child_data)
            child_area = child_data.get("width", 0) * child_data.get("height", 0)
            child_percents[idx] = child_area / node_area if node_area > 0 else 0.0

        # Calculate total descendants and characters to the end
        num_children_to_end = count_all_descendants(node)
        if not num_nodes:
            num_nodes = num_children_to_end
        chars_count_to_end = count_chars_to_end(node)
        if not num_chars:
            num_chars = chars_count_to_end

        # Initialize feature Vector
        # NOTE: key order is significant, it becomes the column order of the exported table.
        bg_color = None
        feature = {
            "tag": tag,
            "type": node_type,
            "x": node_data.get("x", 0),
            "y": node_data.get("y", 0) / (body_height if body_height else 1),
            "width": node_width / (body_width if body_width else 1),
            "height": node_height / (parent_height if parent_height else node_height if node_height else 1),
            "characters": text,
            "has_text": int(bool(text)),
            "depth": depth,
            "num_direct_children": num_direct_children,
            "num_children_to_end": num_children_to_end / (num_nodes if num_nodes else 1),
            "parent_tag": parent_tag if parent_tag else "",
            "parent_tag_html": parent_tag_html if parent_tag_html else "",
            "sibling_count": sibling_count,
            "prev_sibling_html_tag": prev_sibling_tag if prev_sibling_tag else "",
            "is_leaf": is_leaf,
            "font_size": node_data.get("fontSize", 16),
            "has_font_size": int("fontSize" in node_data),
            "font_name": node_data.get("fontName", {}).get("style", "") if node_data.get("fontName") else "normal",
            "has_text_color": 0, "color_r": 0, "color_g": 0, "color_b": 0,
            "has_background_color": 0, "background_r": 0, "background_g": 0, "background_b": 0,
            "border_radius": 0,
            "border_r": 0, "border_g": 0, "border_b": 0,
            "has_border": 0, "border_opacity": 0,
            "border_weight": node_data.get("strokeWeight", 0),
            "has_shadow": 0, "shadow_r": 0, "shadow_g": 0, "shadow_b": 0,
            "shadow_radius": 0,
            "text_length": text_length,
            "chars_count_to_end": chars_count_to_end / (num_chars if num_chars else 1),
            "word_count": word_count,
            "contains_number": int(contains_number),
            "contains_special_chars": int(contains_special_chars),
            "aspect_ratio": node_width / node_height if node_height > 0 else 0,
            "child_1_html_tag": child_tags[0],
            "child_2_html_tag": child_tags[1],
            "child_3_html_tag": child_tags[2],
            "child_1_percentage_of_parent": child_percents[0],
            "child_2_percentage_of_parent": child_percents[1],
            "child_3_percentage_of_parent": child_percents[2],
            "distinct_background": 0,
            "center_of_weight_diff": get_center_of_weight(node),
            "is_verb": is_verb,
            "has_placeholder": has_placeholder
        }

        # Extract fills (background and text color) => the first visible SOLID fill feeds both color groups
        for fill in _visible_paints(node_data.get("fills", [])):
            if fill.get("type") == "SOLID":
                r, g, b = _rgb255(fill["color"])
                feature["color_r"], feature["color_g"], feature["color_b"] = r, g, b
                feature["has_text_color"] = 1
                feature["background_r"], feature["background_g"], feature["background_b"] = r, g, b
                feature["has_background_color"] = 1
                # Scale the channels by the smaller of color alpha and paint opacity.
                a = min(float(fill["color"].get("a", 1)), float(fill.get("opacity", 1)))
                bg_color = (r * a, g * a, b * a)
                if parent_bg_color:
                    feature["distinct_background"] = color_difference(bg_color, parent_bg_color)
                break

        # Extract backgrounds for background color => (special case for Group figma type)
        # These are not alpha-filtered and override the fill-derived background channels.
        for bg in node_data.get("backgrounds", []):
            if bg.get("type") == "SOLID" and "color" in bg:
                feature["background_r"], feature["background_g"], feature["background_b"] = _rgb255(bg["color"])
                feature["has_background_color"] = 1
                break

        # Extract strokes (borders) => only the first visible stroke is used
        strokes = _visible_paints(node_data.get("strokes", []))
        if strokes:
            stroke = strokes[0]
            feature["has_border"] = 1
            feature["border_r"], feature["border_g"], feature["border_b"] = _rgb255(stroke["color"])
            feature["border_opacity"] = stroke.get("opacity", 0)

        # Extract border radius => mean of the four corner radii
        corner_radii = [
            node_data.get("topLeftRadius", 0),
            node_data.get("topRightRadius", 0),
            node_data.get("bottomLeftRadius", 0),
            node_data.get("bottomRightRadius", 0),
        ]
        if any(corner_radii):
            feature["border_radius"] = sum(corner_radii) / 4
            # NOTE(review): a mean radius of 50 or more is zeroed out, presumably to
            # discard pill/circle shapes - confirm the intent.
            if feature["border_radius"] >= 50:
                feature["border_radius"] = 0

        # Extract shadow effects => first DROP_SHADOW only
        for effect in node_data.get("effects", []):
            if effect.get("type") == "DROP_SHADOW":
                feature["has_shadow"] = 1
                if "color" in effect:
                    feature["shadow_r"], feature["shadow_g"], feature["shadow_b"] = _rgb255(effect["color"])
                feature["shadow_radius"] = effect.get("radius", 0)
                break

        # Calculate nearest text node distance, scaled by the square root of the node area.
        # The small epsilons keep the ratio finite for zero-area nodes.
        nearest_text_distance = find_nearest_text_node(node, text_nodes)
        area = node_area if node_area > 0 else 0
        feature["nearest_text_node_dist"] = (nearest_text_distance + 0.01) / math.sqrt(area + 0.001)

        features.append(feature)

        # Process children recursively
        # NOTE(review): when has_background_color comes only from "backgrounds",
        # bg_color is still None and children receive no parent color - confirm intended.
        child_parent_bg = bg_color if feature["has_background_color"] and node_type != "GROUP" else parent_bg_color
        prev_sib_tag = None
        for child in children:
            features.extend(extract_features(
                child,
                depth=depth + 1,
                parent_tag=node_type,
                sibling_count=len(children) - 1,
                parent_tag_html=tag,
                prev_sibling_tag=prev_sib_tag,
                parent_height=node_height,
                parent_bg_color=child_parent_bg,
                text_nodes=text_nodes
            ))
            # Read the tag after recursion so an INPUT re-labelled by the child call is seen.
            prev_sib_tag = child.get("tag", "")

    except KeyError as e:
        print(f"Error processing node: Missing key {e}. Skipping node.")
        features = []
    except Exception as e:
        print(f"Unexpected error processing node: {e}. Skipping node.")
        features = []

    return features


In [3]:
# Define input and output paths
DATA_FOLDER = "../../Data/new_json_data9"
OUTPUT_CSV_FILE = "../Output/new_figma_dataset.csv"

# Features removed from the exported dataset => decided by the EDA phase
columns_to_drop = [
    "x", "y", "characters", "font_size", "font_name", "color_r", "color_g", "color_b",
    "background_r", "background_g", "background_b", "border_r", "border_g", "border_b",
    "border_opacity", "border_weight", "shadow_r", "shadow_g", "shadow_b", "shadow_radius",
    "word_count", "contains_special_chars", "contains_number", "has_shadow", "has_border",
    "has_text_color", "has_text", "depth", "has_font_size", "parent_tag", "is_leaf",
    "child_3_html_tag", "child_3_percentage_of_parent", "num_direct_children", "text_length",
    "chars_count_to_end", "num_children_to_end"
]

# Make sure the output directory exists; otherwise every to_csv call below would
# fail and be reported as a per-file processing error.
output_dir = os.path.dirname(OUTPUT_CSV_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Remove existing output CSV so re-running the cell never appends to stale data
if os.path.exists(OUTPUT_CSV_FILE):
    os.remove(OUTPUT_CSV_FILE)

# Flag to write header only for the first batch
first_batch = True

# Process each JSON file in the data folder.
# os.listdir returns entries in arbitrary order, so sort to make the row order
# of the appended CSV reproducible across runs and platforms.
for filename in sorted(os.listdir(DATA_FOLDER)):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(DATA_FOLDER, filename)
    print(f"Processing {file_path}...")

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        # Start each document with clean normalization state so values cached
        # from a previous file are never reused.
        body_width = body_height = num_nodes = num_chars = None

        # Extract features starting from the root node
        features_list = extract_features(data, depth=0, parent_tag=None, sibling_count=0,
                                      parent_tag_html=None, parent_height=0, parent_bg_color=None)
        if not features_list:
            print(f"No features extracted from {file_path}. Skipping.")
            continue

        # Convert to DataFrame and drop the columns excluded by the EDA phase
        # (errors='ignore' tolerates columns that are absent)
        df = pd.DataFrame(features_list).drop(columns=columns_to_drop, errors='ignore')

        # Append to CSV with header for the first batch only
        df.to_csv(OUTPUT_CSV_FILE, mode="a", header=first_batch, index=False)
        first_batch = False

    except json.JSONDecodeError as e:
        print(f"Error decoding JSON file {file_path}: {e}. Skipping.")
    except Exception as e:
        # Best effort: a single bad file must not abort the whole export.
        print(f"Error processing {file_path}: {e}. Skipping.")

if first_batch:
    # Nothing was ever written, so do not claim that a CSV was saved.
    print(f"No features were extracted from any JSON file; {OUTPUT_CSV_FILE} was not created.")
else:
    print(f"Extracted features from all JSON files have been saved to {OUTPUT_CSV_FILE}")


Processing ../../Data/new_json_data9\figmaTree_1.json...
Processing ../../Data/new_json_data9\figmaTree_10.json...
Processing ../../Data/new_json_data9\figmaTree_100.json...
Processing ../../Data/new_json_data9\figmaTree_101.json...
Processing ../../Data/new_json_data9\figmaTree_102.json...
Processing ../../Data/new_json_data9\figmaTree_103.json...
Processing ../../Data/new_json_data9\figmaTree_104.json...
Processing ../../Data/new_json_data9\figmaTree_105.json...
Processing ../../Data/new_json_data9\figmaTree_107.json...
Processing ../../Data/new_json_data9\figmaTree_108.json...
Processing ../../Data/new_json_data9\figmaTree_109.json...
Processing ../../Data/new_json_data9\figmaTree_11.json...
Processing ../../Data/new_json_data9\figmaTree_110.json...
Processing ../../Data/new_json_data9\figmaTree_111.json...
Processing ../../Data/new_json_data9\figmaTree_112.json...
Processing ../../Data/new_json_data9\figmaTree_113.json...
Processing ../../Data/new_json_data9\figmaTree_114.json...
P