In [1]:
import functools
import json
import math
import os

import pandas as pd
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Sentence-embedding model shared by extract_semantic_features, which calls
# semantic_model.encode(...) on text; instantiated once here so it is not
# re-created per node.
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Reference width that extract_features divides node widths by. Starts unset
# and is filled in by extract_features through its `global body_width`.
body_width = None

# Length of the placeholder vector; all-MiniLM-L6-v2 embeddings have 384 dimensions.
_EMBEDDING_DIM = 384
# Sentinel distance reported when no usable text node exists.
_NO_TEXT_DISTANCE = 9999999


def _default_semantic_features():
    """Return a fresh placeholder result (zero vector, sentinel distance)."""
    return {
        "nearest_text_semantic_vector": [0] * _EMBEDDING_DIM,  # Default zero vector
        "nearest_text_semantic_distance": _NO_TEXT_DISTANCE,
    }


@functools.lru_cache(maxsize=4096)
def _embed_text(text):
    """Encode *text* with the module-level ``semantic_model`` and memoise it.

    The same label is usually the nearest text for many nodes, so caching
    avoids re-running the model for every node. The result is stored as an
    immutable tuple; callers convert it to a list. The cache is bounded so a
    long run over many files cannot grow without limit. Call
    ``_embed_text.cache_clear()`` if ``semantic_model`` is replaced.
    """
    return tuple(semantic_model.encode(text).tolist())


def extract_semantic_features(node, text_nodes_with_content):
    """
    Calculate the semantic features from the nearest text node.

    The node's centre is compared (Euclidean distance) with the ``x``/``y``
    of every entry in ``text_nodes_with_content``; the closest entry's text is
    embedded with ``semantic_model``.

    Args:
    node (dict): Current node being processed; geometry is read from node["node"]
    text_nodes_with_content (list): List of text nodes with their x, y coordinates and text content

    Returns:
    dict: ``nearest_text_semantic_vector`` (list of floats) and
    ``nearest_text_semantic_distance``. When the list is empty, or the nearest
    entry has no text, a zero vector and the distance 9999999 are returned.
    """
    if not text_nodes_with_content:
        return _default_semantic_features()

    # Get current node's center coordinates
    node_data = node.get("node", {})
    x = node_data.get("x", 0) + node_data.get("width", 0) / 2
    y = node_data.get("y", 0) + node_data.get("height", 0) / 2

    # Find the nearest text node; the strict '<' keeps the first one on ties.
    min_distance = float('inf')
    nearest_text_node = None
    for text_node in text_nodes_with_content:
        tx, ty = text_node['x'], text_node['y']
        distance = math.sqrt((x - tx) ** 2 + (y - ty) ** 2)
        if distance < min_distance:
            min_distance = distance
            nearest_text_node = text_node

    # NOTE(review): a blank nearest text node yields the placeholder even if a
    # farther node has text. Kept as-is so the generated features do not
    # change; confirm whether blank nodes should be skipped instead.
    if not nearest_text_node or not nearest_text_node.get('text'):
        return _default_semantic_features()

    text = nearest_text_node['text']
    if isinstance(text, str):
        semantic_vector = list(_embed_text(text))
    else:
        # Unhashable / non-string payloads bypass the cache.
        semantic_vector = semantic_model.encode(text).tolist()

    return {
        "nearest_text_semantic_vector": semantic_vector,  # Plain list for JSON serialization
        "nearest_text_semantic_distance": min_distance,
    }

def color_difference(color1, color2):
    """
    Measure how far apart two RGB colors are as a normalized Euclidean distance.

    The straight-line distance between the two colors in RGB space is divided
    by the largest distance possible for 0-255 channels, sqrt(3 * 255^2), so
    identical colors give 0 and black versus white gives 1.
    If either argument is falsy (for example None), 0 is returned.
    """
    if not color1 or not color2:
        return 0

    # Each color must unpack into exactly three channels
    red_a, green_a, blue_a = color1
    red_b, green_b, blue_b = color2

    delta_red = red_b - red_a
    delta_green = green_b - green_a
    delta_blue = blue_b - blue_a
    raw_distance = math.sqrt(delta_red**2 + delta_green**2 + delta_blue**2)

    # Scale by the black-to-white distance to land in the 0-1 range
    largest_distance = math.sqrt(3 * 255**2)
    return raw_distance / largest_distance

def _collect_text_nodes(node):
    """Return centre coordinates and stripped text of every TEXT node in the subtree."""
    collected = []
    node_data = node.get("node", {})
    if node_data.get('type', '') == "TEXT":
        collected.append({
            'x': node_data.get("x", 0) + node_data.get("width", 0) / 2,
            'y': node_data.get("y", 0) + node_data.get("height", 0) / 2,
            'text': node_data.get('characters', '').strip(),
        })
    for child in node.get("children", []):
        collected.extend(_collect_text_nodes(child))
    return collected


def _count_descendants(node):
    """Count every node below *node* (children, grandchildren, ...)."""
    return sum(1 + _count_descendants(child) for child in node.get("children", []))


def _count_descendant_chars(node):
    """Sum the length of ``characters`` over every node below *node*."""
    return sum(
        len(child.get("node", {}).get("characters", "")) + _count_descendant_chars(child)
        for child in node.get("children", [])
    )


def _center_of_weight_diff(node):
    """Return |node.x - area-weighted mean child centre x| / node width.

    Returns the sentinel 9999999 when the node has no (or zero) width.
    """
    node_data = node.get("node", {})
    width = node_data.get("width", 0)
    if not width:
        return 9999999
    total_area = 0
    weighted_sum = 0
    for child in node.get("children", []):
        child_data = child.get("node", {})
        child_width = child_data.get("width", 0)
        area = child_width * child_data.get("height", 0)
        weighted_sum += area * (child_data.get("x", 0) + child_width / 2)
        total_area += area
    weighted_x = weighted_sum / (total_area if total_area else 1)
    return abs(node_data.get("x", 0) - weighted_x) / width


def _rgb255(color):
    """Convert a colour dict with 0-1 ``r``/``g``/``b`` entries to a 0-255 int tuple."""
    return (
        int(color.get("r", 0) * 255),
        int(color.get("g", 0) * 255),
        int(color.get("b", 0) * 255),
    )


def extract_features(node, depth=0, parent_tag=None, sibling_count=0, parent_tag_html=None, prev_sibling_tag=None, parent_height=0, parent_bg_color=None, text_nodes=None):
    """Flatten a tree node and all of its descendants into feature dicts.

    Each tree element is a dict with a ``tag``, a ``node`` dict holding the
    geometry/style properties, and an optional ``children`` list. The result
    is in pre-order: the node's own feature dict first, then the features of
    each child subtree.

    Args:
        node (dict): Tree element to process.
        depth (int): Depth of ``node`` below the root (root is 0).
        parent_tag (str | None): ``type`` of the parent's ``node`` dict.
        sibling_count (int): Number of siblings (parent's children minus one).
        parent_tag_html (str | None): ``tag`` of the parent element.
        prev_sibling_tag (str | None): ``tag`` of the preceding sibling.
        parent_height (number): Parent height used to normalise ``height``.
        parent_bg_color (tuple | None): Alpha-scaled RGB of the closest
            ancestor solid fill, used for ``distinct_background``.
        text_nodes (list | None): Pre-collected text nodes. Leave as ``None``
            on the entry call; the recursion passes the list down.

    Returns:
        list[dict]: One feature dict per node in the subtree.

    Side effects:
        Writes the module-level ``body_width``. It is reset on every entry
        call (``text_nodes is None``) so each tree is normalised by its own
        root width rather than by the first tree ever processed.

    Relies on ``extract_semantic_features`` and ``color_difference`` defined
    in this module.
    """
    global body_width

    if text_nodes is None:
        # Entry call: gather all text nodes once for the whole tree and drop
        # any width left over from a previously processed tree.
        text_nodes = _collect_text_nodes(node)
        body_width = None

    features = []

    tag = node.get("tag", "")
    node_data = node.get("node", {})
    node_type = str(node_data.get("type", ""))

    text = node_data.get("characters", "")
    text_length = len(text)
    word_count = len(text.split()) if text else 0
    contains_number = any(ch.isdigit() for ch in text)
    contains_special_chars = any(not ch.isalnum() and not ch.isspace() for ch in text)

    children = node.get("children", [])
    num_direct_children = len(children)
    is_leaf = 1 if num_direct_children == 0 else 0

    # Node geometry; the first node with a non-zero width defines body_width
    node_width = node_data.get("width", 0)
    if not body_width:
        body_width = node_width
    node_height = node_data.get("height", 0)
    node_area = node_width * node_height

    # Tag and share of this node's area for each of the first three children
    child_tags = [None, None, None]
    child_percents = [0, 0, 0]
    for index, child in enumerate(children[:3]):
        child_data = child.get("node", {})
        child_area = child_data.get("width", 0) * child_data.get("height", 0)
        child_tags[index] = child.get("tag", "")
        child_percents[index] = (child_area / node_area) if node_area > 0 else 0

    num_children_to_end = _count_descendants(node)
    chars_count_to_end = _count_descendant_chars(node)
    bg_color = None
    feature = {
        "tag": tag,
        "type": node_type,
        "x": node_data.get("x", 0),
        "y": node_data.get("y", 0),
        "width": node_width / (body_width if body_width else 1),
        # Relative to the parent; falls back to the node itself (ratio 1) at the root
        "height": node_height / (parent_height if parent_height else node_height if node_height else 1),
        "characters": text,
        "has_text": int(bool(text)),
        "depth": depth,
        "num_direct_children": num_direct_children,
        "num_children_to_end": num_children_to_end,  # Total descendants count
        "parent_tag": parent_tag if parent_tag else "",
        "parent_tag_html": parent_tag_html if parent_tag_html else "",
        "sibling_count": sibling_count,
        "prev_sibling_html_tag": prev_sibling_tag if prev_sibling_tag else "",
        "is_leaf": is_leaf,
        "font_size": node_data.get("fontSize", 16),
        "has_font_size": int("fontSize" in node_data),
        "font_name": node_data.get("fontName", {}).get("style", "") if node_data.get("fontName") else "normal",
        "has_text_color": 0, "color_r": 0, "color_g": 0, "color_b": 0,
        "has_background_color": 0, "background_r": 0, "background_g": 0, "background_b": 0,
        "border_radius": 0,
        "border_r": 0, "border_g": 0, "border_b": 0,
        "has_border": 0, "border_opacity": 0,
        "border_weight": node_data.get("strokeWeight", 0),
        "has_shadow": 0, "shadow_r": 0, "shadow_g": 0, "shadow_b": 0,
        "shadow_radius": 0,
        "text_length": text_length,
        "chars_count_to_end": chars_count_to_end,
        "word_count": word_count,
        "contains_number": int(contains_number),
        "contains_special_chars": int(contains_special_chars),
        "aspect_ratio": node_width / node_height if node_height > 0 else 0,
        "child_1_html_tag": child_tags[0],
        "child_2_html_tag": child_tags[1],
        "child_3_html_tag": child_tags[2],
        "child_1_percentage_of_parent": child_percents[0],
        "child_2_percentage_of_parent": child_percents[1],
        "child_3_percentage_of_parent": child_percents[2],
        "distinct_background": 0,
        "center_of_weight_diff": _center_of_weight_diff(node),
    }

    # First solid fill: recorded as both the text colour and the background colour
    for fill in node_data.get("fills", []):
        if fill.get("type") == "SOLID" and "color" in fill:
            r, g, b = _rgb255(fill["color"])
            feature["color_r"], feature["color_g"], feature["color_b"] = r, g, b
            feature["has_text_color"] = 1  # Flag indicating explicit text color is set

            feature["background_r"], feature["background_g"], feature["background_b"] = r, g, b
            feature["has_background_color"] = 1  # Flag for explicit background color

            # Scale the colour by the smaller of colour alpha and fill opacity
            a = min(float(fill["color"].get("a", 1)), float(fill.get("opacity", 1)))
            bg_color = (r * a, g * a, b * a)
            if parent_bg_color:
                # Normalised difference above 0.2 counts as visually distinct
                if color_difference(bg_color, parent_bg_color) > 0.2:
                    feature["distinct_background"] = 1
            break

    # First solid entry in "backgrounds" overrides the background colour columns
    for bg in node_data.get("backgrounds", []):
        if bg.get("type") == "SOLID" and "color" in bg:
            feature["background_r"], feature["background_g"], feature["background_b"] = _rgb255(bg["color"])
            feature["has_background_color"] = 1  # Flag for explicit background color
            break

    # Extract strokes (borders); only the first stroke is considered
    strokes = node_data.get("strokes", [])
    if strokes:
        stroke = strokes[0]
        feature["has_border"] = 1
        if "color" in stroke:
            feature["border_r"], feature["border_g"], feature["border_b"] = _rgb255(stroke["color"])
        feature["border_opacity"] = stroke.get("opacity", 0)

    # Border radius: mean of the four corners
    corner_radii = [
        node_data.get("topLeftRadius", 0),
        node_data.get("topRightRadius", 0),
        node_data.get("bottomLeftRadius", 0),
        node_data.get("bottomRightRadius", 0),
    ]
    if any(corner_radii):
        feature["border_radius"] = sum(corner_radii) / 4
        # NOTE(review): means of 50 or more are zeroed; presumably to ignore
        # pill/circle shapes - confirm the intent.
        if feature["border_radius"] >= 50:
            feature["border_radius"] = 0

    # Extract shadow from the first DROP_SHADOW effect
    for effect in node_data.get("effects", []):
        if effect.get("type") == "DROP_SHADOW":
            feature["has_shadow"] = 1
            if "color" in effect:
                feature["shadow_r"], feature["shadow_g"], feature["shadow_b"] = _rgb255(effect["color"])
            feature["shadow_radius"] = effect.get("radius", 0)
            break

    # Semantic features from the nearest text node
    semantic_features = extract_semantic_features(node, text_nodes)
    nearest_text_distance = semantic_features.get('nearest_text_semantic_distance', 0)
    nearest_text_semantic = semantic_features.get('nearest_text_semantic_vector', [0] * 384)

    # Distance is normalised by sqrt(area); zero-area nodes keep the raw distance
    size_scale = math.sqrt(node_width * node_height)
    feature["nearest_text_node_dist"] = nearest_text_distance / (size_scale if size_scale else 1)
    feature["nearest_text_semantic"] = nearest_text_semantic
    features.append(feature)

    # Process children, handing each one the tag of its previous sibling
    prev_sib_tag = None
    for child in children:
        features.extend(extract_features(
            child,
            depth=depth + 1,
            parent_tag=node_type,
            sibling_count=len(children) - 1,
            parent_tag_html=tag,
            prev_sibling_tag=prev_sib_tag,
            parent_height=node_height,
            parent_bg_color=bg_color if feature["has_background_color"] else parent_bg_color,
            text_nodes=text_nodes,
        ))
        prev_sib_tag = child.get("tag", "")

    return features

In [4]:



# Folder containing JSON files, and the CSV that collects the extracted features
data_folder = "../json_data"
output_csv_file = "figma_dataset.csv"

# Candidate columns for normalization; every entry is currently disabled
normalize_columns = [
    # "area",
    # "word_count",
    # "text_length",
    # "font_size",
    # "sibling_count",
    # "num_children",
    # "height",
    # "width",
    # "depth",
    # "nearest_text_node_dist"
]

# Feature columns produced by extract_features that are left out of the CSV.
# 'border_radius' and 'height' are intentionally not listed, so they are kept.
dropped_columns = [
    'x', 'y', 'characters',
    'font_size', 'font_name',
    'color_r', 'color_g', 'color_b',
    'background_r', 'background_g', 'background_b',
    'border_r', 'border_g', 'border_b', 'border_opacity', 'border_weight',
    'shadow_r', 'shadow_g', 'shadow_b', 'shadow_radius',
    'word_count', 'contains_special_chars', 'contains_number',
    'has_shadow', 'has_border', 'has_text_color', 'has_text',
    'depth', 'has_font_size',
    'parent_tag', 'parent_tag_html',
    'is_leaf', 'center_of_weight_diff',
    'child_3_html_tag', 'child_3_percentage_of_parent',
    'num_direct_children',
]

# If the output CSV exists, remove it so we start fresh
if os.path.exists(output_csv_file):
    os.remove(output_csv_file)

# Flag to write header only for the first batch
first_batch = True

# Iterate over all JSON files in the data folder. os.listdir order is
# OS-dependent, so sort it to make the CSV row order reproducible.
for filename in sorted(os.listdir(data_folder)):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(data_folder, filename)
    print(f"Processing {file_path}...")

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Extract features using the recursive function starting at the root
    features_list = extract_features(data, depth=0, parent_tag=None, sibling_count=0, parent_tag_html=None, parent_height=0, parent_bg_color=None)
    if not features_list:
        continue  # Skip if no features extracted

    # Drop all excluded columns in one call rather than one copy per column
    df = pd.DataFrame(features_list).drop(columns=dropped_columns)

    # Append this file's rows; only the first batch writes the header
    df.to_csv(output_csv_file, mode='a', header=first_batch, index=False)
    first_batch = False

print(f"Extracted features from all JSON files have been saved to {output_csv_file}")

Processing ../json_data\figmaTree_1.json...
Processing ../json_data\figmaTree_10.json...
Processing ../json_data\figmaTree_100.json...
Processing ../json_data\figmaTree_10000.json...
Processing ../json_data\figmaTree_101.json...
Processing ../json_data\figmaTree_102.json...
Processing ../json_data\figmaTree_103.json...
Processing ../json_data\figmaTree_104.json...
Processing ../json_data\figmaTree_105.json...
Processing ../json_data\figmaTree_106.json...
Processing ../json_data\figmaTree_107.json...
Processing ../json_data\figmaTree_108.json...
Processing ../json_data\figmaTree_109.json...
Processing ../json_data\figmaTree_11.json...
Processing ../json_data\figmaTree_110.json...
Processing ../json_data\figmaTree_111.json...
Processing ../json_data\figmaTree_112.json...
Processing ../json_data\figmaTree_113.json...
Processing ../json_data\figmaTree_114.json...
Processing ../json_data\figmaTree_115.json...
Processing ../json_data\figmaTree_116.json...
Processing ../json_data\figmaTree_11

KeyboardInterrupt: 