In [None]:
import os
import json
import pandas as pd
from collections import Counter
import math

In [None]:
def color_difference(color1, color2):
    """Return the distance between two RGB colours scaled to the range [0, 1].

    Both arguments are unpacked as three-item ``(r, g, b)`` sequences. If either
    one is falsy (``None`` or empty) the function returns ``0``.

    The result is the Euclidean distance over the three channels divided by
    the black-to-white distance on a 0-255 scale.
    """
    if not color1 or not color2:
        return 0

    red_a, green_a, blue_a = color1
    red_b, green_b, blue_b = color2

    squared_gap = (red_b - red_a) ** 2 + (green_b - green_a) ** 2 + (blue_b - blue_a) ** 2

    # Largest possible separation: (0, 0, 0) versus (255, 255, 255).
    full_scale = math.sqrt(3 * 255**2)

    return math.sqrt(squared_gap) / full_scale

In [None]:
# Colour a node's fill is compared against when the parent has no solid fill.
_FALLBACK_PARENT_BG = (125, 125, 125)
# Normalised colour distance above which a fill counts as a "distinct" background.
_DISTINCT_BG_THRESHOLD = 0.3


def _rgb_channels_255(color):
    """Scale a colour mapping with 0-1 ``r``/``g``/``b`` channels to truncated 0-255 ints.

    Channels missing from the mapping default to 0.
    """
    return (
        int(color.get("r", 0) * 255),
        int(color.get("g", 0) * 255),
        int(color.get("b", 0) * 255),
    )


def extract_features(node, depth=0, parent_tag=None, sibling_count=0, parent_tag_html=None, 
                    prev_sibling_tag=None, parent_prev_sibling_tag=None, parent_bg_color=None):
    """Flatten a node tree into one feature row per node, in pre-order.

    Args:
        node: Mapping with optional keys ``"tag"``, ``"node"`` (the node's
            properties) and ``"children"`` (a list of nodes of the same shape).
        depth: Depth of ``node`` in the tree; the root call uses 0.
        parent_tag: ``type`` of the parent node (stored as ``parent_tag``).
        sibling_count: Number of siblings of ``node`` (excluding itself).
        parent_tag_html: ``tag`` of the parent node.
        prev_sibling_tag: ``tag`` of the sibling processed just before ``node``.
        parent_prev_sibling_tag: ``prev_sibling_tag`` of the parent node.
        parent_bg_color: ``(r, g, b)`` of the parent's first solid fill, or
            ``None`` when the parent has none.

    Returns:
        A ``(features, text_nodes, svg_image_nodes)`` tuple of lists of dicts:
        ``features`` holds one row per node of the subtree (this node first),
        ``text_nodes`` the text and box of every node whose type is ``"TEXT"``,
        and ``svg_image_nodes`` the box and ancestry of every node whose tag is
        ``SVG``, ``IMAGE`` or ``PICTURE``.
    """
    features = []
    text_nodes = []
    svg_image_nodes = []

    tag = node.get("tag", "")
    # ``or`` (rather than a .get default) also covers keys that are present but
    # explicitly null in the JSON.
    node_data = node.get("node") or {}
    children = node.get("children") or []
    node_type = str(node_data.get("type", ""))
    is_text_node = node_type == "TEXT"

    text = node_data.get("characters") or ""
    text_length = len(text)
    word_count = len(text.split())
    contains_number = any(ch.isdigit() for ch in text)
    contains_special_chars = any(not ch.isalnum() and not ch.isspace() for ch in text)

    num_children = len(children)

    x = node_data.get("x", 0)
    y = node_data.get("y", 0)
    width = node_data.get("width", 0)
    height = node_data.get("height", 0)

    font_name = node_data.get("fontName")
    font_style = font_name.get("style", "") if font_name else "normal"

    feature = {
        "tag": tag,
        "type": node_type,
        "x": x,
        "y": y,
        "width": width,
        "height": height,
        "characters": text,
        "has_text": int(bool(text)),
        "depth": depth,
        "num_children": num_children,
        # The three subtree totals are placeholders here (to keep the column
        # order) and are filled in once the children have been processed.
        "total_descendants": 0,
        "total_text_nodes": 0,
        "total_text_length": 0,
        "parent_tag": parent_tag if parent_tag else "",
        "parent_tag_html": parent_tag_html if parent_tag_html else "",
        "prev_sibling_tag": prev_sibling_tag if prev_sibling_tag else "",
        "parent_prev_sibling_tag": parent_prev_sibling_tag if parent_prev_sibling_tag else "",
        "sibling_count": sibling_count,
        "is_leaf": 1 if num_children == 0 else 0,
        "font_size": node_data.get("fontSize", 16),
        "has_font_size": int("fontSize" in node_data),
        "font_name": font_style,
        "has_text_color": 0, "color_r": 0, "color_g": 0, "color_b": 0,
        "has_background_color": 0, "background_r": 0, "background_g": 0, "background_b": 0,
        "distinct_background": 0,
        "border_radius": 0,
        "border_r": 0, "border_g": 0, "border_b": 0,
        "has_border": 0, "border_opacity": 0,
        "border_weight": node_data.get("strokeWeight", 0),
        "has_shadow": 0, "shadow_r": 0, "shadow_g": 0, "shadow_b": 0,
        "shadow_radius": 0,
        "text_length": text_length,
        "word_count": word_count,
        "contains_number": int(contains_number),
        "contains_special_chars": int(contains_special_chars),
    }

    # The first solid fill is recorded as both the text colour and the
    # background colour of the node.
    bg_color = None
    for fill in node_data.get("fills") or []:
        if fill.get("type") == "SOLID" and "color" in fill:
            bg_color = _rgb_channels_255(fill["color"])
            feature["color_r"], feature["color_g"], feature["color_b"] = bg_color
            feature["has_text_color"] = 1
            feature["background_r"], feature["background_g"], feature["background_b"] = bg_color
            feature["has_background_color"] = 1

            # Compare against the parent's fill, or a mid-grey when there is none.
            reference_color = parent_bg_color if parent_bg_color else _FALLBACK_PARENT_BG
            if color_difference(bg_color, reference_color) > _DISTINCT_BG_THRESHOLD:
                feature["distinct_background"] = 1
            break

    # Borders: only the first stroke is considered.
    strokes = node_data.get("strokes") or []
    if strokes:
        stroke = strokes[0]
        feature["has_border"] = 1
        if "color" in stroke:
            feature["border_r"], feature["border_g"], feature["border_b"] = _rgb_channels_255(stroke["color"])
        feature["border_opacity"] = stroke.get("opacity", 0)

    # Border radius: mean of the four corner radii when any of them is set.
    corner_radii = [
        node_data.get("topLeftRadius", 0),
        node_data.get("topRightRadius", 0),
        node_data.get("bottomLeftRadius", 0),
        node_data.get("bottomRightRadius", 0),
    ]
    if any(corner_radii):
        feature["border_radius"] = (corner_radii[0] + corner_radii[1] + corner_radii[2] + corner_radii[3]) / 4

    # Shadow: only the first DROP_SHADOW effect is considered.
    for effect in node_data.get("effects") or []:
        if effect.get("type") == "DROP_SHADOW":
            feature["has_shadow"] = 1
            if "color" in effect:
                feature["shadow_r"], feature["shadow_g"], feature["shadow_b"] = _rgb_channels_255(effect["color"])
            feature["shadow_radius"] = effect.get("radius", 0)
            break

    features.append(feature)

    if is_text_node:
        text_nodes.append({
            "characters": text,
            "x": x,
            "y": y,
            "width": width,
            "height": height,
        })

    if tag in ["SVG", "IMAGE", "PICTURE"]:
        svg_image_nodes.append({
            "tag": tag,
            "type": node_type,
            "x": x,
            "y": y,
            "width": width,
            "height": height,
            "depth": depth,
            "parent_tag": parent_tag if parent_tag else "",
            "parent_tag_html": parent_tag_html if parent_tag_html else ""
        })

    # Subtree totals are accumulated from the rows the recursion returns, so
    # each node is visited once instead of re-walking its subtree per node.
    total_descendants = 0
    total_text_nodes = 1 if is_text_node else 0
    total_text_length = text_length if is_text_node else 0

    prev_child_tag = None
    for child in children:
        child_features, child_text_nodes, child_svg_image_nodes = extract_features(
            child,
            depth=depth + 1,
            parent_tag=node_type,
            sibling_count=num_children - 1,
            parent_tag_html=tag,
            prev_sibling_tag=prev_child_tag,
            parent_prev_sibling_tag=prev_sibling_tag,
            parent_bg_color=bg_color,  # None when this node has no solid fill
        )
        features.extend(child_features)
        text_nodes.extend(child_text_nodes)
        svg_image_nodes.extend(child_svg_image_nodes)

        # The first returned row is always the child's own row.
        child_row = child_features[0]
        total_descendants += 1 + child_row["total_descendants"]
        total_text_nodes += child_row["total_text_nodes"]
        total_text_length += child_row["total_text_length"]

        # Update the previous sibling tag for the next child
        prev_child_tag = child.get("tag", "")

    feature["total_descendants"] = total_descendants
    feature["total_text_nodes"] = total_text_nodes
    feature["total_text_length"] = total_text_length

    return features, text_nodes, svg_image_nodes


In [None]:
# Input directory holding the JSON files, and the CSV the rows are written to.
data_folder = "../json_data"
output_csv_file = "figma_dataset.csv"

# Columns that get rescaled separately for every JSON file.
# "text_length", "sibling_count" and "num_children" are intentionally left out.
normalize_columns = ["area", "word_count", "font_size", "height", "width", "depth"]

# Add the necessary import if not already present
import os
import json
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean

# Rows are appended to the CSV one JSON file at a time, so the header must be
# written exactly once: this flag stays True until the first batch is saved.
first_batch = True

# Discard the CSV left over from a previous run; appending to it would mix
# old rows with new ones.
if os.path.exists(output_csv_file):
    os.remove(output_csv_file)

# Centre-to-centre Euclidean distance between two boxes
def calculate_distance(node1, node2):
    """Return the Euclidean distance between the centres of two boxes.

    Each argument is indexed by ``'x'``, ``'y'``, ``'width'`` and ``'height'``;
    the centre is the top-left corner plus half the size. When any centre
    coordinate is NaN or infinite the result is ``float('inf')``.
    """
    (cx1, cy1), (cx2, cy2) = [
        (box['x'] + box['width'] / 2, box['y'] + box['height'] / 2)
        for box in (node1, node2)
    ]

    # A single finiteness check rejects both NaN and +/-inf coordinates.
    if not np.isfinite([cx1, cy1, cx2, cy2]).all():
        return float('inf')
    return euclidean((cx1, cy1), (cx2, cy2))

def _find_nearest(node_box, candidates):
    """Return ``(distance, candidate)`` for the candidate closest to ``node_box``.

    Distances come from ``calculate_distance``. Ties keep the earliest
    candidate. When there are no candidates, or none at a finite distance,
    the result is ``(float('inf'), None)``.
    """
    best_distance, best_candidate = float('inf'), None
    for candidate in candidates:
        distance = calculate_distance(node_box, candidate)
        if distance < best_distance:
            best_distance, best_candidate = distance, candidate
    return best_distance, best_candidate


# Iterate over all JSON files in the data folder. os.listdir() returns entries
# in arbitrary order, so they are sorted to make the CSV row order reproducible.
for filename in sorted(os.listdir(data_folder)):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(data_folder, filename)
    print(f"Processing {file_path}...")

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Extract features using the recursive function starting at the root
    features_list, child_text_nodes, child_svg_image_nodes = extract_features(
        data, depth=0, parent_tag=None, sibling_count=0, parent_tag_html=None
    )
    if not features_list:
        continue  # Skip if no features extracted

    df = pd.DataFrame(features_list)

    # Shift positions per JSON file to avoid cross-file influence
    min_x = df['x'].min() if df['x'].notnull().any() else 0
    min_y = df['y'].min() if df['y'].notnull().any() else 0
    df['x_normalized'] = df['x'] - min_x
    df['y_normalized'] = df['y'] - min_y

    df['x_center'] = df['x'] + df['width'] / 2
    df['y_center'] = df['y'] + df['height'] / 2

    # Page dimensions: the first BODY row if there is one, otherwise the
    # furthest right/bottom edge of any node.
    body_node = df[df['tag'] == 'BODY']
    if not body_node.empty:
        total_width = body_node.iloc[0]['width']
        total_height = body_node.iloc[0]['height']
    else:
        total_width = (df['x'] + df['width']).max()
        total_height = (df['y'] + df['height']).max()

    # Avoid division by zero
    if total_width and total_height:
        df['x_quarter'] = df['x_center'] / total_width
        df['y_quarter'] = df['y_center'] / total_height
    else:
        df['x_quarter'] = None
        df['y_quarter'] = None

    df['aspect_ratio'] = df.apply(
        lambda row: row['width'] / row['height'] if row['height'] and row['height'] != 0 else None, axis=1
    )
    df['area'] = df['width'] * df['height']
    df['normalized_width'] = df['width'] / total_width if total_width else None
    df['normalized_height'] = df['height'] / total_height if total_height else None

    # Nearest-neighbour features, filled in row by row below.
    df['nearest_text_distance'] = None
    df['nearest_text_content'] = None
    df['nearest_image_size'] = None
    df['nearest_image_distance'] = None

    # An image only contributes its size when it is within 20% of the page width.
    threshold_distance = total_width * 0.2

    for idx, node in df.iterrows():
        node_data = {
            'x': node['x'],
            'y': node['y'],
            'width': node['width'],
            'height': node['height']
        }

        # 1. Nearest text node: its distance and its characters (-1 / "" if none)
        min_text_distance, nearest_text = _find_nearest(node_data, child_text_nodes)
        if nearest_text is not None:
            df.at[idx, 'nearest_text_distance'] = min_text_distance
            df.at[idx, 'nearest_text_content'] = nearest_text.get('characters', '')
        else:
            df.at[idx, 'nearest_text_distance'] = -1  # No text nodes found
            df.at[idx, 'nearest_text_content'] = ""

        # 2. Nearest image/svg: its distance, and its area if within the threshold
        min_image_distance, nearest_image = _find_nearest(node_data, child_svg_image_nodes)
        if nearest_image is not None:
            df.at[idx, 'nearest_image_distance'] = min_image_distance
            if min_image_distance <= threshold_distance:
                df.at[idx, 'nearest_image_size'] = nearest_image.get('width', 0) * nearest_image.get('height', 0)
            else:
                df.at[idx, 'nearest_image_size'] = 0
        else:
            df.at[idx, 'nearest_image_distance'] = -1  # No image nodes found
            df.at[idx, 'nearest_image_size'] = 0

    # Per-file scaling of the configured columns. This divides by the column
    # maximum (it is not a min-max rescale); a column whose values are all
    # equal, or whose maximum is 0, is set to 0. Any nearest_* column added to
    # normalize_columns is scaled by this same loop.
    min_max_values = {col: (df[col].min(), df[col].max()) for col in normalize_columns}
    for col in normalize_columns:
        min_val, max_val = min_max_values[col]
        if max_val > min_val and max_val != 0:  # Avoid division by zero
            df[col] = df[col] / max_val
        else:
            df[col] = 0

    # Remove columns that aren't needed in the dataset. Note that this list
    # includes 'font_size', 'word_count', 'height' and 'area', which were
    # scaled just above.
    columns_to_drop = [
        'x', 'y', 'x_normalized', 'y_normalized', 'x_center', 'y_center', 
        'characters', 'font_size', 'font_name', 'color_r', 'color_g', 'color_b',
        'background_r', 'background_g', 'background_b', 'border_radius',
        'border_r', 'border_g', 'border_b', 'border_opacity', 'border_weight',
        'shadow_r', 'shadow_g', 'shadow_b', 'shadow_radius', 'word_count',
        'normalized_width', 'normalized_height', 'contains_special_chars',
        'contains_number', 'has_shadow', 'has_border', 'has_text_color',
        'height', 'has_text', 'x_quarter', 'y_quarter', 'area', 'has_font_size'
    ]

    # errors="ignore" skips names that are not present in the frame
    df = df.drop(columns=columns_to_drop, errors="ignore")

    # Append this file's rows; the header is written for the first batch only
    df.to_csv(output_csv_file, mode='a', header=first_batch, index=False)
    first_batch = False

print(f"Extracted features from all JSON files have been saved to {output_csv_file}")

Processing ../json_data\figmaTree_1.json...
Processing ../json_data\figmaTree_10.json...
Processing ../json_data\figmaTree_100.json...
Processing ../json_data\figmaTree_101.json...
Processing ../json_data\figmaTree_102.json...
Processing ../json_data\figmaTree_11.json...
Processing ../json_data\figmaTree_12.json...
Processing ../json_data\figmaTree_13.json...
Processing ../json_data\figmaTree_14.json...
Processing ../json_data\figmaTree_16.json...
Processing ../json_data\figmaTree_18.json...
Processing ../json_data\figmaTree_19.json...
Processing ../json_data\figmaTree_2.json...
Processing ../json_data\figmaTree_20.json...
Processing ../json_data\figmaTree_21.json...
Processing ../json_data\figmaTree_22.json...
Processing ../json_data\figmaTree_23.json...
Processing ../json_data\figmaTree_24.json...
Processing ../json_data\figmaTree_25.json...
Processing ../json_data\figmaTree_26.json...
Processing ../json_data\figmaTree_27.json...
Processing ../json_data\figmaTree_28.json...
Processin

: 