In [1]:
import os
import json
import pandas as pd

In [2]:
def _rgb_string(color):
    """Format a color mapping with fractional ``r``/``g``/``b`` channels as ``"rgb(r, g, b)"``.

    Each channel is scaled by 255 and rounded to the nearest integer.
    Returns None when ``color`` is not a mapping (missing key or JSON null),
    so callers can leave the corresponding feature unset.
    """
    if not isinstance(color, dict):
        return None
    # round() rather than int(): truncation turns e.g. 75.99 into 75, which is
    # an off-by-one whenever the stored float sits just below the intended
    # 8-bit value.
    r, g, b = (round((color.get(channel, 0) or 0) * 255) for channel in ("r", "g", "b"))
    return f"rgb({r}, {g}, {b})"


def extract_features(node, depth=0, parent_tag=None, sibling_count=0):
    """Flatten a node tree into a list of per-node feature dictionaries.

    Each tree node is a dict read through three keys: ``"tag"`` (label),
    ``"node"`` (dict of raw node properties) and ``"children"`` (list of
    child nodes in the same format). Keys that are absent or explicitly
    ``None`` are treated as empty.

    Args:
        node: Tree node to process.
        depth: Depth of ``node`` in the tree; the root call uses 0.
        parent_tag: ``"tag"`` of the parent node, or None for the root.
        sibling_count: Number of other children under the same parent.

    Returns:
        A list of feature dicts in pre-order: the entry for ``node`` comes
        first, followed by the entries for each child subtree.
    """
    features = []

    tag = node.get("tag", "")
    # `or {}` / `or []` (not a .get default) so an explicit JSON null does not crash.
    node_data = node.get("node") or {}

    # Text features
    text = node_data.get("characters", "") or ""
    text_length = len(text)
    word_count = len(text.split())
    contains_number = any(ch.isdigit() for ch in text)
    contains_special_chars = any(not ch.isalnum() and not ch.isspace() for ch in text)
    text_type = "long" if word_count > 10 else "short"

    # Hierarchy features
    children = node.get("children") or []
    num_children = len(children)
    is_leaf = (num_children == 0)

    # Only read the style when fontName is actually a mapping; any other
    # value (missing, null, a placeholder string) yields None.
    font_name = node_data.get("fontName")
    font_weight = font_name.get("style", None) if isinstance(font_name, dict) else None

    feature = {
        "tag": tag,
        "type": node_data.get("type", ""),
        "x": node_data.get("x", None),
        "y": node_data.get("y", None),
        "width": node_data.get("width", None),
        "height": node_data.get("height", None),
        "characters": text,
        # Hierarchy features
        "depth": depth,
        "num_children": num_children,
        "parent_tag": parent_tag,
        "sibling_count": sibling_count,
        "is_leaf": is_leaf,
        # Style features
        "font_size": node_data.get("fontSize", None),
        "font_weight": font_weight,
        "color": None,
        "background_color": None,
        "border_radius": None,
        # NOTE(review): always "visible"; nothing in the node data is consulted,
        # so this column is constant. Confirm whether the export carries a
        # visibility flag that should be read here.
        "visibility": "visible",
        "border_type": None,
        "border_color": None,
        "border_opacity": None,
        "border_weight": node_data.get("strokeWeight", None),
        "border_pattern": node_data.get("dashPattern", []),
        "shadow_type": None,
        "shadow_color": None,
        "shadow_offset": None,
        "shadow_radius": None,
        # Text features
        "text_length": text_length,
        "word_count": word_count,
        "contains_number": contains_number,
        "contains_special_chars": contains_special_chars,
        "text_type": text_type
    }

    # The first usable SOLID fill supplies the color.
    # NOTE(review): the same value is written to both "color" and
    # "background_color"; confirm whether these should differ by node type.
    for fill in node_data.get("fills") or []:
        if not isinstance(fill, dict) or fill.get("type") != "SOLID":
            continue
        rgb_str = _rgb_string(fill.get("color"))
        if rgb_str is not None:
            feature["color"] = rgb_str
            feature["background_color"] = rgb_str
            break

    # Border information comes from the first stroke only.
    strokes = node_data.get("strokes") or []
    if strokes and isinstance(strokes[0], dict):
        stroke = strokes[0]
        feature["border_type"] = stroke.get("type", None)
        feature["border_color"] = _rgb_string(stroke.get("color"))
        feature["border_opacity"] = stroke.get("opacity", None)

    # border_radius is the mean of the four corner radii, and is only set
    # when all four are present.
    corner_radii = [
        node_data.get("topLeftRadius", None),
        node_data.get("topRightRadius", None),
        node_data.get("bottomLeftRadius", None),
        node_data.get("bottomRightRadius", None),
    ]
    if all(radius is not None for radius in corner_radii):
        feature["border_radius"] = sum(corner_radii) / 4

    # Shadow properties come from the first DROP_SHADOW effect.
    for effect in node_data.get("effects") or []:
        if isinstance(effect, dict) and effect.get("type") == "DROP_SHADOW":
            feature["shadow_type"] = "DROP_SHADOW"
            feature["shadow_color"] = _rgb_string(effect.get("color"))
            feature["shadow_offset"] = effect.get("offset", None)
            feature["shadow_radius"] = effect.get("radius", None)
            break

    features.append(feature)

    # Recurse into children; each child has (num_children - 1) siblings.
    for child in children:
        features.extend(
            extract_features(child, depth=depth + 1, parent_tag=tag, sibling_count=num_children - 1)
        )

    return features

In [3]:
data_folder = "json_data"
output_csv_file = "features_data.csv"

# Per-file frames are collected here and concatenated once at the end;
# growing a DataFrame with pd.concat inside the loop copies every
# accumulated row on each iteration.
frames = []

# sorted() makes the row order of the output CSV reproducible; os.listdir
# returns entries in arbitrary order.
for filename in sorted(os.listdir(data_folder)):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(data_folder, filename)
    print(f"Processing {file_path}...")

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Extract features using the recursive function starting at the root
    features_list = extract_features(data, depth=0, parent_tag=None, sibling_count=0)

    df = pd.DataFrame(features_list)

    # Geometry values may all be None for a file, which leaves the column
    # with object dtype and makes the arithmetic below raise TypeError.
    # Coerce to float so missing values become NaN and propagate instead.
    for column in ("x", "y", "width", "height"):
        df[column] = pd.to_numeric(df[column], errors="coerce")

    # Shift coordinates so the top-left-most node of the file sits at (0, 0).
    min_x = df['x'].min() if df['x'].notnull().any() else 0
    min_y = df['y'].min() if df['y'].notnull().any() else 0

    df['x_normalized'] = df['x'] - min_x
    df['y_normalized'] = df['y'] - min_y

    df['x_center'] = df['x'] + df['width'] / 2
    df['y_center'] = df['y'] + df['height'] / 2

    # Page size: the first BODY row if there is one, otherwise the extent
    # spanned by all nodes.
    body_node = df[df['tag'] == 'BODY']
    if not body_node.empty:
        total_width = body_node.iloc[0]['width']
        total_height = body_node.iloc[0]['height']
    else:
        total_width = (df['x'] + df['width']).max()
        total_height = (df['y'] + df['height']).max()

    # A zero, negative or missing page size would turn every ratio below
    # into inf or raise; use NaN so the affected ratios are simply missing.
    if not total_width > 0:
        total_width = float("nan")
    if not total_height > 0:
        total_height = float("nan")

    df['x_quarter'] = df['x_center'] / total_width
    df['y_quarter'] = df['y_center'] / total_height

    # Zero heights are masked to NaN so the ratio is missing rather than inf.
    df['aspect_ratio'] = df['width'] / df['height'].where(df['height'] != 0)
    df['area'] = df['width'] * df['height']
    df['normalized_width'] = df['width'] / total_width
    df['normalized_height'] = df['height'] / total_height

    frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder contains no JSON files.
df_all = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

df_all.to_csv(output_csv_file, index=False)
print(f"Extracted features from all JSON files have been saved to {output_csv_file}")


Processing json_data\figmaTree_1.json...
Extracted features from all JSON files have been saved to features_data.csv
