In [7]:
import os
import json
import pandas as pd

In [8]:
def _to_rgb255(color):
    """Convert a colour dict with 0-1 float channels into an (r, g, b) tuple of 0-255 ints.

    Missing channels count as 0.
    """
    # round() rather than int(): a channel such as 0.9372549057006836 * 255
    # evaluates to 238.99999995, which int() would truncate to 238 instead of 239.
    return tuple(round(color.get(channel, 0) * 255) for channel in ("r", "g", "b"))


def extract_features(node, depth=0, parent_tag=None, sibling_count=0):
    """Flatten a node tree into a list of per-node feature dicts.

    Args:
        node: dict with optional keys "tag", "node" (a dict of node
            properties such as "type", "x", "characters", "fills", ...) and
            "children" (a list of dicts of the same shape).
        depth: nesting level of ``node``; the root call uses 0.
        parent_tag: value stored in the "parent_tag" column for ``node``
            (stored as "" when falsy).
        sibling_count: number of other children sharing this node's parent.

    Returns:
        A list of feature dicts in pre-order: ``node`` first, then each
        child's subtree in the order the children are listed.
    """
    features = []

    tag = node.get("tag", "")
    # `or` fallbacks: a key that is present but null/None is treated as absent.
    node_data = node.get("node") or {}
    node_type = str(node_data.get("type", ""))

    text = node_data.get("characters") or ""
    text_length = len(text)
    word_count = len(text.split()) if text else 0
    contains_number = any(ch.isdigit() for ch in text)
    contains_special_chars = any(not ch.isalnum() and not ch.isspace() for ch in text)

    children = node.get("children") or []
    num_children = len(children)
    is_leaf = 1 if num_children == 0 else 0

    # Only a non-empty dict carries a usable "style"; anything else falls
    # back to "normal" instead of raising on .get().
    font_name_info = node_data.get("fontName")
    if isinstance(font_name_info, dict) and font_name_info:
        font_name = font_name_info.get("style", "")
    else:
        font_name = "normal"

    feature = {
        "tag": tag,
        "type": node_type,
        "x": node_data.get("x", 0),
        "y": node_data.get("y", 0),
        "width": node_data.get("width", 0),
        "height": node_data.get("height", 0),
        "characters": text,
        "has_text": int(bool(text)),
        "depth": depth,
        "num_children": num_children,
        "parent_tag": parent_tag if parent_tag else "",
        "sibling_count": sibling_count,
        "is_leaf": is_leaf,
        "font_size": node_data.get("fontSize", 16),
        "has_font_size": int("fontSize" in node_data),
        "font_name": font_name,
        "has_text_color": 0, "color_r": 0, "color_g": 0, "color_b": 0,
        "has_background_color": 0, "background_r": 0, "background_g": 0, "background_b": 0,
        "border_radius": 0,
        "border_r": 0, "border_g": 0, "border_b": 0,
        "has_border": 0, "border_opacity": 0,
        "border_weight": node_data.get("strokeWeight", 0),
        "has_shadow": 0, "shadow_r": 0, "shadow_g": 0, "shadow_b": 0,
        "shadow_radius": 0,
        "text_length": text_length,
        "word_count": word_count,
        "contains_number": int(contains_number),
        "contains_special_chars": int(contains_special_chars),
    }

    # Fills: the first SOLID fill with a colour populates BOTH the text-colour
    # and the background-colour features.
    # NOTE(review): the two feature groups are therefore always identical --
    # confirm whether they should be split by node type.
    for fill in node_data.get("fills") or []:
        if fill.get("type") == "SOLID" and "color" in fill:
            r, g, b = _to_rgb255(fill["color"])
            feature["color_r"], feature["color_g"], feature["color_b"] = r, g, b
            feature["has_text_color"] = 1
            feature["background_r"], feature["background_g"], feature["background_b"] = r, g, b
            feature["has_background_color"] = 1
            break

    # Strokes (borders): only the first stroke is inspected.
    strokes = node_data.get("strokes") or []
    if strokes:
        stroke = strokes[0]
        feature["has_border"] = 1
        if "color" in stroke:
            feature["border_r"], feature["border_g"], feature["border_b"] = _to_rgb255(stroke["color"])
        feature["border_opacity"] = stroke.get("opacity", 0)

    # Border radius: mean of the four corner radii (stays 0 when all are 0/absent).
    corner_radii = [
        node_data.get("topLeftRadius", 0),
        node_data.get("topRightRadius", 0),
        node_data.get("bottomLeftRadius", 0),
        node_data.get("bottomRightRadius", 0),
    ]
    if any(corner_radii):
        feature["border_radius"] = sum(corner_radii) / 4

    # Shadow: the first DROP_SHADOW effect wins.
    for effect in node_data.get("effects") or []:
        if effect.get("type") == "DROP_SHADOW":
            feature["has_shadow"] = 1
            if "color" in effect:
                feature["shadow_r"], feature["shadow_g"], feature["shadow_b"] = _to_rgb255(effect["color"])
            feature["shadow_radius"] = effect.get("radius", 0)
            break

    features.append(feature)

    # NOTE(review): children receive this node's *type* (not its "tag") as
    # their parent_tag -- kept as in the original; confirm this is intended.
    for child in children:
        features.extend(
            extract_features(
                child,
                depth=depth + 1,
                parent_tag=node_type,
                sibling_count=num_children - 1,
            )
        )

    return features


In [9]:
# Folder containing JSON files
data_folder = "test_json"
output_csv_file = "figma_dataset.csv"

# Columns rescaled to [0, 1] with a per-file min-max normalization
normalize_columns = [
    "area",
    "word_count",
    "text_length",
    "font_size",
    "sibling_count",
    "num_children",
    "height",
    "width",
]

# If the output CSV exists, remove it so we start fresh
if os.path.exists(output_csv_file):
    os.remove(output_csv_file)

# Flag to write header only for the first batch
first_batch = True

# os.listdir() returns entries in arbitrary order; sorting keeps the row
# order of the output CSV reproducible between runs and machines.
for filename in sorted(os.listdir(data_folder)):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(data_folder, filename)
    print(f"Processing {file_path}...")

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Extract features using the recursive function starting at the root
    features_list = extract_features(data, depth=0, parent_tag=None, sibling_count=0)
    if not features_list:
        continue  # Skip if no features extracted

    df = pd.DataFrame(features_list)

    # Element centres; used only to derive the relative-position features
    # below, so they are kept out of the frame.
    # NOTE(review): centres are taken from the raw x / y values, not from
    # offsets relative to the per-file minimum -- confirm this is intended.
    x_center = df['x'] + df['width'] / 2
    y_center = df['y'] + df['height'] / 2

    # Overall dimensions: the first row tagged BODY if there is one,
    # otherwise the furthest right / bottom edge of any element.
    body_rows = df[df['tag'] == 'BODY']
    if not body_rows.empty:
        total_width = body_rows.iloc[0]['width']
        total_height = body_rows.iloc[0]['height']
    else:
        total_width = (df['x'] + df['width']).max()
        total_height = (df['y'] + df['height']).max()

    # Relative centre position; left empty when either total is zero
    # (avoids division by zero).
    if total_width and total_height:
        df['x_quarter'] = x_center / total_width
        df['y_quarter'] = y_center / total_height
    else:
        df['x_quarter'] = None
        df['y_quarter'] = None

    # Zero heights are masked to NaN first, so those rows get an empty
    # aspect ratio instead of a division by zero.
    df['aspect_ratio'] = df['width'] / df['height'].where(df['height'] != 0)
    df['area'] = df['width'] * df['height']
    df['normalized_width'] = df['width'] / total_width if total_width else None
    df['normalized_height'] = df['height'] / total_height if total_height else None

    # Raw coordinates are not part of the exported dataset
    df = df.drop(columns=['x', 'y'])

    # Compute min and max for each column
    min_max_values = {col: (df[col].min(), df[col].max()) for col in normalize_columns}

    # Apply Min-Max normalization (scaling between 0 and 1)
    for col in normalize_columns:
        min_val, max_val = min_max_values[col]
        if max_val > min_val:  # Avoid division by zero
            df[col] = (df[col] - min_val) / (max_val - min_val)
        else:
            df[col] = 0  # If min and max are the same, set to 0

    # Append this batch to the CSV file
    df.to_csv(output_csv_file, mode='a', header=first_batch, index=False)
    first_batch = False

print(f"Extracted features from all JSON files have been saved to {output_csv_file}")

Processing test_json\converted_output.json...
Extracted features from all JSON files have been saved to figma_dataset.csv
