In [129]:
import os
import json
import pandas as pd

In [None]:
def _rgb255(color):
    """Convert a colour dict with 0-1 float channels into an (r, g, b) tuple of 0-255 ints.

    Missing or null channels count as 0. Values are truncated with int(),
    not rounded.
    """
    return tuple(int((color.get(channel) or 0) * 255) for channel in ("r", "g", "b"))


def _list_or_empty(value):
    """Return value when it is a list, otherwise an empty list (covers null / non-list JSON)."""
    return value if isinstance(value, list) else []


def extract_features(node, depth=0, parent_tag=None, sibling_count=0):
    """Flatten a tagged design tree into a list of per-node feature dicts.

    The tree is walked depth-first (pre-order): the entry for `node` comes
    first, followed by the entries of each child subtree in order.

    Args:
        node: dict with optional keys "tag", "node" (a dict of design
            properties such as "type", "x", "characters", "fills", ...)
            and "children" (a list of nodes of the same shape).
        depth: depth of `node` in the tree; the root is 0.
        parent_tag: value of "tag" on the parent node, or None for the root.
        sibling_count: number of other children that share this node's parent.

    Returns:
        A list of dicts, one per visited node, all with the same keys in the
        same order. Properties that are absent are reported as the string
        "NONE" (colour channels default to 0).
    """
    features = []

    # A JSON null (or any non-dict) under "node" is treated as "no properties".
    node_data = node.get("node")
    if not isinstance(node_data, dict):
        node_data = {}

    tag = node.get("tag", "NONE")
    node_type = str(node_data.get("type", "NONE"))

    # Track text presence separately from the "NONE" placeholder so that a
    # node whose real text is literally "NONE" still gets text statistics,
    # and so that a null "characters" value does not crash len().
    raw_text = node_data.get("characters")
    has_text = isinstance(raw_text, str)
    text = raw_text if has_text else "NONE"
    if has_text:
        text_length = len(text)
        word_count = len(text.split())
        contains_number = any(ch.isdigit() for ch in text)
        contains_special_chars = any(not ch.isalnum() and not ch.isspace() for ch in text)
    else:
        text_length = 0
        word_count = 0
        contains_number = False
        contains_special_chars = False

    children = _list_or_empty(node.get("children"))
    num_children = len(children)

    font_name = node_data.get("fontName")
    font_weight = font_name.get("style", "NONE") if isinstance(font_name, dict) else "NONE"

    # Key order matters: it becomes the column order of any DataFrame built
    # from the returned list.
    feature = {
        "tag": tag,
        "type": node_type,
        "x": node_data.get("x", "NONE"),
        "y": node_data.get("y", "NONE"),
        "width": node_data.get("width", "NONE"),
        "height": node_data.get("height", "NONE"),
        "characters": text,
        "depth": depth,
        "num_children": num_children,
        "parent_tag": parent_tag if parent_tag is not None else "NONE",
        "sibling_count": sibling_count,
        "is_leaf": 1 if num_children == 0 else 0,
        "font_size": node_data.get("fontSize", "NONE"),
        "font_weight": font_weight,
        "color_r": 0, "color_g": 0, "color_b": 0,
        "background_r": 0, "background_g": 0, "background_b": 0,
        "border_radius": "NONE",
        "border_type": "NONE",
        "border_r": 0, "border_g": 0, "border_b": 0,
        "border_opacity": "NONE",
        "border_weight": node_data.get("strokeWeight", "NONE"),
        "shadow_type": "NONE",
        "shadow_r": 0, "shadow_g": 0, "shadow_b": 0,
        "shadow_offset": "NONE",
        "shadow_radius": "NONE",
        "text_length": text_length,
        "word_count": word_count,
        "contains_number": 1 if contains_number else 0,
        "contains_special_chars": 1 if contains_special_chars else 0,
    }

    # The first solid fill supplies both the colour and the background channels.
    for fill in _list_or_empty(node_data.get("fills")):
        if not isinstance(fill, dict):
            continue
        if fill.get("type") == "SOLID" and isinstance(fill.get("color"), dict):
            red, green, blue = _rgb255(fill["color"])
            feature["color_r"], feature["color_g"], feature["color_b"] = red, green, blue
            feature["background_r"], feature["background_g"], feature["background_b"] = red, green, blue
            break

    # Only the first stroke is described.
    strokes = _list_or_empty(node_data.get("strokes"))
    if strokes and isinstance(strokes[0], dict):
        stroke = strokes[0]
        feature["border_type"] = stroke.get("type", "NONE")
        if isinstance(stroke.get("color"), dict):
            feature["border_r"], feature["border_g"], feature["border_b"] = _rgb255(stroke["color"])
        feature["border_opacity"] = stroke.get("opacity", "NONE")

    # The border radius is the mean of the four corner radii, reported only
    # when all four are present and numeric.
    corner_radii = [
        node_data.get(key)
        for key in ("topLeftRadius", "topRightRadius", "bottomLeftRadius", "bottomRightRadius")
    ]
    if all(isinstance(radius, (int, float)) for radius in corner_radii):
        feature["border_radius"] = sum(corner_radii) / 4

    # Only the first drop shadow is described; "offset" is copied as-is.
    for effect in _list_or_empty(node_data.get("effects")):
        if isinstance(effect, dict) and effect.get("type") == "DROP_SHADOW":
            feature["shadow_type"] = "DROP_SHADOW"
            if isinstance(effect.get("color"), dict):
                feature["shadow_r"], feature["shadow_g"], feature["shadow_b"] = _rgb255(effect["color"])
            feature["shadow_offset"] = effect.get("offset", "NONE")
            feature["shadow_radius"] = effect.get("radius", "NONE")
            break

    features.append(feature)

    for child in children:
        # Pass this node's *tag* down: the "parent_tag" feature is meant to
        # hold the parent's tag, not the parent's node type.
        features.extend(
            extract_features(
                child,
                depth=depth + 1,
                parent_tag=tag,
                sibling_count=num_children - 1,
            )
        )

    return features


In [131]:
# Folder containing JSON files and the CSV that collects every file's features
data_folder = "json_data"
output_csv_file = "features_data.csv"

# extract_features reports a missing geometry value as the string "NONE".
# These columns take part in arithmetic below, so they are coerced to numbers
# first (placeholders become NaN); otherwise a single missing value would
# raise TypeError for the whole file.
geometry_columns = ["x", "y", "width", "height"]

# If the output CSV exists, remove it so we start fresh
if os.path.exists(output_csv_file):
    os.remove(output_csv_file)

# Flag to write the header only for the first batch
first_batch = True

# Sorted so the row order of the CSV is reproducible across runs and
# platforms (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(data_folder)):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(data_folder, filename)
    print(f"Processing {file_path}...")

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Extract features using the recursive function starting at the root
    features_list = extract_features(data, depth=0, parent_tag=None, sibling_count=0)
    if not features_list:
        continue  # Skip if no features extracted

    df = pd.DataFrame(features_list)
    for column in geometry_columns:
        df[column] = pd.to_numeric(df[column], errors="coerce")

    # NOTE(review): centres use the file's absolute x/y. Earlier code also
    # computed min-shifted x_normalized/y_normalized columns but dropped them
    # unused; confirm whether the centres were meant to use those instead.
    x_center = df["x"] + df["width"] / 2
    y_center = df["y"] + df["height"] / 2

    # Total dimensions come from the first BODY row when there is one,
    # otherwise from the furthest right/bottom edge of any node.
    body_node = df[df["tag"] == "BODY"]
    if not body_node.empty:
        total_width = body_node.iloc[0]["width"]
        total_height = body_node.iloc[0]["height"]
    else:
        total_width = (df["x"] + df["width"]).max()
        total_height = (df["y"] + df["height"]).max()

    # Avoid division by zero or by a missing (NaN) extent
    width_usable = bool(pd.notna(total_width) and total_width != 0)
    height_usable = bool(pd.notna(total_height) and total_height != 0)

    if width_usable and height_usable:
        df["x_quarter"] = x_center / total_width
        df["y_quarter"] = y_center / total_height
    else:
        df["x_quarter"] = None
        df["y_quarter"] = None

    # Zero heights are masked to NaN so the ratio is empty instead of inf.
    df["aspect_ratio"] = df["width"] / df["height"].where(df["height"] != 0)
    df["area"] = df["width"] * df["height"]
    df["normalized_width"] = df["width"] / total_width if width_usable else None
    df["normalized_height"] = df["height"] / total_height if height_usable else None

    # Raw positions are only inputs to the derived columns; they are not exported.
    df = df.drop(columns=["x", "y"])

    # Append this batch to the CSV file
    df.to_csv(output_csv_file, mode="a", header=first_batch, index=False)
    first_batch = False

print(f"Extracted features from all JSON files have been saved to {output_csv_file}")

Processing json_data\figmaTree_4.json...
Extracted features from all JSON files have been saved to features_data.csv
