In [92]:
import os
import json
import csv
import pandas as pd
import ast  # To safely evaluate the string into a list of tuples


In [93]:
def normalize_percentages(position_array):
    """
    Rescale percentage tuples so that the largest value becomes 100.

    The largest value found in any tuple is used as the reference. When it is
    at least 100, the first two items of every tuple are multiplied by
    ``100 / max`` and rounded to two decimals. When it is below 100 the array
    is returned unchanged. A maximum of exactly 0 is treated as 100, so the
    values are only rounded.
    
    Args:
    position_array (list or str): List of tuples or string representation of list
    
    Returns:
    list: Normalized array of percentage tuples. Empty input, and string
    input that cannot be parsed into a list/tuple literal, is returned as is.
    """
    # If the array is empty, return it as is
    if not position_array:
        return position_array
    
    # Convert string representation to list if needed
    if isinstance(position_array, str):
        try:
            # literal_eval only accepts Python literals, so a string coming
            # from a file can never execute code here (eval could).
            parsed = ast.literal_eval(position_array)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return position_array
        # Anything that is not a sequence literal counts as unparseable.
        if not isinstance(parsed, (list, tuple)):
            return position_array
        position_array = parsed
        # A string such as "[]" is truthy but parses to an empty list;
        # max() below would raise on it.
        if not position_array:
            return position_array
    
    # Find the max value across all tuples
    max_value = max(max(tuple_item) for tuple_item in position_array)
    if max_value == 0:
        max_value = 100

    if max_value >= 100:
        scaling_factor = 100 / max_value

        # Scale each tuple so the overall maximum lands on 100
        position_array = [
            (round(item[0] * scaling_factor, 2), round(item[1] * scaling_factor, 2))
            for item in position_array
        ]
    
    return position_array

In [94]:
def extract_node_details(node):
    """
    Collect a node's tag, geometry, flex direction and child spans.

    Args:
    node (dict): A JSON node to process. Geometry and fills are read from
    its 'node' sub-dict, children from its 'children' list.
    Returns:
    dict: 'tag', 'x', 'y', 'width', 'height' and 'flex_direction', plus
    'x_positions_percent' / 'y_positions_percent' holding the string form of
    the children's (start, end) spans expressed as percentages of this
    node's width / height.
    """
    info = node.get('node', {})

    def span_percent(offset, extent, parent_extent):
        # (start, end) of one child span as a percentage of the parent size.
        start = (offset / parent_extent) * 100
        end = ((offset + extent) / parent_extent) * 100
        return (round(start, 2), round(end, 2))

    # Flex direction is read from the first fill entry; 'row' when absent.
    fills = info.get('fills', [])
    if fills:
        direction = fills[0].get('flexDirection', 'row')
    else:
        direction = 'row'

    details = {
        'tag': node.get('tag', 'Unknown'),
        'x': info.get('x', 'N/A'),
        'y': info.get('y', 'N/A'),
        'width': info.get('width', 'N/A'),
        'height': info.get('height', 'N/A'),
        'flex_direction': direction
    }

    x_spans = []
    y_spans = []
    children = node.get('children')
    if children:
        # Parent size defaults to 1, and a size of 0 is replaced by 1, so the
        # divisions below never divide by zero.
        parent_w = info.get('width', 1)
        parent_h = info.get('height', 1)
        parent_w = 1 if parent_w == 0 else parent_w
        parent_h = 1 if parent_h == 0 else parent_h

        for child in children:
            box = child.get('node', {})
            x_spans.append(span_percent(box.get('x', 0), box.get('width', 0), parent_w))
            y_spans.append(span_percent(box.get('y', 0), box.get('height', 0), parent_h))

        # Cap the spans via normalize_percentages, one axis at a time.
        if x_spans:
            x_spans = normalize_percentages(x_spans)
        if y_spans:
            y_spans = normalize_percentages(y_spans)

    # Spans are stored as strings.
    details['x_positions_percent'] = str(x_spans)
    details['y_positions_percent'] = str(y_spans)
    return details

In [95]:
def parse_and_validate_positions(positions):
    """
    Parse a positions value and return it sorted, or None when it is invalid.

    Args:
    positions (list or str): List of 2-tuples or its string representation

    Returns:
    list or None: The tuples sorted in ascending order when the value is a
    non-empty list whose items are all 2-tuples of non-negative numbers;
    None for anything else, including input that cannot be parsed.
    """
    def is_valid_pair(item):
        # Check the element types first: comparing a str or None with 0
        # would raise TypeError instead of just marking the row invalid.
        return (
            isinstance(item, tuple)
            and len(item) == 2
            and all(isinstance(value, (int, float)) and value >= 0 for value in item)
        )

    try:
        # Convert string to list of tuples
        positions = ast.literal_eval(positions) if isinstance(positions, str) else positions
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None  # Invalid format

    if isinstance(positions, list) and len(positions) > 0 and all(
            is_valid_pair(item) for item in positions):
        return sorted(positions)  # Sort the tuples
    return None  # Invalid list

In [96]:

def _walk_nodes(root):
    """Yield root and every descendant in pre-order (parent before its children)."""
    stack = [root]
    while stack:
        node = stack.pop()
        yield node
        # Push children reversed so they are popped in their original order.
        # A missing or null 'children' entry is treated as no children.
        stack.extend(reversed(node.get('children') or []))


def process_json_files(input_folder, output_csv):
    """
    Process all JSON files in the input folder and extract features.

    Every '.json' file is loaded and its node tree is walked parent-first.
    One row per node is built with extract_node_details. Rows are kept only
    when width >= 1, height >= 1, x >= -1, y >= -1 and both position columns
    pass parse_and_validate_positions. The surviving rows are written to a
    CSV file without the index.
    
    Args:
    input_folder (str): Path to folder containing JSON files
    output_csv (str): Path to output CSV file
    """
    columns = [
        'tag', 'x', 'y', 'width', 'height', 'flex_direction',
        'x_positions_percent', 'y_positions_percent',
    ]

    # List to store results
    results = []
    
    # os.listdir order is arbitrary; sort so the CSV rows are reproducible.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(input_folder, filename)

        # Read JSON file
        with open(file_path, 'r', encoding="utf-8") as f:
            data = json.load(f)

        # Explicit stack instead of recursion, so deep trees cannot hit the
        # interpreter's recursion limit.
        for node in _walk_nodes(data):
            node_details = extract_node_details(node)
            results.append({col: node_details[col] for col in columns})
    
    # Pass the columns explicitly: with no rows the frame would otherwise
    # have no columns and every df[...] lookup below would raise KeyError.
    df = pd.DataFrame(results, columns=columns)

    numeric_columns = ["width", "height", "x", "y"]
    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")  # Convert invalid values to NaN

    # Convert and filter
    df["x_positions_percent"] = df["x_positions_percent"].apply(parse_and_validate_positions)
    df["y_positions_percent"] = df["y_positions_percent"].apply(parse_and_validate_positions)
    
    # Drop invalid rows (NaN compares False, so coerced values drop out too)
    df = df[
        (df["width"] >= 1) &
        (df["height"] >= 1) &
        (df["x"] >= -1) &
        (df["y"] >= -1) &
        (df["x_positions_percent"].notnull()) &
        (df["y_positions_percent"].notnull())
    ]

    df.to_csv(output_csv, index=False)
    print(f"Features extracted and saved to {output_csv}")

In [97]:
# Folder with the JSON files to read (replace with your folder path) and
# the CSV file the extracted features are written to.
input_folder = "../../json_data"
output_csv = "styling_features.csv"
process_json_files(input_folder=input_folder, output_csv=output_csv)

Features extracted and saved to styling_features.csv
