In [None]:
# import json
# import numpy as np
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.cluster import DBSCAN
# from sentence_transformers import SentenceTransformer
# import hashlib
# from typing import Dict, List, Any, Tuple
# from dataclasses import dataclass
# import logging

# # Configure logging
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# @dataclass
# class NodeFeatures:
#     """Data class to hold extracted features from a Figma node"""
#     structural_features: Dict[str, Any]
#     style_features: Dict[str, Any]
#     content_features: Dict[str, Any]
#     semantic_features: List[float]

# class FigmaNodeSimilarityDetector:
#     """
#     A flexible similarity detection system for Figma nodes using multiple ML approaches.
#     Designed to be easily extensible with different similarity algorithms.
#     """
    
#     def __init__(self, similarity_threshold: float = 0.8, use_semantic_embeddings: bool = True):
#         """
#         Initialize the similarity detector
        
#         Args:
#             similarity_threshold: Threshold for considering nodes similar (0-1)
#             use_semantic_embeddings: Whether to use semantic embeddings for text content
#         """
#         self.similarity_threshold = similarity_threshold
#         self.use_semantic_embeddings = use_semantic_embeddings
#         self.node_features = {}
#         self.similarity_matrix = None
#         self.clusters = None
        
#         # Initialize semantic model if needed
#         if use_semantic_embeddings:
#             try:
#                 self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
#                 logger.info("Loaded semantic embedding model")
#             except Exception as e:
#                 logger.warning(f"Could not load semantic model: {e}")
#                 self.use_semantic_embeddings = False
    
#     def extract_node_features(self, node_data: Dict[str, Any], node_path: str = "") -> NodeFeatures:
#         """
#         Extract comprehensive features from a Figma node
        
#         Args:
#             node_data: The node data dictionary
#             node_path: Path to the node in the tree
            
#         Returns:
#             NodeFeatures object containing all extracted features
#         """
#         node_info = node_data.get('node', {})
        
#         # 1. Structural Features
#         structural_features = {
#             'type': node_info.get('type', ''),
#             'tag': node_data.get('tag', ''),
#             'has_children': len(node_data.get('children', [])) > 0,
#             'num_children': len(node_data.get('children', [])),
#             # 'width': node_info.get('width', 0),
#             # 'height': node_info.get('height', 0),
#             # 'x': node_info.get('x', 0),
#             # 'y': node_info.get('y', 0),
#         }
        
#         # 2. Style Features
#         style_features = {
#             'font_family': node_info.get('fontName', {}).get('family', ''),
#             'font_style': node_info.get('fontName', {}).get('style', ''),
#             'font_size': node_info.get('fontSize', 0),
#             'flex_direction': node_info.get('flexDirection', ''),
#             'stroke_weight': node_info.get('StrokeWeight', 0),
#             'border_radius': {
#                 'top_left': node_info.get('topLeftRadius', 0),
#                 'top_right': node_info.get('topRightRadius', 0),
#                 'bottom_left': node_info.get('bottomLeftRadius', 0),
#                 'bottom_right': node_info.get('bottomRightRadius', 0),
#             }
#         }
        
#         # Extract fill colors
#         fills = node_info.get('fills', [])
#         if fills:
#             primary_fill = fills[0]
#             color = primary_fill.get('color', {})
#             style_features.update({
#                 'fill_type': primary_fill.get('type', ''),
#                 'fill_color_r': color.get('r', 0),
#                 'fill_color_g': color.get('g', 0),
#                 'fill_color_b': color.get('b', 0),
#                 'fill_color_a': color.get('a', 1),
#             })
        
#         # Extract stroke colors
#         strokes = node_info.get('strokes', [])
#         if strokes:
#             primary_stroke = strokes[0]
#             stroke_color = primary_stroke.get('color', {})
#             style_features.update({
#                 'stroke_type': primary_stroke.get('type', ''),
#                 'stroke_color_r': stroke_color.get('r', 0),
#                 'stroke_color_g': stroke_color.get('g', 0),
#                 'stroke_color_b': stroke_color.get('b', 0),
#                 'stroke_color_a': stroke_color.get('a', 1),
#             })
        
#         # 3. Content Features
#         content_features = {
#             'text_content': node_info.get('characters', ''),
#             'name': node_data.get('name', ''),
#             'has_text': bool(node_info.get('characters', '')),
#             # 'text_length': len(node_info.get('characters', '')),
#         }
        
#         # 4. Semantic Features (using embeddings)
#         semantic_features = []
#         if self.use_semantic_embeddings:
#             text_for_embedding = f"{content_features['name']} {content_features['text_content']} {structural_features['tag']} {structural_features['type']}"
#             if text_for_embedding.strip():
#                 try:
#                     semantic_features = self.semantic_model.encode([text_for_embedding])[0].tolist()
#                 except:
#                     semantic_features = [0.0] * 384  # Default embedding size
        
#         return NodeFeatures(
#             structural_features=structural_features,
#             style_features=style_features,
#             content_features=content_features,
#             semantic_features=semantic_features
#         )
    
#     def build_similarity_matrix(self, nodes_data: Dict[str, Any]) -> np.ndarray:
#         """
#         Build similarity matrix between all nodes.
#         This method can be easily modified to try different similarity approaches.
        
#         Args:
#             nodes_data: Dictionary containing all node data
            
#         Returns:
#             Similarity matrix as numpy array
#         """
#         # Extract features for all nodes
#         all_nodes = self._flatten_nodes(nodes_data)
#         self.node_features = {}
        
#         for i, (node_path, node_data) in enumerate(all_nodes):
#             features = self.extract_node_features(node_data, node_path)
#             self.node_features[node_path] = features
        
#         # Create feature vectors for similarity calculation
#         feature_vectors = []
#         self.node_paths = list(self.node_features.keys())
        
#         for node_path in self.node_paths:
#             features = self.node_features[node_path]
#             vector = self._create_feature_vector(features)
#             feature_vectors.append(vector)
        
#         feature_vectors = np.array(feature_vectors)
        
#         # Calculate similarity matrix using cosine similarity
#         self.similarity_matrix = cosine_similarity(feature_vectors)
        
#         return self.similarity_matrix
    
#     def _create_feature_vector(self, features: NodeFeatures) -> np.ndarray:
#         """
#         Create a numerical feature vector from NodeFeatures.
#         This method can be modified to emphasize different aspects.
        
#         Args:
#             features: NodeFeatures object
            
#         Returns:
#             Numerical feature vector
#         """
#         vector = []
        
#         # Structural features (normalized)
#         struct = features.structural_features
#         vector.extend([
#             hash(struct['type']) % 1000 / 1000.0,  # Normalize hash
#             hash(struct['tag']) % 1000 / 1000.0,
#             float(struct['has_children']),
#             min(struct['num_children'] / 10.0, 1.0),  # Normalize to 0-1
#             # min(struct['width'] / 1000.0, 1.0),
#             # min(struct['height'] / 1000.0, 1.0),
#         ])
        
#         # Style features
#         style = features.style_features
#         vector.extend([
#             hash(style['font_family']) % 1000 / 1000.0 if style['font_family'] else 0,
#             min(style['font_size'] / 100.0, 1.0) if style['font_size'] else 0,
#             style.get('fill_color_r', 0),
#             style.get('fill_color_g', 0),
#             style.get('fill_color_b', 0),
#             style.get('fill_color_a', 1),
#         ])
        
#         # Content features
#         content = features.content_features
#         vector.extend([
#             float(content['has_text']),
#             # min(content['text_length'] / 100.0, 1.0),
#         ])
        
#         # Semantic features (if available)
#         if features.semantic_features:
#             # Use first 20 dimensions to keep vector manageable
#             vector.extend(features.semantic_features[:384])
#         else:
#             vector.extend([0.0] * 384)
        
#         return np.array(vector)
    
#     def check_similarity(self, threshold: float = None) -> Dict[str, List[str]]:
#         """
#         Check similarity between nodes and return groups of similar nodes.
#         This method can be easily modified to use different clustering approaches.
        
#         Args:
#             threshold: Similarity threshold (uses instance threshold if None)
            
#         Returns:
#             Dictionary mapping group_id to list of similar node paths
#         """
#         if self.similarity_matrix is None:
#             raise ValueError("Must build similarity matrix first")
        
#         threshold = threshold or self.similarity_threshold
        
#         # Method 1: Simple threshold-based grouping
#         similarity_groups = self._threshold_based_grouping(threshold)
        
#         # Method 2: DBSCAN clustering (alternative approach)
#         # similarity_groups = self._dbscan_clustering()
        
#         return similarity_groups
    
#     def _threshold_based_grouping(self, threshold: float) -> Dict[str, List[str]]:
#         """Threshold-based similarity grouping"""
#         groups = {}
#         assigned = set()
#         group_counter = 0
        
#         for i in range(len(self.node_paths)):
#             if self.node_paths[i] in assigned:
#                 continue
                
#             # Find all nodes similar to current node
#             similar_indices = np.where(self.similarity_matrix[i] >= threshold)[0]
#             similar_nodes = [self.node_paths[j] for j in similar_indices if j != i]
            
#             if similar_nodes:  # If we found similar nodes
#                 group_id = f"group_{group_counter}"
#                 groups[group_id] = [self.node_paths[i]] + similar_nodes
#                 assigned.update(groups[group_id])
#                 group_counter += 1
        
#         return groups
    
#     def _dbscan_clustering(self, eps: float = 0.3, min_samples: int = 2) -> Dict[str, List[str]]:
#         """DBSCAN-based clustering (alternative approach)"""
#         # Convert similarity to distance matrix
#         distance_matrix = 1 - self.similarity_matrix
        
#         clustering = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
#         cluster_labels = clustering.fit_predict(distance_matrix)
        
#         groups = {}
#         for i, label in enumerate(cluster_labels):
#             if label != -1:  # -1 is noise in DBSCAN
#                 group_id = f"cluster_{label}"
#                 if group_id not in groups:
#                     groups[group_id] = []
#                 groups[group_id].append(self.node_paths[i])
        
#         return groups
    
#     def _flatten_nodes(self, data: Dict[str, Any], path: str = "") -> List[Tuple[str, Dict[str, Any]]]:
#         """Recursively flatten the node tree into a list"""
#         nodes = []
        
#         if isinstance(data, dict):
#             if 'node' in data:  # This is a node
#                 current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
#                 nodes.append((current_path, data))
            
#             if 'children' in data:
#                 for i, child in enumerate(data['children']):
#                     child_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
#                     nodes.extend(self._flatten_nodes(child, child_path))
        
#         return nodes
    
#     def add_node_ids_to_json(self, original_data: Dict[str, Any], similarity_groups: Dict[str, List[str]]) -> Dict[str, Any]:
#         """
#         Add node_id to each node in the original JSON based on similarity groups
        
#         Args:
#             original_data: Original JSON data
#             similarity_groups: Groups of similar nodes
            
#         Returns:
#             Modified JSON with node_id added to each node
#         """
#         # Create mapping from node path to group id
#         path_to_group = {}
#         for group_id, node_paths in similarity_groups.items():
#             for node_path in node_paths:
#                 path_to_group[node_path] = group_id
        
#         # Add node_ids recursively
#         modified_data = self._add_node_ids_recursive(original_data, path_to_group)
        
#         return modified_data
    
#     def _add_node_ids_recursive(self, data: Dict[str, Any], path_to_group: Dict[str, str], path: str = "") -> Dict[str, Any]:
#         """Recursively add node_ids to the JSON structure"""
#         if isinstance(data, dict):
#             result = data.copy()
            
#             if 'node' in data:  # This is a node
#                 current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                
#                 # Add node_id if this node is in a similarity group
#                 if current_path in path_to_group:
#                     result['node_id'] = path_to_group[current_path]
#                 else:
#                     # Generate unique ID for non-grouped nodes
#                     result['node_id'] = f"unique_{hashlib.md5(current_path.encode()).hexdigest()[:8]}"
            
#             if 'children' in data:
#                 child_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
#                 result['children'] = [
#                     self._add_node_ids_recursive(child, path_to_group, child_path)
#                     for child in data['children']
#                 ]
            
#             return result
        
#         return data

# def main():
#     """Example usage of the FigmaNodeSimilarityDetector"""
    
#     # Example JSON data (your provided data)
#     sample_data = {
#         "children": [
#             {
#                 "children": [
#                     {
#                         "children": [
#                             {
#                                 "children": [],
#                                 "name": "ICON",
#                                 "node": {
#                                     "type": "ELLIPSE",
#                                     "width": 32.0,
#                                     "height": 32.0,
#                                     "fills": [{"type": "SOLID", "color": {"r": 0.77, "g": 0.77, "b": 0.77, "a": 1.0}}]
#                                 },
#                                 "tag": "ICON"
#                             }
#                         ]
#                     }
#                 ]
#             }
#         ]
#     }
    
#     # Initialize detector
#     detector = FigmaNodeSimilarityDetector(
#         similarity_threshold=0.8,
#         use_semantic_embeddings=True
#     )
    
#     try:
#         # Build similarity matrix
#         logger.info("Building similarity matrix...")
#         similarity_matrix = detector.build_similarity_matrix(sample_data)
#         logger.info(f"Similarity matrix shape: {similarity_matrix.shape}")
        
#         # Check for similar nodes
#         logger.info("Checking for similar nodes...")
#         similarity_groups = detector.check_similarity()
#         logger.info(f"Found {len(similarity_groups)} similarity groups")
        
#         # Add node IDs to original JSON
#         logger.info("Adding node IDs to JSON...")
#         result_json = detector.add_node_ids_to_json(sample_data, similarity_groups)
        
#         # Print results
#         print("Similarity Groups:")
#         for group_id, nodes in similarity_groups.items():
#             print(f"  {group_id}: {nodes}")
        
#         print("\nJSON with node_ids:")
#         print(json.dumps(result_json, indent=2))
        
#     except Exception as e:
#         logger.error(f"Error in main execution: {e}")
#         # Fallback without semantic embeddings
#         detector_simple = FigmaNodeSimilarityDetector(
#             similarity_threshold=0.8,
#             use_semantic_embeddings=False
#         )
#         similarity_matrix = detector_simple.build_similarity_matrix(sample_data)
#         similarity_groups = detector_simple.check_similarity()
#         result_json = detector_simple.add_node_ids_to_json(sample_data, similarity_groups)
#         print(json.dumps(result_json, indent=2))

# if __name__ == "__main__":
#     main()

In [None]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer
import hashlib
from typing import Dict, List, Any, Tuple
from dataclasses import dataclass
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class FigmaNodeSimilarityDetector:
    """
    A flexible similarity detection system for Figma nodes using bottom-up approach.
    Feature vectors are extracted directly from leaf nodes and propagated upward.
    """
    
    def __init__(self, similarity_threshold: float = 0.8, use_semantic_embeddings: bool = True):
        """
        Initialize the similarity detector
        
        Args:
            similarity_threshold: Threshold for considering nodes similar (0-1)
            use_semantic_embeddings: Whether to use semantic embeddings for text content
        """
        self.similarity_threshold = similarity_threshold
        self.use_semantic_embeddings = use_semantic_embeddings
        self.node_feature_vectors = {}  # Store feature vectors directly
        self.node_metadata = {}  # Store metadata (is_leaf, etc.)
        self.similarity_matrix = None
        self.clusters = None
        self.node_tree = {}  # Store hierarchical structure
        
        # Initialize semantic model if needed
        if use_semantic_embeddings:
            try:
                self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
                logger.info("Loaded semantic embedding model")
            except Exception as e:
                logger.warning(f"Could not load semantic model: {e}")
                self.use_semantic_embeddings = False
    
    def extract_leaf_node_feature_vector(self, node_data: Dict[str, Any], node_path: str = "") -> np.ndarray:
        """
        Extract feature vector directly from a leaf Figma node

        Args:
            node_data: The node data dictionary
            node_path: Path to the node in the tree

        Returns:
            Numpy array containing the feature vector
        """
        node_info = node_data.get('node', {})
        tag = node_data.get('tag', '')
        
        # Check if tag is ICON or SVG - if so, return default vector of ones
        if tag in ["ICON", "SVG"]:
            # Create default vector with ones
            default_size = 17  # structural(3) + style(12) + content(1) + descendant(1)
            return np.zeros(default_size)
        
        vector = []

        # 1. Structural Features (normalized)
        node_type = node_info.get('type', '')
        has_children = len(node_data.get('children', [])) > 0
        num_children = len(node_data.get('children', []))

        vector.extend([
            hash(node_type) % 1000 / 1000.0,  # Normalize hash
            # hash(tag) % 1000 / 1000.0,
            float(has_children),
            min(num_children / 10000.0, 1.0),  # Normalize to 0-1
        ])

        # 2. Style Features
        # Extract font information from textStyle
        text_style = node_info.get('textStyle', {})
        font_family = text_style.get('fontFamily', '')
        font_size = text_style.get('fontSize', 0)
        font_style = text_style.get('fontStyle', '')
        font_weight = text_style.get('fontWeight', 0)

        # Extract fill colors
        fills = node_info.get('fills', [])
        fill_r = fill_g = fill_b = fill_a = 0
        if fills:
            primary_fill = fills[0]
            color = primary_fill.get('color', {})
            fill_r = color.get('r', 0)
            fill_g = color.get('g', 0)
            fill_b = color.get('b', 0)
            fill_a = color.get('a', 1)

        # Extract stroke colors
        strokes = node_info.get('strokes', [])
        stroke_r = stroke_g = stroke_b = stroke_a = 0
        if strokes:
            primary_stroke = strokes[0]
            stroke_color = primary_stroke.get('color', {})
            stroke_r = stroke_color.get('r', 0)
            stroke_g = stroke_color.get('g', 0)
            stroke_b = stroke_color.get('b', 0)
            stroke_a = stroke_color.get('a', 1)

        vector.extend([
            hash(font_family) % 1000 / 1000.0 if font_family else 0,
            min(font_size / 10000.0, 1.0) if font_size else 0,  # Normalize font size to 0-1 (assuming max ~100px)
            hash(font_style) % 1000 / 1000.0 if font_style else 0,
            min(font_weight / 10000.0, 1.0) if font_weight else 0,  # Normalize font weight (max ~900-1000)
            fill_r, fill_g, fill_b, fill_a,
            stroke_r, stroke_g, stroke_b, stroke_a,
        ])

        # 3. Content Features
        text_content = node_info.get('characters', '')
        name = node_data.get('name', '')
        has_text = bool(text_content)

        vector.extend([
            float(has_text),
        ])

        # 4. Semantic Features (using embeddings) - commented out for now
        # if self.use_semantic_embeddings:
        #     text_for_embedding = f"{name} {node_type}"
        #     if text_for_embedding.strip():
        #         try:
        #             semantic_features = self.semantic_model.encode([text_for_embedding])[0].tolist()
        #             vector.extend(semantic_features)
        #         except:
        #             vector.extend([0.0] * 384)  # Default embedding size
        #     else:
        #         vector.extend([0.0] * 384)
        # else:
        #     vector.extend([0.0] * 384)

        # 5. Descendant Count Feature (for leaf nodes, this is 0)
        total_descendants = 0  # Leaf nodes have no descendants
        vector.extend([
            total_descendants
        ])

        return np.array(vector)

    def create_parent_feature_vector(self, children_vectors: List[np.ndarray], node_data: Dict[str, Any]) -> np.ndarray:
        """
        Create parent node feature vector by averaging children feature vectors

        Args:
            children_vectors: List of children feature vectors
            node_data: Parent node data

        Returns:
            Feature vector for parent node
        """
        tag = node_data.get('tag', '')
        
        # Check if tag is ICON or SVG - if so, return default vector of ones
        if tag in ["ICON", "SVG"]:
            # Create default vector with ones
            default_size = 17  # structural(3) + style(12) + content(1) + descendant(1)
            return np.zeros(default_size)
        
        if not children_vectors:
            # If no children vectors, create a default vector
            default_size = 0 + 17  # Updated size: structural(3) + style(12) + content(1) + descendant(1)
            return np.zeros(default_size)

        # Average all children feature vectors
        parent_vector = np.mean(children_vectors, axis=0)

        # Override some structural features specific to the parent
        node_info = node_data.get('node', {})
        node_type = node_info.get('type', '')
        num_children = len(children_vectors)

        # Update structural features (first 3 elements)
        parent_vector[0] = hash(node_type) % 1000 / 1000.0
        # parent_vector[1] = hash(tag) % 1000 / 1000.0
        parent_vector[1] = 1.0  # Parent always has children
        parent_vector[2] = min(num_children / 10000.0, 1.0)

        # Calculate total descendants
        total_descendants = num_children
        for child_vector in children_vectors:
            # The descendant count is the last feature in the vector
            child_descendants = int(child_vector[-1] * 10000)  # Denormalize
            total_descendants += child_descendants

        # Update the descendant count feature (last element)
        parent_vector[-1] = min(total_descendants / 10000.0, 1.0)

        return parent_vector

    def get_total_descendants_count(self, node_vector: np.ndarray) -> int:
        """
        Extract the total descendants count from a node's feature vector

        Args:
            node_vector: The feature vector of a node

        Returns:
            Total number of descendants for this node
        """
        # The descendant count is the last feature, denormalized
        return int(node_vector[-1] * 10000)
    
    def build_similarity_matrix(self, nodes_data: Dict[str, Any]) -> np.ndarray:
        """
        Build similarity matrix using bottom-up approach.
        Feature vectors are extracted from leaves and propagated upward.
        
        Args:
            nodes_data: Dictionary containing all node data
            
        Returns:
            Similarity matrix as numpy array
        """
        # Step 1: Build the tree structure and identify leaf nodes
        self.node_tree = self._build_node_tree(nodes_data)
        
        # Step 2: Extract feature vectors bottom-up
        self.node_feature_vectors = {}
        self.node_metadata = {}
        self._extract_feature_vectors_bottom_up(nodes_data)
        
        # Step 3: Filter out leaf nodes for similarity calculation (only compare non-leaf nodes)
        non_leaf_paths = [path for path, metadata in self.node_metadata.items() if not metadata['is_leaf']]
        self.node_paths = non_leaf_paths
        
        if not self.node_paths:
            logger.warning("No non-leaf nodes found for similarity comparison")
            return np.array([])
        
        # Step 4: Get feature vectors for non-leaf nodes only
        feature_vectors = []
        for node_path in self.node_paths:
            vector = self.node_feature_vectors[node_path]
            feature_vectors.append(vector)
        
        feature_vectors = np.array(feature_vectors)
        
        # Step 5: Calculate similarity matrix using cosine similarity
        self.similarity_matrix = cosine_similarity(feature_vectors)
        
        return self.similarity_matrix
    
    def _build_node_tree(self, data: Dict[str, Any], path: str = "", parent_path: str = None) -> Dict[str, Dict]:
        """Build a tree structure mapping node paths to their metadata"""
        tree = {}
        
        if isinstance(data, dict):
            if 'node' in data:  # This is a node
                current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                
                tree[current_path] = {
                    'data': data,
                    'parent': parent_path,
                    'children': [],
                    'is_leaf': len(data.get('children', [])) == 0
                }
                
                if 'children' in data:
                    for child in data['children']:
                        child_tree = self._build_node_tree(child, current_path, current_path)
                        tree.update(child_tree)
                        # Add child paths to current node
                        for child_path in child_tree.keys():
                            if child_tree[child_path]['parent'] == current_path:
                                tree[current_path]['children'].append(child_path)
        
        return tree
    
    def _extract_feature_vectors_bottom_up(self, data: Dict[str, Any], path: str = ""):
        """Extract feature vectors using bottom-up approach"""
        if isinstance(data, dict):
            if 'node' in data:  # This is a node
                current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                
                # First, process all children
                children_vectors = []
                if 'children' in data:
                    for child in data['children']:
                        child_path = f"{current_path}"
                        self._extract_feature_vectors_bottom_up(child, child_path)
                        
                        # Get child path and vector
                        child_node_path = f"{current_path}/{child.get('name', 'unnamed')}"
                        if child_node_path in self.node_feature_vectors:
                            children_vectors.append(self.node_feature_vectors[child_node_path])
                
                # Extract feature vector for current node
                is_leaf = len(data.get('children', [])) == 0
                
                if is_leaf:  # Leaf node
                    self.node_feature_vectors[current_path] = self.extract_leaf_node_feature_vector(data, current_path)
                else:  # Parent node - aggregate children vectors
                    self.node_feature_vectors[current_path] = self.create_parent_feature_vector(children_vectors, data)
                
                # Store metadata
                self.node_metadata[current_path] = {
                    'is_leaf': is_leaf,
                    'num_children': len(children_vectors),
                    'node_type': data.get('node', {}).get('type', ''),
                    'name': data.get('name', '')
                }
    
    def check_similarity(self, threshold: float = None) -> Dict[str, List[str]]:
        """
        Check similarity between non-leaf nodes and return groups of similar nodes.
        Leaf nodes are excluded from grouping.
        
        Args:
            threshold: Similarity threshold (uses instance threshold if None)
            
        Returns:
            Dictionary mapping group_id to list of similar node paths (non-leaf nodes only)
        """
        if self.similarity_matrix is None:
            raise ValueError("Must build similarity matrix first")
        
        if len(self.similarity_matrix) == 0:
            logger.warning("No non-leaf nodes available for similarity comparison")
            return {}
        
        threshold = threshold or self.similarity_threshold
        
        # Method 1: Simple threshold-based grouping (only for non-leaf nodes)
        similarity_groups = self._threshold_based_grouping(threshold)
        
        return similarity_groups
    
    def _threshold_based_grouping(self, threshold: float) -> Dict[str, List[str]]:
        """Threshold-based similarity grouping for non-leaf nodes only"""
        groups = {}
        assigned = set()
        group_counter = 0
        
        for i in range(len(self.node_paths)):
            if self.node_paths[i] in assigned:
                continue
                
            # Find all nodes similar to current node
            similar_indices = np.where(self.similarity_matrix[i] >= threshold)[0]
            similar_nodes = [self.node_paths[j] for j in similar_indices if j != i]
            
            if similar_nodes:  # If we found similar nodes
                group_id = f"group_{group_counter}"
                groups[group_id] = [self.node_paths[i]] + similar_nodes
                assigned.update(groups[group_id])
                group_counter += 1
        
        return groups
    
    def _dbscan_clustering(self, eps: float = 0.3, min_samples: int = 2) -> Dict[str, List[str]]:
        """DBSCAN-based clustering (alternative approach) for non-leaf nodes only"""
        # Convert similarity to distance matrix
        distance_matrix = 1 - self.similarity_matrix
        
        clustering = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
        cluster_labels = clustering.fit_predict(distance_matrix)
        
        groups = {}
        for i, label in enumerate(cluster_labels):
            if label != -1:  # -1 is noise in DBSCAN
                group_id = f"cluster_{label}"
                if group_id not in groups:
                    groups[group_id] = []
                groups[group_id].append(self.node_paths[i])
        
        return groups
    
    def add_node_ids_to_json(self, original_data: Dict[str, Any], similarity_groups: Dict[str, List[str]]) -> Dict[str, Any]:
        """
        Add node_id to each node in the original JSON based on similarity groups.
        Only non-leaf nodes get group IDs, leaf nodes get unique IDs.
        
        Args:
            original_data: Original JSON data
            similarity_groups: Groups of similar nodes (non-leaf only)
            
        Returns:
            Modified JSON with node_id added to each node
        """
        # Create mapping from node path to group id
        path_to_group = {}
        for group_id, node_paths in similarity_groups.items():
            for node_path in node_paths:
                path_to_group[node_path] = group_id
        
        # Add node_ids recursively
        modified_data = self._add_node_ids_recursive(original_data, path_to_group)
        
        return modified_data
    
    def _add_node_ids_recursive(self, data: Dict[str, Any], path_to_group: Dict[str, str], path: str = "") -> Dict[str, Any]:
        """Recursively add node_ids to the JSON structure"""
        if isinstance(data, dict):
            result = data.copy()
            
            if 'node' in data:  # This is a node
                current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                
                # Check if this is a leaf node
                is_leaf = len(data.get('children', [])) == 0
                
                if is_leaf:
                    # Leaf nodes get unique IDs
                    result['node_id'] = f"leaf_{hashlib.md5(current_path.encode()).hexdigest()[:8]}"
                else:
                    # Non-leaf nodes get group IDs if they're in a similarity group
                    if current_path in path_to_group:
                        result['node_id'] = path_to_group[current_path]
                    else:
                        # Generate unique ID for non-grouped non-leaf nodes
                        result['node_id'] = f"unique_{hashlib.md5(current_path.encode()).hexdigest()[:8]}"
            
            if 'children' in data:
                child_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                result['children'] = [
                    self._add_node_ids_recursive(child, path_to_group, child_path)
                    for child in data['children']
                ]
            
            return result
        
        return data

    def get_feature_vector_info(self) -> Dict[str, Any]:
        """
        Get information about the feature vectors for debugging/analysis
        
        Returns:
            Dictionary containing feature vector statistics
        """
        if not self.node_feature_vectors:
            return {"error": "No feature vectors extracted yet"}
        
        vector_lengths = [len(v) for v in self.node_feature_vectors.values()]
        leaf_count = sum(1 for metadata in self.node_metadata.values() if metadata['is_leaf'])
        non_leaf_count = len(self.node_metadata) - leaf_count
        
        return {
            "total_nodes": len(self.node_feature_vectors),
            "leaf_nodes": leaf_count,
            "non_leaf_nodes": non_leaf_count,
            "feature_vector_length": vector_lengths[0] if vector_lengths else 0,
            "feature_breakdown": {
                "structural_features": 3,
                "style_features": 12,  # Updated: font_family, font_size, font_style, font_weight + colors
                "content_features": 1,
                "descendant_count": 1,
                "semantic_features": 384 if self.use_semantic_embeddings else 0
            }
        }

    def print_figma_tree_with_vectors(self, node, depth=0, path=""):
        """
        Print the Figma node tree with feature vectors
        
        Args:
            node: The current node to print
            depth: Current depth in the tree
            path: Current path to the node
        """
        indent = "  " * depth  # 2 spaces per level

        # Extract info
        name = node.get("name", "[no name]")
        tag = node.get("tag", "[no tag]")
        node_id = node.get("node_id", "")
        
        # Handle TEXT nodes with characters
        node_data = node.get("node", {})
        characters = node_data.get("characters", "")
        is_text = tag == "TEXT"
        display_name = characters[:10] + "..." if is_text and characters else name

        # Layout info (if present)
        layout = node_data.get("layoutMode", "NONE")
        layout_str = "ROWS" if layout == "HORIZONTAL" else "COLS" if layout == "VERTICAL" else layout

        # Get the current node path
        current_path = f"{path}/{name}" if path else name
        
        # Get feature vector if available
        vector_str = ""
        if current_path in self.node_feature_vectors:
            vector = self.node_feature_vectors[current_path]
            # Format vector to show first few values and some key features
        if len(vector) > 0:
            vector_str = f" | Vector: [{', '.join(f'{v:.7f}' for v in vector)}] (len={len(vector)})"
        else:
            vector_str = " | Vector: Not found"

        # Print current node info with feature vector
        print(f"{indent}- {display_name} [{tag}] -> {name} {layout_str} ({node_id}){vector_str}")

        # Recursively print children
        for child in node.get("children", []):
            self.print_figma_tree_with_vectors(child, depth + 1, current_path)



            

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:__main__:Loaded semantic embedding model


{'depth5_group0': ['name0/name1/name2/name3/name5/name6', 'name0/name1/name2/name3/name5/name8', 'name0/name1/name2/name3/name5/name10', 'name0/name1/name2/name3/name5/name12'], 'depth5_group1': ['name0/name101/name102/name103/name104/name105', 'name0/name101/name149/name150/name151/name152'], 'depth5_group2': ['name0/name101/name102/name103/name104/name107', 'name0/name101/name149/name150/name151/name154'], 'depth5_group3': ['name0/name101/name102/name114/name115/name116', 'name0/name101/name149/name161/name162/name163'], 'depth5_group4': ['name0/name101/name102/name114/name115/name124', 'name0/name101/name149/name161/name162/name171'], 'depth5_group5': ['name0/name101/name102/name114/name132/name133', 'name0/name101/name102/name114/name132/name137', 'name0/name101/name102/name114/name132/name141', 'name0/name101/name149/name161/name179/name180', 'name0/name101/name149/name161/name179/name184', 'name0/name101/name149/name161/name179/name188'], 'depth5_group6': ['name0/name290/name300/

In [3]:
import json

def rename_names_and_store_mapping(node, counter, mapping):
    if isinstance(node, dict):
        if "name" in node:
            old_name = node["name"]
            new_name = f"name{counter[0]}"
            mapping[new_name] = old_name
            node["name"] = new_name
            counter[0] += 1
        for key in node:
            rename_names_and_store_mapping(node[key], counter, mapping)
    elif isinstance(node, list):
        for item in node:
            rename_names_and_store_mapping(item, counter, mapping)

# Load input JSON
with open("PAGE_109.json", "r") as f:
    data = json.load(f)

# Rename names and save mapping
name_mapping = {}
rename_names_and_store_mapping(data, [0], name_mapping)

# Save modified JSON and mapping
with open("modified.json", "w") as f:
    json.dump(data, f, indent=2)

with open("name_mapping.json", "w") as f:
    json.dump(name_mapping, f, indent=2)


In [4]:
# Initialize the detector
detector = FigmaNodeSimilarityDetector(
    similarity_threshold=0.99999999999,
    use_semantic_embeddings=True
)

# Load your JSON data
with open('modified.json', 'r') as f:
    figma_data = json.load(f)

# Build similarity matrix
similarity_matrix = detector.build_similarity_matrix(figma_data)

# Find similar nodes
similarity_groups = detector.check_similarity()
print(similarity_groups)
# Add node_ids to original JSON
result_json = detector.add_node_ids_to_json(figma_data, similarity_groups)


with open("modified.json", "w") as f:
    json.dump(result_json, f, indent=2)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:__main__:Loaded semantic embedding model


{'group_0': ['name0/name1/name2/name3/name5/name6', 'name0/name1/name2/name3/name5/name8', 'name0/name1/name2/name3/name5/name10', 'name0/name1/name2/name3/name5/name12'], 'group_1': ['name0/name1/name2/name19/name20/name24/name25', 'name0/name101/name102/name103/name104/name105', 'name0/name101/name149/name150/name151/name152'], 'group_2': ['name0/name53/name54/name58', 'name0/name83/name89/name93'], 'group_3': ['name0/name53/name54', 'name0/name83/name89'], 'group_4': ['name0/name53/name62/name63', 'name0/name53/name62/name72'], 'group_5': ['name0/name97', 'name0/name196'], 'group_6': ['name0/name101/name102/name103/name104/name107', 'name0/name101/name149/name150/name151/name154'], 'group_7': ['name0/name101/name102/name103/name104', 'name0/name101/name149/name150/name151'], 'group_8': ['name0/name101/name102/name103/name109', 'name0/name101/name149/name150/name156'], 'group_9': ['name0/name101/name102/name103/name111', 'name0/name101/name149/name150/name158'], 'group_10': ['name0/n

In [2]:
def restore_names_from_mapping(node, mapping):
    if isinstance(node, dict):
        if "name" in node and node["name"] in mapping:
            node["name"] = mapping[node["name"]]
        for key in node:
            restore_names_from_mapping(node[key], mapping)
    elif isinstance(node, list):
        for item in node:
            restore_names_from_mapping(item, mapping)

# Load modified JSON and name mapping
with open("modified.json", "r") as f:
    modified_data = json.load(f)

with open("name_mapping.json", "r") as f:
    name_mapping = json.load(f)

# Restore original names
restore_names_from_mapping(modified_data, name_mapping)

# Save restored JSON
with open("restored.json", "w") as f:
    json.dump(modified_data, f, indent=2)


In [3]:
detector.print_figma_tree_with_vectors(result_json)

- name0 [DIV] -> name0 NONE (unique_ad6f6c09) | Vector: [0.2510000, 1.0000000, 0.0010000, 0.0100458, 0.0011936, 0.5014654, 0.0374271, 0.2953886, 0.3083035, 0.3189898, 0.7160590, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.6278646, 0.0234000] (len=17)
  - name1 [DIV] -> name1 NONE (unique_917b8603) | Vector: [0.2510000, 1.0000000, 0.0002000, 0.0056667, 0.0007417, 0.3272917, 0.0238542, 0.2351614, 0.3958027, 0.6607333, 0.8541667, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.3541667, 0.0029000] (len=17)
    - name2 [DIV] -> name2 NONE (unique_a21c5b2f) | Vector: [0.8060000, 1.0000000, 0.0002000, 0.0113333, 0.0014833, 0.6545833, 0.0477083, 0.3134600, 0.3445466, 0.3214665, 0.7083333, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.7083333, 0.0027000] (len=17)
      - name3 [DIV] -> name3 NONE (unique_831dabf8) | Vector: [0.2510000, 1.0000000, 0.0003000, 0.0146667, 0.0016167, 0.8864167, 0.0641667, 0.3797386, 0.4117647, 0.4006536, 0.9166667, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.91666

In [24]:
import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw
import json
import random
from typing import Dict, List, Any

def visualize_groups_on_svg(svg_file_path: str, json_data: Dict[str, Any], output_dir: str = "./output"):
    """
    Read SVG image and create separate images for each group with rectangles around components.
    
    Args:
        svg_file_path: Path to the SVG file
        json_data: JSON data containing nodes with coordinates and group_ids
        output_dir: Directory to save output images
    """
    import os
    os.makedirs(output_dir, exist_ok=True)
    
    # Convert SVG to PNG first (you'll need to install cairosvg: pip install cairosvg)
    try:
        import cairosvg
        png_data = cairosvg.svg2png(url=svg_file_path)
        base_image = Image.open(io.BytesIO(png_data))
    except ImportError:
        print("cairosvg not installed. Please install it: pip install cairosvg")
        return
    except Exception as e:
        print(f"Error converting SVG: {e}")
        return
    
    # Extract node coordinates and group information
    node_groups = {}
    
    def extract_nodes_recursive(node, path=""):
        if isinstance(node, dict):
            if 'node' in node:
                current_path = f"{path}/{node.get('name', 'unnamed')}" if path else node.get('name', 'root')
                
                # Get coordinates from absoluteBoundingBox
                abs_box = node.get('node', {}).get('absoluteBoundingBox', {})
                if abs_box and 'node_id' in node:
                    x = abs_box.get('x', 0)
                    y = abs_box.get('y', 0)
                    width = abs_box.get('width', 0)
                    height = abs_box.get('height', 0)
                    
                    group_id = node.get('node_id', 'no_group')
                    
                    if group_id not in node_groups:
                        node_groups[group_id] = []
                    
                    node_groups[group_id].append({
                        'name': node.get('name', 'unnamed'),
                        'x': x,
                        'y': y,
                        'width': width,
                        'height': height
                    })
                
                # Process children
                for child in node.get('children', []):
                    extract_nodes_recursive(child, current_path)
    
    extract_nodes_recursive(json_data)
    
    # Generate random colors for each group
    colors = {}
    for group_id in node_groups.keys():
        colors[group_id] = (
            random.randint(0, 255),
            random.randint(0, 255),
            random.randint(0, 255)
        )
    
    # Create image for each group
    for group_id, nodes in node_groups.items():
        if len(nodes) <= 1:  # Skip groups with only one node
            continue
            
        # Create a copy of the base image
        group_image = base_image.copy()
        draw = ImageDraw.Draw(group_image)
        
        # Draw rectangles around nodes in this group
        color = colors[group_id]
        for node in nodes:
            x, y, width, height = node['x'], node['y'], node['width'], node['height']
            
            # Draw rectangle outline
            draw.rectangle(
                [(x, y), (x + width, y + height)],
                outline=color,
                width=3
            )
            
            # Add group label
            draw.text((x, y - 20), f"{group_id}", fill=color)
        
        # Save the image
        output_path = f"{output_dir}/group_{group_id}.png"
        group_image.save(output_path)
        print(f"Saved: {output_path}")
    
    print(f"Created {len([g for g in node_groups.values() if len(g) > 1])} group visualization images")


# Alternative simpler version without SVG conversion (if you have PNG/JPG)
def visualize_groups_on_image(image_path: str, json_data: Dict[str, Any], output_dir: str = "./output"):
    """
    Simpler version that works with PNG/JPG images directly.
    
    Args:
        image_path: Path to the image file (PNG, JPG, etc.)
        json_data: JSON data containing nodes with coordinates and group_ids
        output_dir: Directory to save output images
    """
    import os
    os.makedirs(output_dir, exist_ok=True)
    
    # Load base image
    base_image = Image.open(image_path)
    
    # Extract node coordinates and group information
    node_groups = {}
    
    def extract_nodes_recursive(node, path=""):
        if isinstance(node, dict):
            if 'node' in node:
                current_path = f"{path}/{node.get('name', 'unnamed')}" if path else node.get('name', 'root')
                
                # Get coordinates from absoluteBoundingBox
                node_info = node.get('node', {})
                x = node_info.get('x', 0)
                y = node_info.get('y', 0)
                width = node_info.get('width', 0)
                height = node_info.get('height', 0)
                
                group_id = node.get('node_id', 'no_group')
                
                if group_id not in node_groups:
                    node_groups[group_id] = []
                
                node_groups[group_id].append({
                    'name': node.get('name', 'unnamed'),
                    'x': x,
                    'y': y,
                    'width': width,
                    'height': height
                })
                
                # Process children
                for child in node.get('children', []):
                    extract_nodes_recursive(child, current_path)
    
    extract_nodes_recursive(json_data)
    
    # Generate colors for each group
    colors = [
        (255, 0, 0),    # Red
        (0, 255, 0),    # Green
        (0, 0, 255),    # Blue
        (255, 255, 0),  # Yellow
        (255, 0, 255),  # Magenta
        (0, 255, 255),  # Cyan
        (255, 165, 0),  # Orange
        (128, 0, 128),  # Purple
    ]
    
    color_index = 0
    group_colors = {}
    
    # Create image for each group
    for group_id, nodes in node_groups.items():
        if len(nodes) <= 1:  # Skip groups with only one node
            continue
            
        # Assign color to group
        if group_id not in group_colors:
            group_colors[group_id] = colors[color_index % len(colors)]
            color_index += 1
        
        # Create a copy of the base image
        group_image = base_image.copy()
        draw = ImageDraw.Draw(group_image)
        
        # Draw rectangles around nodes in this group
        color = group_colors[group_id]
        for node in nodes:
            x, y, width, height = int(node['x']), int(node['y']), int(node['width']), int(node['height'])
            
            # Draw rectangle outline
            draw.rectangle(
                [(x, y), (x + width, y + height)],
                outline=color,
                width=4
            )
            
            # Add group label
            draw.text((x, max(0, y - 25)), f"{group_id}", fill=color)
        
        # Save the image
        output_path = f"{output_dir}/group_{group_id}.png"
        group_image.save(output_path)
        print(f"Saved: {output_path}")
    
    print(f"Total groups found: {len(node_groups)}")
    print(f"Created {len([g for g in node_groups.values() if len(g) > 1])} group visualization images")



with open('modified.json', 'r') as f:
    json_data = json.load(f)

# For PNG/JPG images (simpler):
visualize_groups_on_image('PAGE_109.png', json_data)

# For SVG images (requires cairosvg):
# visualize_groups_on_svg('your_image.svg', json_data)

Saved: ./output/group_depth5_group0.png
Saved: ./output/group_depth2_group0.png
Saved: ./output/group_depth3_group0.png
Saved: ./output/group_depth3_group1.png
Saved: ./output/group_depth1_group0.png
Saved: ./output/group_depth2_group1.png
Saved: ./output/group_depth4_group0.png
Saved: ./output/group_depth5_group1.png
Saved: ./output/group_depth5_group2.png
Saved: ./output/group_depth4_group1.png
Saved: ./output/group_depth4_group2.png
Saved: ./output/group_depth3_group2.png
Saved: ./output/group_depth4_group3.png
Saved: ./output/group_depth5_group3.png
Saved: ./output/group_depth5_group4.png
Saved: ./output/group_depth4_group4.png
Saved: ./output/group_depth4_group5.png
Saved: ./output/group_depth5_group5.png
Saved: ./output/group_depth4_group6.png
Saved: ./output/group_depth2_group2.png
Saved: ./output/group_depth3_group3.png
Saved: ./output/group_depth4_group7.png
Saved: ./output/group_depth3_group4.png
Saved: ./output/group_depth2_group3.png
Saved: ./output/group_depth3_group5.png


In [4]:
with open('PAGE_1_OUTPUT.json', 'w') as f:
    json.dump(result_json, f, indent=2)


In [42]:
print(json.dumps(result_json, indent=2))


{
  "children": [
    {
      "children": [
        {
          "children": [
            {
              "children": [
                {
                  "children": [
                    {
                      "children": [],
                      "name": "Travel",
                      "node": {
                        "StrokeWeight": 0.0,
                        "bottomLeftRadius": 0.0,
                        "bottomRightRadius": 0.0,
                        "characters": "Travel",
                        "fills": [
                          {
                            "blendMode": "NORMAL",
                            "color": {
                              "a": 1.0,
                              "b": 0.37254902720451355,
                              "g": 0.32156863808631897,
                              "r": 0.2666666805744171
                            },
                            "imageRef": "",
                            "type": "SOLID"
                          }


In [76]:
def print_figma_node(node, depth=0):
    indent = "  " * depth  # 2 spaces per level

    # Extract info
    name = node.get("name", "[no name]")
    tag = node.get("tag", "[no tag]")
    node_id = node.get("node_id", "")
    
    # Handle TEXT nodes with characters
    node_data = node.get("node", {})
    characters = node_data.get("characters", "")
    is_text = tag == "TEXT"
    display_name = characters[:10] + "..." if is_text and characters else name

    # Layout info (if present)
    layout = node_data.get("layoutMode", "NONE")
    layout_str = "ROWS" if layout == "HORIZONTAL" else "COLS" if layout == "VERTICAL" else layout

    # Print current node info
    print(f"{indent}- {display_name} [{tag}] -> {name} {layout_str} ({node_id})")

    # Recursively print children
    for child in node.get("children", []):
        print_figma_node(child, depth + 1)



with open("restored.json", "r") as f:
    restored_data = json.load(f)
print_figma_node(restored_data)

- TREE BUILDER GROUP [DIV] -> TREE BUILDER GROUP NONE (unique_ad6f6c09)
  - PAGE_115 [DIV] -> PAGE_115 NONE (unique_917b8603)
    - NAVBAR [NAVBAR] -> NAVBAR NONE (unique_a21c5b2f)
      - LIST [LIST] -> LIST NONE (unique_831dabf8)
        - LI [LI] -> LI NONE (group_0)
          - Travel [P] -> Travel NONE (leaf_1d6736ed)
        - LI [LI] -> LI NONE (group_0)
          - Travel [P] -> Travel NONE (leaf_538018bc)
      - TREE BUILDER GROUP [DIV] -> TREE BUILDER GROUP NONE (unique_4145b98c)
        - ICON [ICON] -> ICON NONE (unique_a9ddecc4)
          - Vector [DIV] -> Vector NONE (leaf_77e8672a)
        - LIST [LIST] -> LIST NONE (unique_6aefeabf)
          - LI [LI] -> LI NONE (group_0)
            - A [A] -> A NONE (leaf_9134004b)
          - LI [LI] -> LI NONE (group_0)
            - A [A] -> A NONE (leaf_54c1e5d7)
          - LI [LI] -> LI NONE (group_0)
            - A [A] -> A NONE (leaf_89f588e2)
    - TREE BUILDER GROUP [DIV] -> TREE BUILDER GROUP NONE (unique_c3ea6ea6)
     

In [None]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer
import hashlib
from typing import Dict, List, Any, Tuple
from dataclasses import dataclass
import logging
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel


# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class FigmaNodeSimilarityDetector:
    """
    A flexible similarity detection system for Figma nodes using bottom-up approach.
    Feature vectors are extracted directly from leaf nodes and propagated upward.
    """
    
    def __init__(self, similarity_threshold: float = 0.8, use_semantic_embeddings: bool = True):
        """
        Initialize the similarity detector
        
        Args:
            similarity_threshold: Threshold for considering nodes similar (0-1)
            use_semantic_embeddings: Whether to use semantic embeddings for text content
        """
        self.similarity_threshold = similarity_threshold
        self.use_semantic_embeddings = use_semantic_embeddings
        self.node_feature_vectors = {}  # Store feature vectors directly
        self.node_metadata = {}  # Store metadata (is_leaf, etc.)
        self.similarity_matrix = None
        self.clusters = None
        self.node_tree = {}  # Store hierarchical structure
        
        # Initialize semantic model if needed
        if use_semantic_embeddings:
            try:
                self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
                logger.info("Loaded semantic embedding model")
            except Exception as e:
                logger.warning(f"Could not load semantic model: {e}")
                self.use_semantic_embeddings = False
    
    def extract_leaf_node_feature_vector(self, node_data: Dict[str, Any], node_path: str = "") -> np.ndarray:
        """
        Extract feature vector directly from a leaf Figma node

        Args:
            node_data: The node data dictionary
            node_path: Path to the node in the tree

        Returns:
            Numpy array containing the feature vector
        """
        node_info = node_data.get('node', {})
        vector = []

        # 1. Structural Features (normalized)
        node_type = node_info.get('type', '')
        tag = node_data.get('tag', '')
        has_children = len(node_data.get('children', [])) > 0
        num_children = len(node_data.get('children', []))
        node_layout = node_info.get('layout', '')
        vector.extend([
            hash(node_type) % 1000 / 1000.0,  # Normalize hash
            # hash(tag) % 1000 / 1000.0,
            float(has_children),
            min(num_children / 10000.0, 1.0),  # Normalize to 0-1
            hash(node_layout) % 1000 / 1000.0,  # Normalize hash
            (node_info.get('width', 0) * node_info.get('width', 0)) % 100000 / 100000.0 * 10
        ])

        # 2. Style Features
        # Extract font information from textStyle
        text_style = node_info.get('textStyle', {})
        font_family = text_style.get('fontFamily', '')
        font_size = text_style.get('fontSize', 0)
        font_style = text_style.get('fontStyle', '')
        font_weight = text_style.get('fontWeight', 0)

        # Extract fill colors
        fills = node_info.get('fills', [])
        fill_r = fill_g = fill_b = fill_a = 0
        if fills:
            primary_fill = fills[0]
            color = primary_fill.get('color', {})
            fill_r = color.get('r', 0)
            fill_g = color.get('g', 0)
            fill_b = color.get('b', 0)
            fill_a = color.get('a', 1)

        # Extract stroke colors
        strokes = node_info.get('strokes', [])
        stroke_r = stroke_g = stroke_b = stroke_a = 0
        if strokes:
            primary_stroke = strokes[0]
            stroke_color = primary_stroke.get('color', {})
            stroke_r = stroke_color.get('r', 0)
            stroke_g = stroke_color.get('g', 0)
            stroke_b = stroke_color.get('b', 0)
            stroke_a = stroke_color.get('a', 1)

        vector.extend([
            hash(font_family) % 1000 / 1000.0 if font_family else 0,
            min(font_size / 10000.0, 1.0) if font_size else 0,  # Normalize font size to 0-1 (assuming max ~100px)
            hash(font_style) % 1000 / 1000.0 if font_style else 0,
            min(font_weight / 10000.0, 1.0) if font_weight else 0,  # Normalize font weight (max ~900-1000)
            fill_r, fill_g, fill_b, fill_a,
            stroke_r, stroke_g, stroke_b, stroke_a,
        ])

        # 3. Content Features
        text_content = node_info.get('characters', '')
        name = node_data.get('name', '')
        has_text = bool(text_content)

        vector.extend([
            float(has_text),
        ])

        # 4. Semantic Features (using embeddings) - commented out for now
        # if self.use_semantic_embeddings:
        #     text_for_embedding = f"{name} {node_type}"
        #     if text_for_embedding.strip():
        #         try:
        #             semantic_features = self.semantic_model.encode([text_for_embedding])[0].tolist()
        #             vector.extend(semantic_features)
        #         except:
        #             vector.extend([0.0] * 384)  # Default embedding size
        #     else:
        #         vector.extend([0.0] * 384)
        # else:
        #     vector.extend([0.0] * 384)

        # 5. Descendant Count Feature (for leaf nodes, this is 0)
        total_descendants = 0  # Leaf nodes have no descendants
        vector.extend([
            total_descendants
        ])

        return np.array(vector)


    def _calculate_area_weights(self, parent_data: Dict[str, Any], children_data: List[Dict[str, Any]]) -> List[float]:
        """
        Calculate area-based weights for children nodes relative to parent.

        Args:
            parent_data: Parent node data
            children_data: List of children node data

        Returns:
            List of normalized weights (sum to 1.0) or None if calculation fails
        """
        if not children_data:
            return None

        try:
            # Extract parent dimensions
            parent_node = parent_data.get('node', {})
            parent_width = parent_node.get('width', 0)
            parent_height = parent_node.get('height', 0)
            parent_area = parent_width * parent_height

            if parent_area <= 0:
                logger.warning("Parent area is zero or negative, using equal weights")
                return [1.0 / len(children_data)] * len(children_data)

            # Calculate children areas
            children_areas = []
            for child_data in children_data:
                child_node = child_data.get('node', {})
                child_width = child_node.get('width', 0)
                child_height = child_node.get('height', 0)
                child_area = child_width * child_height
                children_areas.append(child_area)

            # Calculate weights as ratio of child area to parent area
            weights = [child_area / parent_area for child_area in children_areas]

            # Normalize weights to sum to 1.0
            total_weight = sum(weights)
            if total_weight > 0:
                normalized_weights = [w / total_weight for w in weights]
            else:
                # If all areas are zero, use equal weights
                normalized_weights = [1.0 / len(children_data)] * len(children_data)

            logger.debug(f"Calculated area weights: {normalized_weights}")
            return normalized_weights

        except Exception as e:
            logger.warning(f"Error calculating area weights: {e}")
            return None
        
    def create_parent_feature_vector(self, children_vectors: List[np.ndarray], node_data: Dict[str, Any], children_data: List[Dict[str, Any]] = None) -> np.ndarray:
        """
        Create parent node feature vector by computing weighted average of children feature vectors
        based on the area ratio of each child to the parent node.

        Args:
            children_vectors: List of children feature vectors
            node_data: Parent node data
            children_data: List of children node data (needed for area calculations)

        Returns:
            Feature vector for parent node
        """
        if not children_vectors:
            # If no children vectors, create a default vector
            default_size = 19  # structural(4) + style(12) + content(1) + descendant(1)
            return np.zeros(default_size)

        # Calculate areas and weights
        weights = self._calculate_area_weights(node_data, node_data.get('children', {}))
        
        if weights is None or len(weights) != len(children_vectors):
            # Fallback to simple average if area calculation fails
            logger.warning("Area calculation failed, falling back to simple average")
            parent_vector = np.mean(children_vectors, axis=0)
        else:
            # Calculate weighted average
            weighted_vectors = []
            for i, (vector, weight) in enumerate(zip(children_vectors, weights)):
                weighted_vectors.append(vector * weight)

            parent_vector = np.sum(weighted_vectors, axis=0)
            logger.debug(f"Applied area weights: {weights}")


        # Override some structural features specific to the parent
        node_info = node_data.get('node', {})
        node_type = node_info.get('type', '')
        tag = node_data.get('tag', '')
        num_children = len(children_vectors)
        node_layout = node_info.get('layout', '')

        # Update structural features (first 4 elements)
        parent_vector[0] = hash(node_type) % 1000 / 1000.0
        parent_vector[1] = 1.0  # Parent always has children
        parent_vector[2] = min(num_children / 10000.0, 1.0)
        parent_vector[3] = hash(node_layout) % 1000 / 1000.0
        parent_vector[4] = (node_info.get('width', 0) * node_info.get('width', 0)) % 100000000 / 100000000.0 * 10

        # Calculate total descendants
        total_descendants = num_children
        for child_vector in children_vectors:
            # The descendant count is the last feature in the vector
            child_descendants = int(child_vector[-1] * 10000)  # Denormalize
            total_descendants += child_descendants

        # Update the descendant count feature (last element)
        parent_vector[-1] = min(total_descendants / 10000.0, 1.0)

        return parent_vector


    
    def build_similarity_matrix(self, nodes_data: Dict[str, Any]) -> np.ndarray:
        """
        Build similarity matrix using bottom-up approach.
        Feature vectors are extracted from leaves and propagated upward.
        
        Args:
            nodes_data: Dictionary containing all node data
            
        Returns:
            Similarity matrix as numpy array
        """
        # Step 1: Build the tree structure and identify leaf nodes
        self.node_tree = self._build_node_tree(nodes_data)
        
        # Step 2: Extract feature vectors bottom-up
        self.node_feature_vectors = {}
        self.node_metadata = {}
        self._extract_feature_vectors_bottom_up(nodes_data)
        
        # Step 3: Filter out leaf nodes for similarity calculation (only compare non-leaf nodes)
        non_leaf_paths = [path for path, metadata in self.node_metadata.items() if not metadata['is_leaf']]
        self.node_paths = non_leaf_paths
        
        if not self.node_paths:
            logger.warning("No non-leaf nodes found for similarity comparison")
            return np.array([])
        
        # Step 4: Get feature vectors for non-leaf nodes only
        feature_vectors = []
        for node_path in self.node_paths:
            vector = self.node_feature_vectors[node_path]
            feature_vectors.append(vector)
        
        feature_vectors = np.array(feature_vectors)
        
        # Step 5: Calculate similarity matrix using cosine similarity
        self.similarity_matrix = cosine_similarity(feature_vectors)
        
        return self.similarity_matrix
    
    def _build_node_tree(self, data: Dict[str, Any], path: str = "", parent_path: str = None) -> Dict[str, Dict]:
        """Build a tree structure mapping node paths to their metadata"""
        tree = {}
        
        if isinstance(data, dict):
            if 'node' in data:  # This is a node
                current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                
                tree[current_path] = {
                    'data': data,
                    'parent': parent_path,
                    'children': [],
                    'is_leaf': len(data.get('children', [])) == 0
                }
                
                if 'children' in data:
                    for child in data['children']:
                        child_tree = self._build_node_tree(child, current_path, current_path)
                        tree.update(child_tree)
                        # Add child paths to current node
                        for child_path in child_tree.keys():
                            if child_tree[child_path]['parent'] == current_path:
                                tree[current_path]['children'].append(child_path)
        
        return tree
    
    def _extract_feature_vectors_bottom_up(self, data: Dict[str, Any], path: str = ""):
        """Extract feature vectors using bottom-up approach"""
        if isinstance(data, dict):
            if 'node' in data:  # This is a node
                current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                
                # First, process all children
                children_vectors = []
                if 'children' in data:
                    for child in data['children']:
                        child_path = f"{current_path}"
                        self._extract_feature_vectors_bottom_up(child, child_path)
                        
                        # Get child path and vector
                        child_node_path = f"{current_path}/{child.get('name', 'unnamed')}"
                        if child_node_path in self.node_feature_vectors:
                            children_vectors.append(self.node_feature_vectors[child_node_path])
                
                # Extract feature vector for current node
                is_leaf = len(data.get('children', [])) == 0
                
                if is_leaf:  # Leaf node
                    self.node_feature_vectors[current_path] = self.extract_leaf_node_feature_vector(data, current_path)
                else:  # Parent node - aggregate children vectors
                    self.node_feature_vectors[current_path] = self.create_parent_feature_vector(children_vectors, data)
                
                # Store metadata
                self.node_metadata[current_path] = {
                    'is_leaf': is_leaf,
                    'num_children': len(children_vectors),
                    'node_type': data.get('node', {}).get('type', ''),
                    'name': data.get('name', '')
                }
    
    def calculate_group_average_vectors(self, similarity_groups: Dict[str, List[str]]) -> Dict[str, np.ndarray]:
        """
        Calculate average feature vector for each similarity group
        
        Args:
            similarity_groups: Dictionary mapping group_id to list of node paths
            
        Returns:
            Dictionary mapping group_id to average feature vector
        """
        group_vectors = {}
        
        for group_id, node_paths in similarity_groups.items():
            group_feature_vectors = []
            for node_path in node_paths:
                if node_path in self.node_feature_vectors:
                    group_feature_vectors.append(self.node_feature_vectors[node_path])
            
            if group_feature_vectors:
                # Calculate average vector for the group
                group_vectors[group_id] = np.mean(group_feature_vectors, axis=0)
            else:
                logger.warning(f"No feature vectors found for group {group_id}")
        
        return group_vectors

    def merge_similar_groups(self, similarity_groups: Dict[str, List[str]], 
                            merge_threshold: float = 0.9) -> Dict[str, List[str]]:
        """
        Merge groups that have similar average feature vectors

        Args:
            similarity_groups: Original similarity groups
            merge_threshold: Threshold for merging groups (default 0.9)

        Returns:
            Merged similarity groups
        """
        if not similarity_groups:
            return similarity_groups

        # Calculate average vectors for each group
        group_vectors = self.calculate_group_average_vectors(similarity_groups)

        if len(group_vectors) <= 1:
            return similarity_groups

        # Create similarity matrix between group average vectors
        group_ids = list(group_vectors.keys())
        group_vector_matrix = np.array([group_vectors[group_id] for group_id in group_ids])
        group_similarity_matrix = cosine_similarity(group_vector_matrix)

        logger.warning(f"group_vector_matrix: {group_vector_matrix}")


        # Find groups to merge based on similarity threshold
        merged_groups = {}
        processed_groups = set()
        merge_counter = 0

        for i, group_id_i in enumerate(group_ids):
            if group_id_i in processed_groups:
                continue

            # Find all groups similar to current group
            similar_group_indices = np.where(group_similarity_matrix[i] >= merge_threshold)[0]
            groups_to_merge = [group_ids[j] for j in similar_group_indices]

            if len(groups_to_merge) > 1:  # If we found groups to merge
                # Create new merged group
                merged_group_id = f"merged_group_{merge_counter}"
                merged_nodes = []

                for group_to_merge in groups_to_merge:
                    merged_nodes.extend(similarity_groups[group_to_merge])
                    processed_groups.add(group_to_merge)

                merged_groups[merged_group_id] = merged_nodes
                merge_counter += 1

                logger.info(f"Merged groups {groups_to_merge} into {merged_group_id}")
            else:
                # Keep original group if no merging needed
                merged_groups[group_id_i] = similarity_groups[group_id_i]
                processed_groups.add(group_id_i)

        return merged_groups
    
    # Modified check_similarity function
    def check_similarity(self, threshold: float = None, merge_threshold: float = 0.9, 
                        enable_group_merging: bool = False) -> Dict[str, List[str]]:
        """
        Check similarity between non-leaf nodes and return groups of similar nodes.
        Optionally merge similar groups based on their average feature vectors.

        Args:
            threshold: Similarity threshold for initial grouping (uses instance threshold if None)
            merge_threshold: Threshold for merging groups (default 0.9)
            enable_group_merging: Whether to enable group merging stage

        Returns:
            Dictionary mapping group_id to list of similar node paths (non-leaf nodes only)
        """
        if self.similarity_matrix is None:
            raise ValueError("Must build similarity matrix first")

        if len(self.similarity_matrix) == 0:
            logger.warning("No non-leaf nodes available for similarity comparison")
            return {}

        threshold = threshold or self.similarity_threshold

        # Step 1: Initial grouping based on node similarity
        similarity_groups = self._threshold_based_grouping(threshold)
        logger.info(f"Initial grouping created {len(similarity_groups)} groups")

        # Step 2: Merge similar groups if enabled
        if enable_group_merging and len(similarity_groups) > 1:
            merged_groups = self.merge_similar_groups(similarity_groups, merge_threshold)
            logger.info(f"After merging: {len(merged_groups)} groups remain")
            return merged_groups

        return similarity_groups
    
    def _threshold_based_grouping(self, threshold: float) -> Dict[str, List[str]]:
        """Threshold-based similarity grouping for non-leaf nodes only"""
        groups = {}
        assigned = set()
        group_counter = 0
        
        for i in range(len(self.node_paths)):
            if self.node_paths[i] in assigned:
                continue
                
            # Find all nodes similar to current node
            similar_indices = np.where(self.similarity_matrix[i] >= threshold)[0]
            similar_nodes = [self.node_paths[j] for j in similar_indices if j != i]
            
            if similar_nodes:  # If we found similar nodes
                group_id = f"group_{group_counter}"
                groups[group_id] = [self.node_paths[i]] + similar_nodes
                assigned.update(groups[group_id])
                group_counter += 1
        
        return groups
    
    def add_node_ids_to_json(self, original_data: Dict[str, Any], similarity_groups: Dict[str, List[str]]) -> Dict[str, Any]:
        """
        Add node_id to each node in the original JSON based on similarity groups.
        Only non-leaf nodes get group IDs, leaf nodes get unique IDs.
        
        Args:
            original_data: Original JSON data
            similarity_groups: Groups of similar nodes (non-leaf only)
            
        Returns:
            Modified JSON with node_id added to each node
        """
        # Create mapping from node path to group id
        path_to_group = {}
        for group_id, node_paths in similarity_groups.items():
            for node_path in node_paths:
                path_to_group[node_path] = group_id
        
        # Add node_ids recursively
        modified_data = self._add_node_ids_recursive(original_data, path_to_group)
        
        return modified_data
    
    def _add_node_ids_recursive(self, data: Dict[str, Any], path_to_group: Dict[str, str], path: str = "") -> Dict[str, Any]:
        """Recursively add node_ids to the JSON structure"""
        if isinstance(data, dict):
            result = data.copy()
            
            if 'node' in data:  # This is a node
                current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                
                # Check if this is a leaf node
                is_leaf = len(data.get('children', [])) == 0
                
                if is_leaf:
                    # Leaf nodes get unique IDs
                    result['node_id'] = f"leaf_{hashlib.md5(current_path.encode()).hexdigest()[:8]}"
                else:
                    # Non-leaf nodes get group IDs if they're in a similarity group
                    if current_path in path_to_group:
                        result['node_id'] = path_to_group[current_path]
                    else:
                        # Generate unique ID for non-grouped non-leaf nodes
                        result['node_id'] = f"unique_{hashlib.md5(current_path.encode()).hexdigest()[:8]}"
            
            if 'children' in data:
                child_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                result['children'] = [
                    self._add_node_ids_recursive(child, path_to_group, child_path)
                    for child in data['children']
                ]
            
            return result
        
        return data

    def get_feature_vector_info(self) -> Dict[str, Any]:
        """
        Get information about the feature vectors for debugging/analysis
        
        Returns:
            Dictionary containing feature vector statistics
        """
        if not self.node_feature_vectors:
            return {"error": "No feature vectors extracted yet"}
        
        vector_lengths = [len(v) for v in self.node_feature_vectors.values()]
        leaf_count = sum(1 for metadata in self.node_metadata.values() if metadata['is_leaf'])
        non_leaf_count = len(self.node_metadata) - leaf_count
        
        return {
            "total_nodes": len(self.node_feature_vectors),
            "leaf_nodes": leaf_count,
            "non_leaf_nodes": non_leaf_count,
            "feature_vector_length": vector_lengths[0] if vector_lengths else 0,
            "feature_breakdown": {
                "structural_features": 3,
                "style_features": 12,  # Updated: font_family, font_size, font_style, font_weight + colors
                "content_features": 1,
                "descendant_count": 1,
                "semantic_features": 384 if self.use_semantic_embeddings else 0
            }
        }

    def print_figma_tree_with_vectors(self, node, depth=0, path=""):
        """
        Print the Figma node tree with feature vectors
        
        Args:
            node: The current node to print
            depth: Current depth in the tree
            path: Current path to the node
        """
        indent = "  " * depth  # 2 spaces per level

        # Extract info
        name = node.get("name", "[no name]")
        tag = node.get("tag", "[no tag]")
        node_id = node.get("node_id", "")
        
        # Handle TEXT nodes with characters
        node_data = node.get("node", {})
        characters = node_data.get("characters", "")
        is_text = tag == "TEXT"
        display_name = characters[:10] + "..." if is_text and characters else name

        # Layout info (if present)
        layout = node_data.get("layoutMode", "NONE")
        layout_str = "ROWS" if layout == "HORIZONTAL" else "COLS" if layout == "VERTICAL" else layout

        # Get the current node path
        current_path = f"{path}/{name}" if path else name
        
        # Get feature vector if available
        vector_str = ""
        if current_path in self.node_feature_vectors:
            vector = self.node_feature_vectors[current_path]
            # Format vector to show first few values and some key features
        if len(vector) > 0:
            vector_str = f" | Vector: [{', '.join(f'{v:.7f}' for v in vector)}] (len={len(vector)})"
        else:
            vector_str = " | Vector: Not found"

        # Print current node info with feature vector
        print(f"{indent}- {display_name} [{tag}] -> {name} {layout_str} ({node_id}){vector_str}")

        # Recursively print children
        for child in node.get("children", []):
            self.print_figma_tree_with_vectors(child, depth + 1, current_path)






app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:5123"],  # or ["*"] for dev
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)



class JSONData(BaseModel):
    data: Dict[str, Any]

class SimilarityResponse(BaseModel):
    processed_data: Dict[str, Any]
    similarity_groups: Dict[str, Any]  # Replace with your actual similarity groups type

def rename_names_and_store_mapping(node, counter, mapping):
    """Recursively rename 'name' fields and store mapping"""
    if isinstance(node, dict):
        if "name" in node:
            old_name = node["name"]
            new_name = f"name{counter[0]}"
            mapping[new_name] = old_name
            node["name"] = new_name
            counter[0] += 1
        for key in node:
            rename_names_and_store_mapping(node[key], counter, mapping)
    elif isinstance(node, list):
        for item in node:
            rename_names_and_store_mapping(item, counter, mapping)

def restore_names_from_mapping(node, mapping):
    """Recursively restore original names from mapping"""
    if isinstance(node, dict):
        if "name" in node and node["name"] in mapping:
            node["name"] = mapping[node["name"]]
        for key in node:
            restore_names_from_mapping(node[key], mapping)
    elif isinstance(node, list):
        for item in node:
            restore_names_from_mapping(item, mapping)


        




In [17]:
with open("PAGE_109.json", "r") as f:
    data = json.load(f)

# Rename names and create mapping
name_mapping = {}
counter = [0]
rename_names_and_store_mapping(data, counter, name_mapping)

# Process with similarity detector
detector = FigmaNodeSimilarityDetector(
    similarity_threshold=0.99999999999,
    use_semantic_embeddings=True
)

similarity_matrix = detector.build_similarity_matrix(data)
similarity_groups = detector.check_similarity()
result_json = detector.add_node_ids_to_json(data, similarity_groups)

detector.print_figma_tree_with_vectors(result_json)

# Restore original names
restore_names_from_mapping(result_json, name_mapping)


with open("output.json", "w", encoding="utf-8") as f:
    json.dump(result_json, f, indent=2, ensure_ascii=False)



INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:__main__:Loaded semantic embedding model
INFO:__main__:Initial grouping created 27 groups


- name0 [DIV] -> name0 NONE (unique_ad6f6c09) | Vector: [0.0740000, 1.0000000, 0.0010000, 0.1830000, 0.2130323, 0.4046869, 0.0011407, 0.2533890, 0.0323086, 0.4276959, 0.4132518, 0.4247653, 0.8400166, 0.0031897, 0.0037070, 0.0056898, 0.0219832, 0.5822833, 0.0321000] (len=19)
  - name1 [DIV] -> name1 NONE (unique_917b8603) | Vector: [0.0740000, 1.0000000, 0.0002000, 0.1830000, 0.2130323, 0.1871235, 0.0011073, 0.0807582, 0.0177326, 0.5423792, 0.4901820, 0.4955196, 1.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.2692424, 0.0051000] (len=19)
    - name2 [DIV] -> name2 NONE (unique_a21c5b2f) | Vector: [0.4010000, 1.0000000, 0.0002000, 0.1830000, 0.2073600, 0.2353531, 0.0013926, 0.1015730, 0.0223030, 0.6417431, 0.5012967, 0.3654937, 1.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.3386376, 0.0049000] (len=19)
      - name3 [DIV] -> name3 NONE (unique_831dabf8) | Vector: [0.0740000, 1.0000000, 0.0003000, 0.5160000, 0.0933156, 0.6687483, 0.0015458, 0.2367080, 0.0673559, 0.64664

Fetching SVG from Figma link: https://www.figma.com/design/QH376ySfLFWfeSALVAldfZ/DES2RACT-DATASET?node-id=179-5984&t=p4qCK6QigTqtPSa7-0
Adding group rectangles to SVG...
Generated 1 SVG variants
Saved SVG to my_figma_design_all_groups.svg
SVG processing functions ready to use!


In [28]:
import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw
import json
import random
from typing import Dict, List, Any

def visualize_groups_on_svg(svg_file_path: str, json_data: Dict[str, Any], output_dir: str = "./output"):
    """
    Read SVG image and create separate images for each group with rectangles around components.
    
    Args:
        svg_file_path: Path to the SVG file
        json_data: JSON data containing nodes with coordinates and group_ids
        output_dir: Directory to save output images
    """
    import os
    os.makedirs(output_dir, exist_ok=True)
    
    # Convert SVG to PNG first (you'll need to install cairosvg: pip install cairosvg)
    try:
        import cairosvg
        png_data = cairosvg.svg2png(url=svg_file_path)
        base_image = Image.open(io.BytesIO(png_data))
    except ImportError:
        print("cairosvg not installed. Please install it: pip install cairosvg")
        return
    except Exception as e:
        print(f"Error converting SVG: {e}")
        return
    
    # Extract node coordinates and group information
    node_groups = {}
    
    def extract_nodes_recursive(node, path=""):
        if isinstance(node, dict):
            if 'node' in node:
                current_path = f"{path}/{node.get('name', 'unnamed')}" if path else node.get('name', 'root')
                
                # Get coordinates from absoluteBoundingBox
                abs_box = node.get('node', {}).get('absoluteBoundingBox', {})
                if abs_box and 'node_id' in node:
                    x = abs_box.get('x', 0)
                    y = abs_box.get('y', 0)
                    width = abs_box.get('width', 0)
                    height = abs_box.get('height', 0)
                    
                    group_id = node.get('node_id', 'no_group')
                    
                    if group_id not in node_groups:
                        node_groups[group_id] = []
                    
                    node_groups[group_id].append({
                        'name': node.get('name', 'unnamed'),
                        'x': x,
                        'y': y,
                        'width': width,
                        'height': height
                    })
                
                # Process children
                for child in node.get('children', []):
                    extract_nodes_recursive(child, current_path)
    
    extract_nodes_recursive(json_data)
    
    # Generate random colors for each group
    colors = {}
    for group_id in node_groups.keys():
        colors[group_id] = (
            random.randint(0, 255),
            random.randint(0, 255),
            random.randint(0, 255)
        )
    
    # Create image for each group
    for group_id, nodes in node_groups.items():
        if len(nodes) <= 1:  # Skip groups with only one node
            continue
            
        # Create a copy of the base image
        group_image = base_image.copy()
        draw = ImageDraw.Draw(group_image)
        
        # Draw rectangles around nodes in this group
        color = colors[group_id]
        for node in nodes:
            x, y, width, height = node['x'], node['y'], node['width'], node['height']
            
            # Draw rectangle outline
            draw.rectangle(
                [(x, y), (x + width, y + height)],
                outline=color,
                width=3
            )
            
            # Add group label
            draw.text((x, y - 20), f"{group_id}", fill=color)
        
        # Save the image
        output_path = f"{output_dir}/group_{group_id}.png"
        group_image.save(output_path)
        print(f"Saved: {output_path}")
    
    print(f"Created {len([g for g in node_groups.values() if len(g) > 1])} group visualization images")


# Alternative simpler version without SVG conversion (if you have PNG/JPG)
def visualize_groups_on_image(image_path: str, json_data: Dict[str, Any], output_dir: str = "./output"):
    """
    Simpler version that works with PNG/JPG images directly.
    
    Args:
        image_path: Path to the image file (PNG, JPG, etc.)
        json_data: JSON data containing nodes with coordinates and group_ids
        output_dir: Directory to save output images
    """
    import os
    os.makedirs(output_dir, exist_ok=True)
    
    # Load base image
    base_image = Image.open(image_path)
    
    # Extract node coordinates and group information
    node_groups = {}
    
    def extract_nodes_recursive(node, path=""):
        if isinstance(node, dict):
            if 'node' in node:
                current_path = f"{path}/{node.get('name', 'unnamed')}" if path else node.get('name', 'root')
                
                # Get coordinates from absoluteBoundingBox
                node_info = node.get('node', {})
                x = node_info.get('x', 0)
                y = node_info.get('y', 0)
                width = node_info.get('width', 0)
                height = node_info.get('height', 0)
                
                group_id = node.get('node_id', 'no_group')
                
                if group_id not in node_groups:
                    node_groups[group_id] = []
                
                node_groups[group_id].append({
                    'name': node.get('name', 'unnamed'),
                    'x': x,
                    'y': y,
                    'width': width,
                    'height': height
                })
                
                # Process children
                for child in node.get('children', []):
                    extract_nodes_recursive(child, current_path)
    
    extract_nodes_recursive(json_data)
    
    # Generate colors for each group
    colors = [
        (255, 0, 0),    # Red
        (0, 255, 0),    # Green
        (0, 0, 255),    # Blue
        (255, 255, 0),  # Yellow
        (255, 0, 255),  # Magenta
        (0, 255, 255),  # Cyan
        (255, 165, 0),  # Orange
        (128, 0, 128),  # Purple
    ]
    
    color_index = 0
    group_colors = {}
    
    # Create image for each group
    for group_id, nodes in node_groups.items():
        if len(nodes) <= 1:  # Skip groups with only one node
            continue
            
        # Assign color to group
        if group_id not in group_colors:
            group_colors[group_id] = colors[color_index % len(colors)]
            color_index += 1
        
        # Create a copy of the base image
        group_image = base_image.copy()
        draw = ImageDraw.Draw(group_image)
        
        # Draw rectangles around nodes in this group
        color = group_colors[group_id]
        for node in nodes:
            x, y, width, height = int(node['x']), int(node['y']), int(node['width']), int(node['height'])
            
            # Draw rectangle outline
            draw.rectangle(
                [(x, y), (x + width, y + height)],
                outline=color,
                width=4
            )
            
            # Add group label
            draw.text((x, max(0, y - 25)), f"{group_id}", fill=color)
        
        # Save the image
        output_path = f"{output_dir}/group_{group_id}.png"
        group_image.save(output_path)
        print(f"Saved: {output_path}")
    
    print(f"Total groups found: {len(node_groups)}")
    print(f"Created {len([g for g in node_groups.values() if len(g) > 1])} group visualization images")



with open('output.json', 'r') as f:
    json_data = json.load(f)

# For PNG/JPG images (simpler):
visualize_groups_on_image('PAGE_109.png', json_data)

# For SVG images (requires cairosvg):
# visualize_groups_on_svg('your_image.svg', json_data)

Saved: ./output/group_unique.png
Saved: ./output/group_group_26.png
Saved: ./output/group_group_35.png
Total groups found: 239
Created 3 group visualization images


In [None]:
import json
import numpy as np
from typing import Dict, List, Any, Tuple, Set
from dataclasses import dataclass
import hashlib
import logging
from collections import defaultdict
import re

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class NodeSignature:
    """Represents a compact signature of a node for comparison"""
    tag: str
    node_type: str
    has_text: bool
    text_content: str
    style_hash: str
    layout_mode: str
    children_count: int
    children_signatures: List[str]  # Ordered list of child signatures
    
    def __post_init__(self):
        # Normalize text content for comparison
        self.text_content = self.text_content.strip().lower() if self.text_content else ""
        # Create a signature string for this node
        self.signature = self._create_signature()
    
    def _create_signature(self) -> str:
        """Create a unique signature string for this node"""
        children_sig = "|".join(self.children_signatures)
        return f"{self.tag}:{self.node_type}:{self.has_text}:{self.style_hash}:{self.layout_mode}:{self.children_count}:{children_sig}"

class MinEditDistanceSemanticGrouper:
    """
    Semantic grouper using minimum edit distance approach for tree comparison.
    This approach compares the structural similarity of component trees.
    """
    
    def __init__(self, 
                 similarity_threshold: float = 0.8,
                 structure_weight: float = 0.6,
                 style_weight: float = 0.3,
                 content_weight: float = 0.1,
                 ignore_text_content: bool = False):
        """
        Initialize the min edit distance grouper
        
        Args:
            similarity_threshold: Threshold for considering nodes similar (0-1)
            structure_weight: Weight for structural similarity
            style_weight: Weight for style similarity  
            content_weight: Weight for content similarity
            ignore_text_content: Whether to ignore text content in comparison
        """
        self.similarity_threshold = similarity_threshold
        self.structure_weight = structure_weight
        self.style_weight = style_weight
        self.content_weight = content_weight
        self.ignore_text_content = ignore_text_content
        
        self.node_signatures = {}  # path -> NodeSignature
        self.node_trees = {}  # path -> tree structure
        self.similarity_matrix = None
        self.non_leaf_paths = []
        
    def extract_node_signature(self, node_data: Dict[str, Any]) -> NodeSignature:
        """Extract signature from a node"""
        node_info = node_data.get('node', {})
        tag = node_data.get('tag', '')
        name = node_data.get('name', '')
        
        # Check if tag is ICON or SVG - if so, return default signature
        if tag in ["ICON", "SVG"]:
            return NodeSignature(
                tag=tag,
                node_type="ICON_SVG",
                has_text=False,
                text_content="",
                style_hash="icon_svg_default",
                layout_mode="NONE",
                children_count=0,
                children_signatures=[]
            )
        
        # Basic properties
        node_type = node_info.get('type', '')
        has_text = bool(node_info.get('characters', ''))
        text_content = node_info.get('characters', '') if not self.ignore_text_content else ""
        layout_mode = node_info.get('layoutMode', 'NONE')
        children_count = len(node_data.get('children', []))
        
        # Style hash (simplified - you can expand this)
        style_hash = self._compute_style_hash(node_info)
        
        # Children signatures will be filled later
        children_signatures = []
        
        return NodeSignature(
            tag=tag,
            node_type=node_type,
            has_text=has_text,
            text_content=text_content,
            style_hash=style_hash,
            layout_mode=layout_mode,
            children_count=children_count,
            children_signatures=children_signatures
        )
    
    def _compute_style_hash(self, node_info: Dict[str, Any]) -> str:
        """Compute a hash representing the style properties of a node"""
        style_props = []
        
        # Text style
        text_style = node_info.get('textStyle', {})
        if text_style:
            style_props.extend([
                text_style.get('fontFamily', ''),
                str(text_style.get('fontSize', 0)),
                text_style.get('fontStyle', ''),
                str(text_style.get('fontWeight', 0))
            ])
        
        # Fill colors
        fills = node_info.get('fills', [])
        if fills:
            fill = fills[0]
            color = fill.get('color', {})
            style_props.extend([
                str(color.get('r', 0)),
                str(color.get('g', 0)),
                str(color.get('b', 0)),
                str(color.get('a', 1))
            ])
        
        # Stroke colors
        strokes = node_info.get('strokes', [])
        if strokes:
            stroke = strokes[0]
            color = stroke.get('color', {})
            style_props.extend([
                str(color.get('r', 0)),
                str(color.get('g', 0)),
                str(color.get('b', 0)),
                str(color.get('a', 1))
            ])
        
        # Layout properties
        style_props.extend([
            str(node_info.get('layoutMode', 'NONE')),
            str(node_info.get('paddingLeft', 0)),
            str(node_info.get('paddingRight', 0)),
            str(node_info.get('paddingTop', 0)),
            str(node_info.get('paddingBottom', 0)),
            str(node_info.get('itemSpacing', 0))
        ])
        
        style_string = "|".join(style_props)
        return hashlib.md5(style_string.encode()).hexdigest()[:8]
    
    def build_node_signatures(self, data: Dict[str, Any], path: str = "") -> None:
        """Build node signatures recursively"""
        if isinstance(data, dict) and 'node' in data:
            current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
            
            # Extract signature for current node
            signature = self.extract_node_signature(data)
            
            # Check if this is an ICON or SVG node
            tag = data.get('tag', '')
            if tag in ["ICON", "SVG"]:
                # For ICON/SVG nodes, don't process children and store as leaf
                signature.children_signatures = []
                signature.signature = signature._create_signature()
                self.node_signatures[current_path] = signature
                
                # Store tree structure as leaf node
                self.node_trees[current_path] = {
                    'data': data,
                    'children': [],
                    'is_leaf': True,
                    'is_icon_svg': True
                }
                return
            
            # Process children first to get their signatures
            children_signatures = []
            if 'children' in data:
                for child in data['children']:
                    child_path = f"{current_path}"
                    self.build_node_signatures(child, child_path)
                    child_node_path = f"{current_path}/{child.get('name', 'unnamed')}"
                    if child_node_path in self.node_signatures:
                        children_signatures.append(self.node_signatures[child_node_path].signature)
            
            # Update signature with children signatures
            signature.children_signatures = children_signatures
            signature.signature = signature._create_signature()
            
            self.node_signatures[current_path] = signature
            
            # Store tree structure
            self.node_trees[current_path] = {
                'data': data,
                'children': [f"{current_path}/{child.get('name', 'unnamed')}" for child in data.get('children', [])],
                'is_leaf': len(data.get('children', [])) == 0,
                'is_icon_svg': False
            }
    
    def tree_edit_distance(self, tree1_path: str, tree2_path: str) -> float:
        """
        Calculate tree edit distance between two trees
        Returns normalized distance (0 = identical, 1 = completely different)
        """
        if tree1_path not in self.node_signatures or tree2_path not in self.node_signatures:
            return 1.0
        
        sig1 = self.node_signatures[tree1_path]
        sig2 = self.node_signatures[tree2_path]
        
        # Special handling for ICON/SVG nodes
        tree1_info = self.node_trees.get(tree1_path, {})
        tree2_info = self.node_trees.get(tree2_path, {})
        
        # If both are ICON/SVG nodes, they are considered identical
        if (tree1_info.get('is_icon_svg', False) and tree2_info.get('is_icon_svg', False)):
            return 0.0
        
        # If only one is ICON/SVG, they are considered very different
        if (tree1_info.get('is_icon_svg', False) or tree2_info.get('is_icon_svg', False)):
            return 1.0
        
        # Use dynamic programming to calculate edit distance
        return self._tree_edit_distance_dp(sig1, sig2)
    
    def _tree_edit_distance_dp(self, sig1: NodeSignature, sig2: NodeSignature) -> float:
        """Dynamic programming implementation of tree edit distance"""
        
        # Node-level similarity
        node_similarity = self._node_similarity(sig1, sig2)
        
        # If nodes are very different, return high distance
        if node_similarity < 0.1:
            return 1.0
        
        # Structure similarity based on children
        children1 = sig1.children_signatures
        children2 = sig2.children_signatures
        
        if not children1 and not children2:
            # Both are leaf nodes, return distance based on node similarity
            return 1.0 - node_similarity
        
        if not children1 or not children2:
            # One is leaf, other is not
            structure_penalty = 0.5
            return 1.0 - (node_similarity * (1.0 - structure_penalty))
        
        # Both have children - calculate edit distance on children sequences
        children_distance = self._sequence_edit_distance(children1, children2)
        
        # Combine node similarity and children structure similarity
        total_similarity = (
            self.structure_weight * (1.0 - children_distance) +
            (self.style_weight + self.content_weight) * node_similarity
        )
        
        return 1.0 - total_similarity
    
    def _node_similarity(self, sig1: NodeSignature, sig2: NodeSignature) -> float:
        """Calculate similarity between two node signatures"""
        # Special handling for ICON/SVG nodes
        if sig1.tag in ["ICON", "SVG"] and sig2.tag in ["ICON", "SVG"]:
            return 1.0  # All ICON/SVG nodes are considered similar
        
        if sig1.tag in ["ICON", "SVG"] or sig2.tag in ["ICON", "SVG"]:
            return 0.0  # ICON/SVG nodes are not similar to other node types
        
        similarities = []
        
        # Tag and type similarity
        tag_sim = 1.0 if sig1.tag == sig2.tag else 0.0
        type_sim = 1.0 if sig1.node_type == sig2.node_type else 0.0
        
        # Layout similarity
        layout_sim = 1.0 if sig1.layout_mode == sig2.layout_mode else 0.0
        
        # Style similarity
        style_sim = 1.0 if sig1.style_hash == sig2.style_hash else 0.0
        
        # Content similarity
        content_sim = 1.0
        if not self.ignore_text_content:
            if sig1.has_text and sig2.has_text:
                content_sim = self._text_similarity(sig1.text_content, sig2.text_content)
            elif sig1.has_text != sig2.has_text:
                content_sim = 0.0
        
        # Children count similarity
        max_children = max(sig1.children_count, sig2.children_count)
        children_count_sim = 1.0 - abs(sig1.children_count - sig2.children_count) / max(max_children, 1)
        
        # Weighted combination
        total_similarity = (
            self.structure_weight * (tag_sim * 0.4 + type_sim * 0.3 + layout_sim * 0.3) +
            self.style_weight * style_sim +
            self.content_weight * content_sim
        ) * children_count_sim
        
        return total_similarity
    
    def _text_similarity(self, text1: str, text2: str) -> float:
        """Calculate text similarity using simple metrics"""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        
        # Exact match
        if text1 == text2:
            return 1.0
        
        # Length similarity
        max_len = max(len(text1), len(text2))
        len_sim = 1.0 - abs(len(text1) - len(text2)) / max_len
        
        # Common characters ratio
        common_chars = len(set(text1) & set(text2))
        total_chars = len(set(text1) | set(text2))
        char_sim = common_chars / max(total_chars, 1)
        
        return (len_sim + char_sim) / 2.0
    
    def _sequence_edit_distance(self, seq1: List[str], seq2: List[str]) -> float:
        """Calculate normalized edit distance between two sequences"""
        if not seq1 and not seq2:
            return 0.0
        
        if not seq1 or not seq2:
            return 1.0
        
        # Dynamic programming for edit distance
        m, n = len(seq1), len(seq2)
        dp = [[0] * (n + 1) for _ in range(m + 1)]
        
        # Initialize base cases
        for i in range(m + 1):
            dp[i][0] = i
        for j in range(n + 1):
            dp[0][j] = j
        
        # Fill the dp table
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if seq1[i-1] == seq2[j-1]:
                    dp[i][j] = dp[i-1][j-1]
                else:
                    dp[i][j] = 1 + min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1])
        
        # Normalize by maximum possible distance
        max_distance = max(m, n)
        return dp[m][n] / max_distance if max_distance > 0 else 0.0
    
    def build_similarity_matrix(self, nodes_data: Dict[str, Any]) -> np.ndarray:
        """Build similarity matrix using min edit distance approach"""
        
        # Step 1: Build node signatures
        self.build_node_signatures(nodes_data)
        
        # Step 2: Filter non-leaf nodes for comparison (excluding ICON/SVG nodes)
        self.non_leaf_paths = [
            path for path, tree_info in self.node_trees.items() 
            if not tree_info['is_leaf'] and not tree_info.get('is_icon_svg', False)
        ]
        
        if not self.non_leaf_paths:
            logger.warning("No non-leaf nodes found for similarity comparison")
            return np.array([])
        
        # Step 3: Calculate similarity matrix
        n = len(self.non_leaf_paths)
        similarity_matrix = np.zeros((n, n))
        
        for i in range(n):
            for j in range(n):
                if i == j:
                    similarity_matrix[i][j] = 1.0
                else:
                    # Calculate edit distance and convert to similarity
                    edit_distance = self.tree_edit_distance(
                        self.non_leaf_paths[i], 
                        self.non_leaf_paths[j]
                    )
                    similarity_matrix[i][j] = 1.0 - edit_distance
        
        self.similarity_matrix = similarity_matrix
        return similarity_matrix
    
    def check_similarity(self, threshold: float = None) -> Dict[str, List[str]]:
        """Find groups of similar nodes based on edit distance"""
        if self.similarity_matrix is None:
            raise ValueError("Must build similarity matrix first")
        
        if len(self.similarity_matrix) == 0:
            logger.warning("No non-leaf nodes available for similarity comparison")
            return {}
        
        threshold = threshold or self.similarity_threshold
        
        # Use connected components to find similarity groups
        return self._find_similarity_groups(threshold)
    
    def _find_similarity_groups(self, threshold: float) -> Dict[str, List[str]]:
        """Find connected components of similar nodes"""
        n = len(self.non_leaf_paths)
        visited = [False] * n
        groups = {}
        group_counter = 0
        
        def dfs(node_idx: int, group: List[int]):
            visited[node_idx] = True
            group.append(node_idx)
            
            for j in range(n):
                if not visited[j] and self.similarity_matrix[node_idx][j] >= threshold:
                    dfs(j, group)
        
        for i in range(n):
            if not visited[i]:
                group = []
                dfs(i, group)
                
                if len(group) > 1:  # Only create groups with multiple nodes
                    group_id = f"group_{group_counter}"
                    groups[group_id] = [self.non_leaf_paths[idx] for idx in group]
                    group_counter += 1
        
        return groups
    
    def add_node_ids_to_json(self, original_data: Dict[str, Any], similarity_groups: Dict[str, List[str]]) -> Dict[str, Any]:
        """Add node_id to each node based on similarity groups"""
        # Create mapping from node path to group id
        path_to_group = {}
        for group_id, node_paths in similarity_groups.items():
            for node_path in node_paths:
                path_to_group[node_path] = group_id
        
        # Add node_ids recursively
        return self._add_node_ids_recursive(original_data, path_to_group)
    
    def _add_node_ids_recursive(self, data: Dict[str, Any], path_to_group: Dict[str, str], path: str = "") -> Dict[str, Any]:
        """Recursively add node_ids to the JSON structure"""
        if isinstance(data, dict):
            result = data.copy()
            
            if 'node' in data:  # This is a node
                current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                
                # Check if this is a leaf node or ICON/SVG node
                tag = data.get('tag', '')
                is_leaf = len(data.get('children', [])) == 0
                is_icon_svg = tag in ["ICON", "SVG"]
                
                if is_leaf or is_icon_svg:
                    # Leaf nodes and ICON/SVG nodes get unique IDs
                    if is_icon_svg:
                        result['node_id'] = f"icon_svg_{hashlib.md5(current_path.encode()).hexdigest()[:8]}"
                    else:
                        result['node_id'] = f"leaf_{hashlib.md5(current_path.encode()).hexdigest()[:8]}"
                else:
                    # Non-leaf nodes get group IDs if they're in a similarity group
                    if current_path in path_to_group:
                        result['node_id'] = path_to_group[current_path]
                    else:
                        # Generate unique ID for non-grouped non-leaf nodes
                        result['node_id'] = f"unique_{hashlib.md5(current_path.encode()).hexdigest()[:8]}"
            
            if 'children' in data and not data.get('tag', '') in ["ICON", "SVG"]:
                # Don't process children for ICON/SVG nodes
                child_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                result['children'] = [
                    self._add_node_ids_recursive(child, path_to_group, child_path)
                    for child in data['children']
                ]
            
            return result
        
        return data
    
    def get_analysis_info(self) -> Dict[str, Any]:
        """Get analysis information for debugging"""
        if not self.node_signatures:
            return {"error": "No analysis performed yet"}
        
        leaf_count = sum(1 for tree_info in self.node_trees.values() if tree_info['is_leaf'])
        non_leaf_count = len(self.node_trees) - leaf_count
        icon_svg_count = sum(1 for tree_info in self.node_trees.values() if tree_info.get('is_icon_svg', False))
        
        return {
            "total_nodes": len(self.node_signatures),
            "leaf_nodes": leaf_count,
            "non_leaf_nodes": non_leaf_count,
            "icon_svg_nodes": icon_svg_count,
            "comparison_approach": "minimum_edit_distance",
            "weights": {
                "structure": self.structure_weight,
                "style": self.style_weight,
                "content": self.content_weight
            },
            "ignore_text_content": self.ignore_text_content,
            "icon_svg_handling": "treated_as_identical_leaf_nodes"
        }




# Example usage and testing
def rename_names_and_store_mapping(node, counter, mapping):
    """Utility function to rename nodes for testing"""
    if isinstance(node, dict):
        if "name" in node:
            old_name = node["name"]
            new_name = f"name{counter[0]}"
            mapping[new_name] = old_name
            node["name"] = new_name
            counter[0] += 1
        for key in node:
            rename_names_and_store_mapping(node[key], counter, mapping)
    elif isinstance(node, list):
        for item in node:
            rename_names_and_store_mapping(item, counter, mapping)

# Main execution example
if __name__ == "__main__":
    # Load and preprocess data
    with open("PAGE_109.json", "r") as f:
        data = json.load(f)
    
    # Rename names and save mapping
    name_mapping = {}
    rename_names_and_store_mapping(data, [0], name_mapping)
    
    # Save modified JSON and mapping
    with open("modified_min_edit.json", "w") as f:
        json.dump(data, f, indent=2)
    
    with open("name_mapping_min_edit.json", "w") as f:
        json.dump(name_mapping, f, indent=2)
    
    # Initialize the min edit distance grouper
    grouper = MinEditDistanceSemanticGrouper(
        similarity_threshold=0.99999,  # Higher threshold for more precise grouping
        structure_weight=0.6,
        style_weight=0.3,
        content_weight=0.1,
        ignore_text_content=False
    )
    
    # Build similarity matrix
    similarity_matrix = grouper.build_similarity_matrix(data)
    
    # Find similar groups
    similarity_groups = grouper.check_similarity()
    print("Similarity groups found:", similarity_groups)
    
    # Add node_ids to original JSON
    result_json = grouper.add_node_ids_to_json(data, similarity_groups)
    
    # Save result
    with open("result_min_edit.json", "w") as f:
        json.dump(result_json, f, indent=2)
    
    # Print analysis info
    analysis = grouper.get_analysis_info()
    print("Analysis info:", json.dumps(analysis, indent=2))

Similarity groups found: {'group_0': ['name0/name1/name2/name3/name5/name6', 'name0/name1/name2/name3/name5/name8', 'name0/name1/name2/name3/name5/name10', 'name0/name1/name2/name3/name5/name12'], 'group_1': ['name0/name53/name54/name58', 'name0/name83/name89/name93'], 'group_2': ['name0/name53/name54', 'name0/name83/name89'], 'group_3': ['name0/name53/name62/name63', 'name0/name53/name62/name72'], 'group_4': ['name0/name97', 'name0/name196'], 'group_5': ['name0/name101/name102/name114/name115/name116/name118', 'name0/name101/name149/name161/name162/name163/name165'], 'group_6': ['name0/name101/name102/name114/name115/name116', 'name0/name101/name149/name161/name162/name163'], 'group_7': ['name0/name101/name102/name114/name115/name124', 'name0/name101/name149/name161/name162/name171'], 'group_8': ['name0/name101/name102/name114/name115', 'name0/name101/name149/name161/name162'], 'group_9': ['name0/name101/name102/name114/name129', 'name0/name101/name149/name161/name176'], 'group_10': [

In [21]:
import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw
import json
import random
from typing import Dict, List, Any

def visualize_groups_on_svg(svg_file_path: str, json_data: Dict[str, Any], output_dir: str = "./output"):
    """
    Read SVG image and create separate images for each group with rectangles around components.
    
    Args:
        svg_file_path: Path to the SVG file
        json_data: JSON data containing nodes with coordinates and group_ids
        output_dir: Directory to save output images
    """
    import os
    os.makedirs(output_dir, exist_ok=True)
    
    # Convert SVG to PNG first (you'll need to install cairosvg: pip install cairosvg)
    try:
        import cairosvg
        png_data = cairosvg.svg2png(url=svg_file_path)
        base_image = Image.open(io.BytesIO(png_data))
    except ImportError:
        print("cairosvg not installed. Please install it: pip install cairosvg")
        return
    except Exception as e:
        print(f"Error converting SVG: {e}")
        return
    
    # Extract node coordinates and group information
    node_groups = {}
    
    def extract_nodes_recursive(node, path=""):
        if isinstance(node, dict):
            if 'node' in node:
                current_path = f"{path}/{node.get('name', 'unnamed')}" if path else node.get('name', 'root')
                
                # Get coordinates from absoluteBoundingBox
                abs_box = node.get('node', {}).get('absoluteBoundingBox', {})
                if abs_box and 'node_id' in node:
                    x = abs_box.get('x', 0)
                    y = abs_box.get('y', 0)
                    width = abs_box.get('width', 0)
                    height = abs_box.get('height', 0)
                    
                    group_id = node.get('node_id', 'no_group')
                    
                    if group_id not in node_groups:
                        node_groups[group_id] = []
                    
                    node_groups[group_id].append({
                        'name': node.get('name', 'unnamed'),
                        'x': x,
                        'y': y,
                        'width': width,
                        'height': height
                    })
                
                # Process children
                for child in node.get('children', []):
                    extract_nodes_recursive(child, current_path)
    
    extract_nodes_recursive(json_data)
    
    # Generate random colors for each group
    colors = {}
    for group_id in node_groups.keys():
        colors[group_id] = (
            random.randint(0, 255),
            random.randint(0, 255),
            random.randint(0, 255)
        )
    
    # Create image for each group
    for group_id, nodes in node_groups.items():
        if len(nodes) <= 1:  # Skip groups with only one node
            continue
            
        # Create a copy of the base image
        group_image = base_image.copy()
        draw = ImageDraw.Draw(group_image)
        
        # Draw rectangles around nodes in this group
        color = colors[group_id]
        for node in nodes:
            x, y, width, height = node['x'], node['y'], node['width'], node['height']
            
            # Draw rectangle outline
            draw.rectangle(
                [(x, y), (x + width, y + height)],
                outline=color,
                width=3
            )
            
            # Add group label
            draw.text((x, y - 20), f"{group_id}", fill=color)
        
        # Save the image
        output_path = f"{output_dir}/group_{group_id}.png"
        group_image.save(output_path)
        print(f"Saved: {output_path}")
    
    print(f"Created {len([g for g in node_groups.values() if len(g) > 1])} group visualization images")


# Alternative simpler version without SVG conversion (if you have PNG/JPG)
def visualize_groups_on_image(image_path: str, json_data: Dict[str, Any], output_dir: str = "./output"):
    """
    Simpler version that works with PNG/JPG images directly.
    
    Args:
        image_path: Path to the image file (PNG, JPG, etc.)
        json_data: JSON data containing nodes with coordinates and group_ids
        output_dir: Directory to save output images
    """
    import os
    os.makedirs(output_dir, exist_ok=True)
    
    # Load base image
    base_image = Image.open(image_path)
    
    # Extract node coordinates and group information
    node_groups = {}
    
    def extract_nodes_recursive(node, path=""):
        if isinstance(node, dict):
            if 'node' in node:
                current_path = f"{path}/{node.get('name', 'unnamed')}" if path else node.get('name', 'root')
                
                # Get coordinates from absoluteBoundingBox
                node_info = node.get('node', {})
                x = node_info.get('x', 0)
                y = node_info.get('y', 0)
                width = node_info.get('width', 0)
                height = node_info.get('height', 0)
                
                group_id = node.get('node_id', 'no_group')
                
                if group_id not in node_groups:
                    node_groups[group_id] = []
                
                node_groups[group_id].append({
                    'name': node.get('name', 'unnamed'),
                    'x': x,
                    'y': y,
                    'width': width,
                    'height': height
                })
                
                # Process children
                for child in node.get('children', []):
                    extract_nodes_recursive(child, current_path)
    
    extract_nodes_recursive(json_data)
    
    # Generate colors for each group
    colors = [
        (255, 0, 0),    # Red
        (0, 255, 0),    # Green
        (0, 0, 255),    # Blue
        (255, 255, 0),  # Yellow
        (255, 0, 255),  # Magenta
        (0, 255, 255),  # Cyan
        (255, 165, 0),  # Orange
        (128, 0, 128),  # Purple
    ]
    
    color_index = 0
    group_colors = {}
    
    # Create image for each group
    for group_id, nodes in node_groups.items():
        if len(nodes) <= 1:  # Skip groups with only one node
            continue
            
        # Assign color to group
        if group_id not in group_colors:
            group_colors[group_id] = colors[color_index % len(colors)]
            color_index += 1
        
        # Create a copy of the base image
        group_image = base_image.copy()
        draw = ImageDraw.Draw(group_image)
        
        # Draw rectangles around nodes in this group
        color = group_colors[group_id]
        for node in nodes:
            x, y, width, height = int(node['x']), int(node['y']), int(node['width']), int(node['height'])
            
            # Draw rectangle outline
            draw.rectangle(
                [(x, y), (x + width, y + height)],
                outline=color,
                width=4
            )
            
            # Add group label
            draw.text((x, max(0, y - 25)), f"{group_id}", fill=color)
        
        # Save the image
        output_path = f"{output_dir}/{group_id}.png"
        group_image.save(output_path)
        print(f"Saved: {output_path}")
    
    print(f"Total groups found: {len(node_groups)}")
    print(f"Created {len([g for g in node_groups.values() if len(g) > 1])} group visualization images")



with open('result_min_edit.json', 'r') as f:
    json_data = json.load(f)

# For PNG/JPG images (simpler):
visualize_groups_on_image('PAGE_109.png', json_data)

# For SVG images (requires cairosvg):
# visualize_groups_on_svg('your_image.svg', json_data)

Saved: ./output/group_0.png
Saved: ./output/no_group.png
Saved: ./output/group_2.png
Saved: ./output/group_1.png
Saved: ./output/group_3.png
Saved: ./output/group_4.png
Saved: ./output/group_14.png
Saved: ./output/group_13.png
Saved: ./output/group_8.png
Saved: ./output/group_6.png
Saved: ./output/group_5.png
Saved: ./output/group_7.png
Saved: ./output/group_9.png
Saved: ./output/group_11.png
Saved: ./output/group_10.png
Saved: ./output/group_12.png
Saved: ./output/group_18.png
Saved: ./output/group_16.png
Saved: ./output/group_15.png
Saved: ./output/group_17.png
Saved: ./output/group_19.png
Saved: ./output/group_22.png
Saved: ./output/group_21.png
Saved: ./output/group_20.png
Saved: ./output/group_23.png
Saved: ./output/group_24.png
Saved: ./output/group_25.png
Total groups found: 215
Created 27 group visualization images


In [9]:
import json
import numpy as np
from typing import Dict, List, Any, Tuple, Set
from dataclasses import dataclass
import hashlib
import logging
from collections import defaultdict
import re

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class NodeSignature:
    """Represents a compact signature of a node for comparison"""
    tag: str
    node_type: str
    has_text: bool
    text_content: str
    style_hash: str
    layout_mode: str
    children_count: int
    children_signatures: List[str]  # Ordered list of child signatures
    
    def __post_init__(self):
        # Normalize text content for comparison
        self.text_content = self.text_content.strip().lower() if self.text_content else ""
        # Create a signature string for this node
        self.signature = self._create_signature()
    
    def _create_signature(self) -> str:
        """Create a unique signature string for this node"""
        children_sig = "|".join(self.children_signatures)
        return f"{self.tag}:{self.node_type}:{self.has_text}:{self.style_hash}:{self.layout_mode}:{self.children_count}:{children_sig}"

class HierarchicalSemanticGrouper:
    """
    Advanced semantic grouper that solves the hierarchical grouping problem.
    Uses multi-level analysis to find the most meaningful component groupings.
    """
    
    def __init__(self, 
                 similarity_threshold: float = 0.8,
                 structure_weight: float = 0.6,
                 style_weight: float = 0.3,
                 content_weight: float = 0.1,
                 ignore_text_content: bool = False,
                 min_group_size: int = 2,
                 max_depth_difference: int = 2):
        """
        Initialize the hierarchical semantic grouper
        
        Args:
            similarity_threshold: Threshold for considering nodes similar (0-1)
            structure_weight: Weight for structural similarity
            style_weight: Weight for style similarity  
            content_weight: Weight for content similarity
            ignore_text_content: Whether to ignore text content in comparison
            min_group_size: Minimum number of nodes to form a group
            max_depth_difference: Maximum depth difference for nodes to be grouped
        """
        self.similarity_threshold = similarity_threshold
        self.structure_weight = structure_weight
        self.style_weight = style_weight
        self.content_weight = content_weight
        self.ignore_text_content = ignore_text_content
        self.min_group_size = min_group_size
        self.max_depth_difference = max_depth_difference
        
        self.node_signatures = {}  # path -> NodeSignature
        self.node_trees = {}  # path -> tree structure
        self.node_depths = {}  # path -> depth in tree
        self.node_parents = {}  # path -> parent path
        self.similarity_matrix = None
        self.non_leaf_paths = []
        self.all_groups = {}  # All potential groups before filtering
        self.final_groups = {}  # Final groups after hierarchical filtering
        
    def extract_node_signature(self, node_data: Dict[str, Any]) -> NodeSignature:
        """Extract signature from a node"""
        node_info = node_data.get('node', {})
        tag = node_data.get('tag', '')
        name = node_data.get('name', '')
        
        # Basic properties
        node_type = node_info.get('type', '')
        has_text = bool(node_info.get('characters', ''))
        text_content = node_info.get('characters', '') if not self.ignore_text_content else ""
        layout_mode = node_info.get('layoutMode', 'NONE')
        children_count = len(node_data.get('children', []))
        
        # Style hash (simplified - you can expand this)
        style_hash = self._compute_style_hash(node_info)
        
        # Children signatures will be filled later
        children_signatures = []
        
        return NodeSignature(
            tag=tag,
            node_type=node_type,
            has_text=has_text,
            text_content=text_content,
            style_hash=style_hash,
            layout_mode=layout_mode,
            children_count=children_count,
            children_signatures=children_signatures
        )
    
    def _compute_style_hash(self, node_info: Dict[str, Any]) -> str:
        """Compute a hash representing the style properties of a node"""
        style_props = []
        
        # Text style
        text_style = node_info.get('textStyle', {})
        if text_style:
            style_props.extend([
                text_style.get('fontFamily', ''),
                str(text_style.get('fontSize', 0)),
                text_style.get('fontStyle', ''),
                str(text_style.get('fontWeight', 0))
            ])
        
        # Fill colors
        fills = node_info.get('fills', [])
        if fills:
            fill = fills[0]
            color = fill.get('color', {})
            style_props.extend([
                str(color.get('r', 0)),
                str(color.get('g', 0)),
                str(color.get('b', 0)),
                str(color.get('a', 1))
            ])
        
        # Stroke colors
        strokes = node_info.get('strokes', [])
        if strokes:
            stroke = strokes[0]
            color = stroke.get('color', {})
            style_props.extend([
                str(color.get('r', 0)),
                str(color.get('g', 0)),
                str(color.get('b', 0)),
                str(color.get('a', 1))
            ])
        
        # Layout properties
        style_props.extend([
            str(node_info.get('layoutMode', 'NONE')),
            str(node_info.get('paddingLeft', 0)),
            str(node_info.get('paddingRight', 0)),
            str(node_info.get('paddingTop', 0)),
            str(node_info.get('paddingBottom', 0)),
            str(node_info.get('itemSpacing', 0))
        ])
        
        style_string = "|".join(style_props)
        return hashlib.md5(style_string.encode()).hexdigest()[:8]
    
    def build_node_signatures(self, data: Dict[str, Any], path: str = "", depth: int = 0, parent_path: str = None) -> None:
        """Build node signatures recursively with depth and parent tracking"""
        if isinstance(data, dict) and 'node' in data:
            current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
            
            # Track depth and parent
            self.node_depths[current_path] = depth
            if parent_path:
                self.node_parents[current_path] = parent_path
            
            # Extract signature for current node
            signature = self.extract_node_signature(data)
            
            # Process children first to get their signatures
            children_signatures = []
            if 'children' in data:
                for child in data['children']:
                    child_path = f"{current_path}"
                    self.build_node_signatures(child, child_path, depth + 1, current_path)
                    child_node_path = f"{current_path}/{child.get('name', 'unnamed')}"
                    if child_node_path in self.node_signatures:
                        children_signatures.append(self.node_signatures[child_node_path].signature)
            
            # Update signature with children signatures
            signature.children_signatures = children_signatures
            signature.signature = signature._create_signature()
            
            self.node_signatures[current_path] = signature
            
            # Store tree structure
            self.node_trees[current_path] = {
                'data': data,
                'children': [f"{current_path}/{child.get('name', 'unnamed')}" for child in data.get('children', [])],
                'is_leaf': len(data.get('children', [])) == 0
            }
    
    def tree_edit_distance(self, tree1_path: str, tree2_path: str) -> float:
        """
        Calculate tree edit distance between two trees
        Returns normalized distance (0 = identical, 1 = completely different)
        """
        if tree1_path not in self.node_signatures or tree2_path not in self.node_signatures:
            return 1.0
        
        sig1 = self.node_signatures[tree1_path]
        sig2 = self.node_signatures[tree2_path]
        
        # Use dynamic programming to calculate edit distance
        return self._tree_edit_distance_dp(sig1, sig2)
    
    def _tree_edit_distance_dp(self, sig1: NodeSignature, sig2: NodeSignature) -> float:
        """Dynamic programming implementation of tree edit distance"""
        
        # Node-level similarity
        node_similarity = self._node_similarity(sig1, sig2)
        
        # If nodes are very different, return high distance
        if node_similarity < 0.1:
            return 1.0
        
        # Structure similarity based on children
        children1 = sig1.children_signatures
        children2 = sig2.children_signatures
        
        if not children1 and not children2:
            # Both are leaf nodes, return distance based on node similarity
            return 1.0 - node_similarity
        
        if not children1 or not children2:
            # One is leaf, other is not
            structure_penalty = 0.5
            return 1.0 - (node_similarity * (1.0 - structure_penalty))
        
        # Both have children - calculate edit distance on children sequences
        children_distance = self._sequence_edit_distance(children1, children2)
        
        # Combine node similarity and children structure similarity
        total_similarity = (
            self.structure_weight * (1.0 - children_distance) +
            (self.style_weight + self.content_weight) * node_similarity
        )
        
        return 1.0 - total_similarity
    
    def _node_similarity(self, sig1: NodeSignature, sig2: NodeSignature) -> float:
        """Calculate similarity between two node signatures"""
        similarities = []
        
        # Tag and type similarity
        tag_sim = 1.0 if sig1.tag == sig2.tag else 0.0
        type_sim = 1.0 if sig1.node_type == sig2.node_type else 0.0
        
        # Layout similarity
        layout_sim = 1.0 if sig1.layout_mode == sig2.layout_mode else 0.0
        
        # Style similarity
        style_sim = 1.0 if sig1.style_hash == sig2.style_hash else 0.0
        
        # Content similarity
        content_sim = 1.0
        if not self.ignore_text_content:
            if sig1.has_text and sig2.has_text:
                content_sim = self._text_similarity(sig1.text_content, sig2.text_content)
            elif sig1.has_text != sig2.has_text:
                content_sim = 0.0
        
        # Children count similarity
        max_children = max(sig1.children_count, sig2.children_count)
        children_count_sim = 1.0 - abs(sig1.children_count - sig2.children_count) / max(max_children, 1)
        
        # Weighted combination
        total_similarity = (
            self.structure_weight * (tag_sim * 0.4 + type_sim * 0.3 + layout_sim * 0.3) +
            self.style_weight * style_sim +
            self.content_weight * content_sim
        ) * children_count_sim
        
        return total_similarity
    
    def _text_similarity(self, text1: str, text2: str) -> float:
        """Calculate text similarity using simple metrics"""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        
        # Exact match
        if text1 == text2:
            return 1.0
        
        # Length similarity
        max_len = max(len(text1), len(text2))
        len_sim = 1.0 - abs(len(text1) - len(text2)) / max_len
        
        # Common characters ratio
        common_chars = len(set(text1) & set(text2))
        total_chars = len(set(text1) | set(text2))
        char_sim = common_chars / max(total_chars, 1)
        
        return (len_sim + char_sim) / 2.0
    
    def _sequence_edit_distance(self, seq1: List[str], seq2: List[str]) -> float:
        """Calculate normalized edit distance between two sequences"""
        if not seq1 and not seq2:
            return 0.0
        
        if not seq1 or not seq2:
            return 1.0
        
        # Dynamic programming for edit distance
        m, n = len(seq1), len(seq2)
        dp = [[0] * (n + 1) for _ in range(m + 1)]
        
        # Initialize base cases
        for i in range(m + 1):
            dp[i][0] = i
        for j in range(n + 1):
            dp[0][j] = j
        
        # Fill the dp table
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if seq1[i-1] == seq2[j-1]:
                    dp[i][j] = dp[i-1][j-1]
                else:
                    dp[i][j] = 1 + min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1])
        
        # Normalize by maximum possible distance
        max_distance = max(m, n)
        return dp[m][n] / max_distance if max_distance > 0 else 0.0
    
    def build_similarity_matrix(self, nodes_data: Dict[str, Any]) -> np.ndarray:
        """Build similarity matrix using min edit distance approach"""
        
        # Step 1: Build node signatures with depth and parent tracking
        self.build_node_signatures(nodes_data)
        
        # Step 2: Filter non-leaf nodes for comparison
        self.non_leaf_paths = [
            path for path, tree_info in self.node_trees.items() 
            if not tree_info['is_leaf']
        ]
        
        if not self.non_leaf_paths:
            logger.warning("No non-leaf nodes found for similarity comparison")
            return np.array([])
        
        # Step 3: Calculate similarity matrix
        n = len(self.non_leaf_paths)
        similarity_matrix = np.zeros((n, n))
        
        for i in range(n):
            for j in range(n):
                if i == j:
                    similarity_matrix[i][j] = 1.0
                else:
                    # Only compare nodes within reasonable depth difference
                    depth_i = self.node_depths[self.non_leaf_paths[i]]
                    depth_j = self.node_depths[self.non_leaf_paths[j]]
                    
                    if abs(depth_i - depth_j) <= self.max_depth_difference:
                        # Calculate edit distance and convert to similarity
                        edit_distance = self.tree_edit_distance(
                            self.non_leaf_paths[i], 
                            self.non_leaf_paths[j]
                        )
                        similarity_matrix[i][j] = 1.0 - edit_distance
                    else:
                        similarity_matrix[i][j] = 0.0  # Too different in depth
        
        self.similarity_matrix = similarity_matrix
        return similarity_matrix
    
    def check_similarity(self, threshold: float = None) -> Dict[str, List[str]]:
        """Find groups of similar nodes with hierarchical filtering"""
        if self.similarity_matrix is None:
            raise ValueError("Must build similarity matrix first")
        
        if len(self.similarity_matrix) == 0:
            logger.warning("No non-leaf nodes available for similarity comparison")
            return {}
        
        threshold = threshold or self.similarity_threshold
        
        # Step 1: Find all potential similarity groups
        self.all_groups = self._find_all_similarity_groups(threshold)
        
        # Step 2: Apply hierarchical filtering to get the best groups
        self.final_groups = self._apply_hierarchical_filtering(self.all_groups)
        
        return self.final_groups
    
    def _find_all_similarity_groups(self, threshold: float) -> Dict[str, List[str]]:
        """Find all potential connected components of similar nodes"""
        n = len(self.non_leaf_paths)
        visited = [False] * n
        groups = {}
        group_counter = 0
        
        def dfs(node_idx: int, group: List[int]):
            visited[node_idx] = True
            group.append(node_idx)
            
            for j in range(n):
                if not visited[j] and self.similarity_matrix[node_idx][j] >= threshold:
                    dfs(j, group)
        
        for i in range(n):
            if not visited[i]:
                group = []
                dfs(i, group)
                
                if len(group) >= self.min_group_size:
                    group_id = f"group_{group_counter}"
                    groups[group_id] = [self.non_leaf_paths[idx] for idx in group]
                    group_counter += 1
        
        return groups
    
    def _apply_hierarchical_filtering(self, all_groups: Dict[str, List[str]]) -> Dict[str, List[str]]:
        """
        Apply hierarchical filtering to resolve conflicts and find optimal groupings.
        Solves the 4x4 cards problem by choosing the most meaningful groups.
        """
        
        # Step 1: Calculate group quality scores
        group_scores = self._calculate_group_scores(all_groups)
        
        # Step 2: Detect hierarchical conflicts
        conflicts = self._detect_hierarchical_conflicts(all_groups)
        
        # Step 3: Resolve conflicts by choosing optimal groups
        final_groups = self._resolve_conflicts(all_groups, group_scores, conflicts)
        
        return final_groups
    
    def _calculate_group_scores(self, groups: Dict[str, List[str]]) -> Dict[str, float]:
        """Calculate quality scores for each group"""
        scores = {}
        
        for group_id, node_paths in groups.items():
            if len(node_paths) < 2:
                scores[group_id] = 0.0
                continue
            
            # Factors for scoring:
            # 1. Group size (more similar nodes = better)
            size_score = len(node_paths)
            
            # 2. Average similarity within group
            similarities = []
            path_to_idx = {self.non_leaf_paths[i]: i for i in range(len(self.non_leaf_paths))}
            
            for i, path1 in enumerate(node_paths):
                for j, path2 in enumerate(node_paths):
                    if i != j and path1 in path_to_idx and path2 in path_to_idx:
                        idx1, idx2 = path_to_idx[path1], path_to_idx[path2]
                        similarities.append(self.similarity_matrix[idx1][idx2])
            
            avg_similarity = np.mean(similarities) if similarities else 0.0
            
            # 3. Depth consistency (nodes at similar depths are better)
            depths = [self.node_depths[path] for path in node_paths]
            depth_variance = np.var(depths) if len(depths) > 1 else 0.0
            depth_consistency = 1.0 / (1.0 + depth_variance)
            
            # 4. Structural complexity (more complex components are more valuable)
            complexity_scores = []
            for path in node_paths:
                if path in self.node_signatures:
                    complexity = self.node_signatures[path].children_count
                    complexity_scores.append(complexity)
            
            avg_complexity = np.mean(complexity_scores) if complexity_scores else 0.0
            
            # Combine scores
            total_score = (
                size_score * 2.0 +          # Size is important
                avg_similarity * 3.0 +      # Similarity is most important
                depth_consistency * 1.0 +   # Depth consistency helps
                avg_complexity * 1.5        # Complex components are valuable
            )
            
            scores[group_id] = total_score
        
        return scores
    
    def _detect_hierarchical_conflicts(self, groups: Dict[str, List[str]]) -> List[Tuple[str, str]]:
        """Detect when groups have hierarchical relationships (parent-child conflicts)"""
        conflicts = []
        
        group_items = list(groups.items())
        for i, (group1_id, group1_paths) in enumerate(group_items):
            for j, (group2_id, group2_paths) in enumerate(group_items):
                if i >= j:  # Avoid duplicate comparisons
                    continue
                
                # Check if any node in group1 is ancestor/descendant of any node in group2
                conflict_found = False
                for path1 in group1_paths:
                    for path2 in group2_paths:
                        if self._is_ancestor_descendant(path1, path2):
                            conflict_found = True
                            break
                    if conflict_found:
                        break
                
                if conflict_found:
                    conflicts.append((group1_id, group2_id))
        
        return conflicts
    
    def _is_ancestor_descendant(self, path1: str, path2: str) -> bool:
        """Check if path1 is ancestor or descendant of path2"""
        # Simple check: one path is a prefix of the other
        return path1.startswith(path2 + "/") or path2.startswith(path1 + "/")
    
    def _resolve_conflicts(self, all_groups: Dict[str, List[str]], 
                          group_scores: Dict[str, float], 
                          conflicts: List[Tuple[str, str]]) -> Dict[str, List[str]]:
        """Resolve conflicts by choosing the best groups"""
        
        # Build conflict graph
        conflict_graph = defaultdict(set)
        for group1, group2 in conflicts:
            conflict_graph[group1].add(group2)
            conflict_graph[group2].add(group1)
        
        # Greedy selection: choose groups with highest scores that don't conflict
        selected_groups = {}
        excluded_groups = set()
        
        # Sort groups by score (descending)
        sorted_groups = sorted(group_scores.items(), key=lambda x: x[1], reverse=True)
        
        for group_id, score in sorted_groups:
            if group_id in excluded_groups:
                continue
            
            # Check if this group conflicts with any already selected group
            conflicts_with_selected = any(
                conflicting_group in selected_groups 
                for conflicting_group in conflict_graph[group_id]
            )
            
            if not conflicts_with_selected:
                # Select this group
                selected_groups[group_id] = all_groups[group_id]
                
                # Mark conflicting groups as excluded
                for conflicting_group in conflict_graph[group_id]:
                    excluded_groups.add(conflicting_group)
        
        return selected_groups
    
    def add_node_ids_to_json(self, original_data: Dict[str, Any], similarity_groups: Dict[str, List[str]]) -> Dict[str, Any]:
        """Add node_id to each node based on similarity groups"""
        # Create mapping from node path to group id
        path_to_group = {}
        for group_id, node_paths in similarity_groups.items():
            for node_path in node_paths:
                path_to_group[node_path] = group_id
        
        # Add node_ids recursively
        return self._add_node_ids_recursive(original_data, path_to_group)
    
    def _add_node_ids_recursive(self, data: Dict[str, Any], path_to_group: Dict[str, str], path: str = "") -> Dict[str, Any]:
        """Recursively add node_ids to the JSON structure"""
        if isinstance(data, dict):
            result = data.copy()
            
            if 'node' in data:  # This is a node
                current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                
                # Check if this is a leaf node
                is_leaf = len(data.get('children', [])) == 0
                
                if is_leaf:
                    # Leaf nodes get unique IDs
                    result['node_id'] = f"leaf_{hashlib.md5(current_path.encode()).hexdigest()[:8]}"
                else:
                    # Non-leaf nodes get group IDs if they're in a similarity group
                    if current_path in path_to_group:
                        result['node_id'] = path_to_group[current_path]
                    else:
                        # Generate unique ID for non-grouped non-leaf nodes
                        result['node_id'] = f"unique_{hashlib.md5(current_path.encode()).hexdigest()[:8]}"
            
            if 'children' in data:
                child_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                result['children'] = [
                    self._add_node_ids_recursive(child, path_to_group, child_path)
                    for child in data['children']
                ]
            
            return result
        
        return data
    
    def get_analysis_info(self) -> Dict[str, Any]:
        """Get detailed analysis information for debugging"""
        if not self.node_signatures:
            return {"error": "No analysis performed yet"}
        
        leaf_count = sum(1 for tree_info in self.node_trees.values() if tree_info['is_leaf'])
        non_leaf_count = len(self.node_trees) - leaf_count
        
        # Analyze groups
        all_groups_info = {}
        if self.all_groups:
            for group_id, paths in self.all_groups.items():
                depths = [self.node_depths[path] for path in paths]
                all_groups_info[group_id] = {
                    "size": len(paths),
                    "depths": depths,
                    "avg_depth": np.mean(depths),
                    "depth_variance": np.var(depths)
                }
        
        final_groups_info = {}
        if self.final_groups:
            for group_id, paths in self.final_groups.items():
                depths = [self.node_depths[path] for path in paths]
                final_groups_info[group_id] = {
                    "size": len(paths),
                    "depths": depths,
                    "avg_depth": np.mean(depths),
                    "depth_variance": np.var(depths)
                }
        
        return {
            "total_nodes": len(self.node_signatures),
            "leaf_nodes": leaf_count,
            "non_leaf_nodes": non_leaf_count,
            "comparison_approach": "hierarchical_min_edit_distance",
            "weights": {
                "structure": self.structure_weight,
                "style": self.style_weight,
                "content": self.content_weight
            },
            "parameters": {
                "similarity_threshold": self.similarity_threshold,
                "min_group_size": self.min_group_size,
                "max_depth_difference": self.max_depth_difference,
                "ignore_text_content": self.ignore_text_content
            },
            "all_groups_found": len(self.all_groups) if self.all_groups else 0,
            "final_groups_selected": len(self.final_groups) if self.final_groups else 0,
            "all_groups_details": all_groups_info,
            "final_groups_details": final_groups_info
        }

# Example usage and testing
def rename_names_and_store_mapping(node, counter, mapping):
    """Utility function to rename nodes for testing"""
    if isinstance(node, dict):
        if "name" in node:
            old_name = node["name"]
            new_name = f"name{counter[0]}"
            mapping[new_name] = old_name
            node["name"] = new_name
            counter[0] += 1
        for key in node:
            rename_names_and_store_mapping(node[key], counter, mapping)
    elif isinstance(node, list):
        for item in node:
            rename_names_and_store_mapping(item, counter, mapping)

# Main execution example
if __name__ == "__main__":
    # Load and preprocess data
    with open("PAGE_109.json", "r") as f:
        data = json.load(f)
    
    # Rename names and save mapping
    name_mapping = {}
    rename_names_and_store_mapping(data, [0], name_mapping)
    
    # Save modified JSON and mapping
    with open("modified_hierarchical.json", "w") as f:
        json.dump(data, f, indent=2)
    
    with open("name_mapping_hierarchical.json", "w") as f:
        json.dump(name_mapping, f, indent=2)
    
    # Initialize the hierarchical semantic grouper
    grouper = HierarchicalSemanticGrouper(
        similarity_threshold=0.85,
        structure_weight=0.6,
        style_weight=0.3,
        content_weight=0.1,
        ignore_text_content=False,
        min_group_size=2,
        max_depth_difference=2
    )
    
    # Build similarity matrix
    similarity_matrix = grouper.build_similarity_matrix(data)
    
    # Find similar groups with hierarchical filtering
    similarity_groups = grouper.check_similarity()
    print("Final similarity groups found:", similarity_groups)
    
    # Add node_ids to original JSON
    result_json = grouper.add_node_ids_to_json(data, similarity_groups)
    
    # Save result
    with open("result_hierarchical.json", "w") as f:
        json.dump(result_json, f, indent=2)
    
    # Print detailed analysis info
    analysis = grouper.get_analysis_info()
    print("Detailed analysis:")
    print(json.dumps(analysis, indent=2))

Final similarity groups found: {'group_1': ['name0/name53/name54/name58/name60', 'name0/name83/name89/name93/name95', 'name0/name101/name102/name114/name132/name133/name134', 'name0/name101/name102/name114/name145/name147', 'name0/name101/name149/name161/name179/name180/name181', 'name0/name101/name149/name161/name192/name194', 'name0/name241/name242/name244/name247/name248', 'name0/name241/name242/name244/name247/name250', 'name0/name241/name242/name244/name247/name252', 'name0/name241/name254/name256/name259/name260', 'name0/name241/name254/name256/name259/name262', 'name0/name241/name254/name256/name259/name264', 'name0/name241/name266/name268/name271/name272', 'name0/name241/name266/name268/name271/name274', 'name0/name241/name266/name268/name271/name276', 'name0/name241/name278/name280/name283/name284', 'name0/name241/name278/name280/name283/name286', 'name0/name241/name278/name280/name283/name288', 'name0/name290/name300/name346/name348/name349', 'name0/name290/name300/name346/na

In [10]:
import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw
import json
import random
from typing import Dict, List, Any

def visualize_groups_on_svg(svg_file_path: str, json_data: Dict[str, Any], output_dir: str = "./output"):
    """
    Read SVG image and create separate images for each group with rectangles around components.
    
    Args:
        svg_file_path: Path to the SVG file
        json_data: JSON data containing nodes with coordinates and group_ids
        output_dir: Directory to save output images
    """
    import os
    os.makedirs(output_dir, exist_ok=True)
    
    # Convert SVG to PNG first (you'll need to install cairosvg: pip install cairosvg)
    try:
        import cairosvg
        png_data = cairosvg.svg2png(url=svg_file_path)
        base_image = Image.open(io.BytesIO(png_data))
    except ImportError:
        print("cairosvg not installed. Please install it: pip install cairosvg")
        return
    except Exception as e:
        print(f"Error converting SVG: {e}")
        return
    
    # Extract node coordinates and group information
    node_groups = {}
    
    def extract_nodes_recursive(node, path=""):
        if isinstance(node, dict):
            if 'node' in node:
                current_path = f"{path}/{node.get('name', 'unnamed')}" if path else node.get('name', 'root')
                
                # Get coordinates from absoluteBoundingBox
                abs_box = node.get('node', {}).get('absoluteBoundingBox', {})
                if abs_box and 'node_id' in node:
                    x = abs_box.get('x', 0)
                    y = abs_box.get('y', 0)
                    width = abs_box.get('width', 0)
                    height = abs_box.get('height', 0)
                    
                    group_id = node.get('node_id', 'no_group')
                    
                    if group_id not in node_groups:
                        node_groups[group_id] = []
                    
                    node_groups[group_id].append({
                        'name': node.get('name', 'unnamed'),
                        'x': x,
                        'y': y,
                        'width': width,
                        'height': height
                    })
                
                # Process children
                for child in node.get('children', []):
                    extract_nodes_recursive(child, current_path)
    
    extract_nodes_recursive(json_data)
    
    # Generate random colors for each group
    colors = {}
    for group_id in node_groups.keys():
        colors[group_id] = (
            random.randint(0, 255),
            random.randint(0, 255),
            random.randint(0, 255)
        )
    
    # Create image for each group
    for group_id, nodes in node_groups.items():
        if len(nodes) <= 1:  # Skip groups with only one node
            continue
            
        # Create a copy of the base image
        group_image = base_image.copy()
        draw = ImageDraw.Draw(group_image)
        
        # Draw rectangles around nodes in this group
        color = colors[group_id]
        for node in nodes:
            x, y, width, height = node['x'], node['y'], node['width'], node['height']
            
            # Draw rectangle outline
            draw.rectangle(
                [(x, y), (x + width, y + height)],
                outline=color,
                width=3
            )
            
            # Add group label
            draw.text((x, y - 20), f"{group_id}", fill=color)
        
        # Save the image
        output_path = f"{output_dir}/group_{group_id}.png"
        group_image.save(output_path)
        print(f"Saved: {output_path}")
    
    print(f"Created {len([g for g in node_groups.values() if len(g) > 1])} group visualization images")


# Alternative simpler version without SVG conversion (if you have PNG/JPG)
def visualize_groups_on_image(image_path: str, json_data: Dict[str, Any], output_dir: str = "./output"):
    """
    Simpler version that works with PNG/JPG images directly.
    
    Args:
        image_path: Path to the image file (PNG, JPG, etc.)
        json_data: JSON data containing nodes with coordinates and group_ids
        output_dir: Directory to save output images
    """
    import os
    os.makedirs(output_dir, exist_ok=True)
    
    # Load base image
    base_image = Image.open(image_path)
    
    # Extract node coordinates and group information
    node_groups = {}
    
    def extract_nodes_recursive(node, path=""):
        if isinstance(node, dict):
            if 'node' in node:
                current_path = f"{path}/{node.get('name', 'unnamed')}" if path else node.get('name', 'root')
                
                # Get coordinates from absoluteBoundingBox
                node_info = node.get('node', {})
                x = node_info.get('x', 0)
                y = node_info.get('y', 0)
                width = node_info.get('width', 0)
                height = node_info.get('height', 0)
                
                group_id = node.get('node_id', 'no_group')
                
                if group_id not in node_groups:
                    node_groups[group_id] = []
                
                node_groups[group_id].append({
                    'name': node.get('name', 'unnamed'),
                    'x': x,
                    'y': y,
                    'width': width,
                    'height': height
                })
                
                # Process children
                for child in node.get('children', []):
                    extract_nodes_recursive(child, current_path)
    
    extract_nodes_recursive(json_data)
    
    # Generate colors for each group
    colors = [
        (255, 0, 0),    # Red
        (0, 255, 0),    # Green
        (0, 0, 255),    # Blue
        (255, 255, 0),  # Yellow
        (255, 0, 255),  # Magenta
        (0, 255, 255),  # Cyan
        (255, 165, 0),  # Orange
        (128, 0, 128),  # Purple
    ]
    
    color_index = 0
    group_colors = {}
    
    # Create image for each group
    for group_id, nodes in node_groups.items():
        if len(nodes) <= 1:  # Skip groups with only one node
            continue
            
        # Assign color to group
        if group_id not in group_colors:
            group_colors[group_id] = colors[color_index % len(colors)]
            color_index += 1
        
        # Create a copy of the base image
        group_image = base_image.copy()
        draw = ImageDraw.Draw(group_image)
        
        # Draw rectangles around nodes in this group
        color = group_colors[group_id]
        for node in nodes:
            x, y, width, height = int(node['x']), int(node['y']), int(node['width']), int(node['height'])
            
            # Draw rectangle outline
            draw.rectangle(
                [(x, y), (x + width, y + height)],
                outline=color,
                width=4
            )
            
            # Add group label
            draw.text((x, max(0, y - 25)), f"{group_id}", fill=color)
        
        # Save the image
        output_path = f"{output_dir}/{group_id}.png"
        group_image.save(output_path)
        print(f"Saved: {output_path}")
    
    print(f"Total groups found: {len(node_groups)}")
    print(f"Created {len([g for g in node_groups.values() if len(g) > 1])} group visualization images")



with open('result_hierarchical.json', 'r') as f:
    json_data = json.load(f)

# For PNG/JPG images (simpler):
visualize_groups_on_image('PAGE_109.png', json_data)

# For SVG images (requires cairosvg):
# visualize_groups_on_svg('your_image.svg', json_data)

Saved: ./output/group_0.png
Saved: ./output/group_1.png
Saved: ./output/group_4.png
Saved: ./output/group_10.png
Saved: ./output/group_16.png
Saved: ./output/group_17.png
Saved: ./output/group_20.png
Saved: ./output/group_22.png
Saved: ./output/group_27.png
Saved: ./output/group_30.png
Saved: ./output/group_32.png
Saved: ./output/group_36.png
Saved: ./output/group_37.png
Saved: ./output/group_39.png
Total groups found: 310
Created 14 group visualization images


In [22]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer
import hashlib
from typing import Dict, List, Any, Tuple
from dataclasses import dataclass
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class FigmaNodeSimilarityDetector:
    """
    A flexible similarity detection system for Figma nodes using bottom-up approach.
    Feature vectors are extracted directly from leaf nodes and propagated upward.
    """
    
    def __init__(self, similarity_threshold: float = 0.8, use_semantic_embeddings: bool = True):
        """
        Initialize the similarity detector
        
        Args:
            similarity_threshold: Threshold for considering nodes similar (0-1)
            use_semantic_embeddings: Whether to use semantic embeddings for text content
        """
        self.similarity_threshold = similarity_threshold
        self.use_semantic_embeddings = use_semantic_embeddings
        self.node_feature_vectors = {}  # Store feature vectors directly
        self.node_metadata = {}  # Store metadata (is_leaf, etc.)
        self.similarity_matrix = None
        self.clusters = None
        self.node_tree = {}  # Store hierarchical structure
        
        # Initialize semantic model if needed
        if use_semantic_embeddings:
            try:
                self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
                logger.info("Loaded semantic embedding model")
            except Exception as e:
                logger.warning(f"Could not load semantic model: {e}")
                self.use_semantic_embeddings = False
    
    def extract_leaf_node_feature_vector(self, node_data: Dict[str, Any], node_path: str = "") -> np.ndarray:
        """
        Extract feature vector directly from a leaf Figma node

        Args:
            node_data: The node data dictionary
            node_path: Path to the node in the tree

        Returns:
            Numpy array containing the feature vector
        """
        node_info = node_data.get('node', {})
        tag = node_data.get('tag', '')
        
        # Check if tag is ICON or SVG - if so, return default vector of ones
        if tag in ["ICON", "SVG"]:
            # Create default vector with ones
            default_size = 17  # structural(3) + style(12) + content(1) + descendant(1)
            return np.zeros(default_size)
        
        vector = []

        # 1. Structural Features (normalized)
        node_type = node_info.get('type', '')
        has_children = len(node_data.get('children', [])) > 0
        num_children = len(node_data.get('children', []))

        vector.extend([
            hash(node_type) % 1000 / 1000.0,  # Normalize hash
            # hash(tag) % 1000 / 1000.0,
            float(has_children),
            min(num_children / 10000.0, 1.0),  # Normalize to 0-1
        ])

        # 2. Style Features
        # Extract font information from textStyle
        text_style = node_info.get('textStyle', {})
        font_family = text_style.get('fontFamily', '')
        font_size = text_style.get('fontSize', 0)
        font_style = text_style.get('fontStyle', '')
        font_weight = text_style.get('fontWeight', 0)

        # Extract fill colors
        fills = node_info.get('fills', [])
        fill_r = fill_g = fill_b = fill_a = 0
        if fills:
            primary_fill = fills[0]
            color = primary_fill.get('color', {})
            fill_r = color.get('r', 0)
            fill_g = color.get('g', 0)
            fill_b = color.get('b', 0)
            fill_a = color.get('a', 1)

        # Extract stroke colors
        strokes = node_info.get('strokes', [])
        stroke_r = stroke_g = stroke_b = stroke_a = 0
        if strokes:
            primary_stroke = strokes[0]
            stroke_color = primary_stroke.get('color', {})
            stroke_r = stroke_color.get('r', 0)
            stroke_g = stroke_color.get('g', 0)
            stroke_b = stroke_color.get('b', 0)
            stroke_a = stroke_color.get('a', 1)

        vector.extend([
            hash(font_family) % 1000 / 1000.0 if font_family else 0,
            min(font_size / 10000.0, 1.0) if font_size else 0,  # Normalize font size to 0-1 (assuming max ~100px)
            hash(font_style) % 1000 / 1000.0 if font_style else 0,
            min(font_weight / 10000.0, 1.0) if font_weight else 0,  # Normalize font weight (max ~900-1000)
            fill_r, fill_g, fill_b, fill_a,
            stroke_r, stroke_g, stroke_b, stroke_a,
        ])

        # 3. Content Features
        text_content = node_info.get('characters', '')
        name = node_data.get('name', '')
        has_text = bool(text_content)

        vector.extend([
            float(has_text),
        ])

        # 4. Semantic Features (using embeddings) - commented out for now
        # if self.use_semantic_embeddings:
        #     text_for_embedding = f"{name} {node_type}"
        #     if text_for_embedding.strip():
        #         try:
        #             semantic_features = self.semantic_model.encode([text_for_embedding])[0].tolist()
        #             vector.extend(semantic_features)
        #         except:
        #             vector.extend([0.0] * 384)  # Default embedding size
        #     else:
        #         vector.extend([0.0] * 384)
        # else:
        #     vector.extend([0.0] * 384)

        # 5. Descendant Count Feature (for leaf nodes, this is 0)
        total_descendants = 0  # Leaf nodes have no descendants
        vector.extend([
            total_descendants
        ])

        return np.array(vector)

    def create_parent_feature_vector(self, children_vectors: List[np.ndarray], node_data: Dict[str, Any]) -> np.ndarray:
        """
        Create parent node feature vector by averaging children feature vectors

        Args:
            children_vectors: List of children feature vectors
            node_data: Parent node data

        Returns:
            Feature vector for parent node
        """
        tag = node_data.get('tag', '')
        
        # Check if tag is ICON or SVG - if so, return default vector of ones
        if tag in ["ICON", "SVG"]:
            # Create default vector with ones
            default_size = 17  # structural(3) + style(12) + content(1) + descendant(1)
            return np.zeros(default_size)
        
        if not children_vectors:
            # If no children vectors, create a default vector
            default_size = 0 + 17  # Updated size: structural(3) + style(12) + content(1) + descendant(1)
            return np.zeros(default_size)

        # Average all children feature vectors
        parent_vector = np.mean(children_vectors, axis=0)

        # Override some structural features specific to the parent
        node_info = node_data.get('node', {})
        node_type = node_info.get('type', '')
        num_children = len(children_vectors)

        # Update structural features (first 3 elements)
        parent_vector[0] = hash(node_type) % 1000 / 1000.0
        # parent_vector[1] = hash(tag) % 1000 / 1000.0
        parent_vector[1] = 1.0  # Parent always has children
        parent_vector[2] = min(num_children / 10000.0, 1.0)

        # Calculate total descendants
        total_descendants = num_children
        for child_vector in children_vectors:
            # The descendant count is the last feature in the vector
            child_descendants = int(child_vector[-1] * 10000)  # Denormalize
            total_descendants += child_descendants

        # Update the descendant count feature (last element)
        parent_vector[-1] = min(total_descendants / 10000.0, 1.0)

        return parent_vector

    def get_total_descendants_count(self, node_vector: np.ndarray) -> int:
        """
        Extract the total descendants count from a node's feature vector

        Args:
            node_vector: The feature vector of a node

        Returns:
            Total number of descendants for this node
        """
        # The descendant count is the last feature, denormalized
        return int(node_vector[-1] * 10000)
    
    def build_similarity_matrix(self, nodes_data: Dict[str, Any]) -> np.ndarray:
        """
        Build similarity matrix using bottom-up approach.
        Feature vectors are extracted from leaves and propagated upward.
        
        Args:
            nodes_data: Dictionary containing all node data
            
        Returns:
            Similarity matrix as numpy array
        """
        # Step 1: Build the tree structure and identify leaf nodes
        self.node_tree = self._build_node_tree(nodes_data)
        
        # Step 2: Extract feature vectors bottom-up
        self.node_feature_vectors = {}
        self.node_metadata = {}
        self._extract_feature_vectors_bottom_up(nodes_data)
        
        # Step 3: Filter out leaf nodes for similarity calculation (only compare non-leaf nodes)
        non_leaf_paths = [path for path, metadata in self.node_metadata.items() if not metadata['is_leaf']]
        self.node_paths = non_leaf_paths
        
        if not self.node_paths:
            logger.warning("No non-leaf nodes found for similarity comparison")
            return np.array([])
        
        # Step 4: Get feature vectors for non-leaf nodes only
        feature_vectors = []
        for node_path in self.node_paths:
            vector = self.node_feature_vectors[node_path]
            feature_vectors.append(vector)
        
        feature_vectors = np.array(feature_vectors)
        
        # Step 5: Calculate similarity matrix using cosine similarity
        self.similarity_matrix = cosine_similarity(feature_vectors)
        
        return self.similarity_matrix
    
    def _build_node_tree(self, data: Dict[str, Any], path: str = "", parent_path: str = None) -> Dict[str, Dict]:
        """Build a tree structure mapping node paths to their metadata"""
        tree = {}
        
        if isinstance(data, dict):
            if 'node' in data:  # This is a node
                current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                
                tree[current_path] = {
                    'data': data,
                    'parent': parent_path,
                    'children': [],
                    'is_leaf': len(data.get('children', [])) == 0
                }
                
                if 'children' in data:
                    for child in data['children']:
                        child_tree = self._build_node_tree(child, current_path, current_path)
                        tree.update(child_tree)
                        # Add child paths to current node
                        for child_path in child_tree.keys():
                            if child_tree[child_path]['parent'] == current_path:
                                tree[current_path]['children'].append(child_path)
        
        return tree
    
    def _extract_feature_vectors_bottom_up(self, data: Dict[str, Any], path: str = ""):
        """Extract feature vectors using bottom-up approach"""
        if isinstance(data, dict):
            if 'node' in data:  # This is a node
                current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                
                # First, process all children
                children_vectors = []
                if 'children' in data:
                    for child in data['children']:
                        child_path = f"{current_path}"
                        self._extract_feature_vectors_bottom_up(child, child_path)
                        
                        # Get child path and vector
                        child_node_path = f"{current_path}/{child.get('name', 'unnamed')}"
                        if child_node_path in self.node_feature_vectors:
                            children_vectors.append(self.node_feature_vectors[child_node_path])
                
                # Extract feature vector for current node
                is_leaf = len(data.get('children', [])) == 0
                
                if is_leaf:  # Leaf node
                    self.node_feature_vectors[current_path] = self.extract_leaf_node_feature_vector(data, current_path)
                else:  # Parent node - aggregate children vectors
                    self.node_feature_vectors[current_path] = self.create_parent_feature_vector(children_vectors, data)
                
                # Store metadata
                self.node_metadata[current_path] = {
                    'is_leaf': is_leaf,
                    'num_children': len(children_vectors),
                    'node_type': data.get('node', {}).get('type', ''),
                    'name': data.get('name', '')
                }
    
    def check_similarity(self, threshold: float = None) -> Dict[str, List[str]]:
        """
        Check similarity between non-leaf nodes and return groups of similar nodes.
        Leaf nodes are excluded from grouping.
        
        Args:
            threshold: Similarity threshold (uses instance threshold if None)
            
        Returns:
            Dictionary mapping group_id to list of similar node paths (non-leaf nodes only)
        """
        if self.similarity_matrix is None:
            raise ValueError("Must build similarity matrix first")
        
        if len(self.similarity_matrix) == 0:
            logger.warning("No non-leaf nodes available for similarity comparison")
            return {}
        
        threshold = threshold or self.similarity_threshold
        
        # Method 1: Simple threshold-based grouping (only for non-leaf nodes)
        similarity_groups = self._threshold_based_grouping(threshold)
        
        return similarity_groups
    
    def _threshold_based_grouping(self, threshold: float) -> Dict[str, List[str]]:
        """Threshold-based similarity grouping for non-leaf nodes only"""
        groups = {}
        assigned = set()
        group_counter = 0
        
        for i in range(len(self.node_paths)):
            if self.node_paths[i] in assigned:
                continue
                
            # Find all nodes similar to current node
            similar_indices = np.where(self.similarity_matrix[i] >= threshold)[0]
            similar_nodes = [self.node_paths[j] for j in similar_indices if j != i]
            
            if similar_nodes:  # If we found similar nodes
                group_id = f"group_{group_counter}"
                groups[group_id] = [self.node_paths[i]] + similar_nodes
                assigned.update(groups[group_id])
                group_counter += 1
        
        return groups
    
    def add_node_ids_to_json(self, original_data: Dict[str, Any], similarity_groups: Dict[str, List[str]]) -> Dict[str, Any]:
        """
        Add node_id to each node in the original JSON based on similarity groups.
        Only non-leaf nodes get group IDs, leaf nodes get unique IDs.
        
        Args:
            original_data: Original JSON data
            similarity_groups: Groups of similar nodes (non-leaf only)
            
        Returns:
            Modified JSON with node_id added to each node
        """
        # Create mapping from node path to group id
        path_to_group = {}
        for group_id, node_paths in similarity_groups.items():
            for node_path in node_paths:
                path_to_group[node_path] = group_id
        
        # Add node_ids recursively
        modified_data = self._add_node_ids_recursive(original_data, path_to_group)
        
        return modified_data
    
    def _add_node_ids_recursive(self, data: Dict[str, Any], path_to_group: Dict[str, str], path: str = "") -> Dict[str, Any]:
        """Recursively add node_ids to the JSON structure"""
        if isinstance(data, dict):
            result = data.copy()
            
            if 'node' in data:  # This is a node
                current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                
                # Check if this is a leaf node
                is_leaf = len(data.get('children', [])) == 0
                
                if is_leaf:
                    # Leaf nodes get unique IDs
                    result['node_id'] = f"leaf_{hashlib.md5(current_path.encode()).hexdigest()[:8]}"
                else:
                    # Non-leaf nodes get group IDs if they're in a similarity group
                    if current_path in path_to_group:
                        result['node_id'] = path_to_group[current_path]
                    else:
                        # Generate unique ID for non-grouped non-leaf nodes
                        result['node_id'] = f"unique_{hashlib.md5(current_path.encode()).hexdigest()[:8]}"
            
            if 'children' in data:
                child_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                result['children'] = [
                    self._add_node_ids_recursive(child, path_to_group, child_path)
                    for child in data['children']
                ]
            
            return result
        
        return data

    def get_feature_vector_info(self) -> Dict[str, Any]:
        """
        Get information about the feature vectors for debugging/analysis
        
        Returns:
            Dictionary containing feature vector statistics
        """
        if not self.node_feature_vectors:
            return {"error": "No feature vectors extracted yet"}
        
        vector_lengths = [len(v) for v in self.node_feature_vectors.values()]
        leaf_count = sum(1 for metadata in self.node_metadata.values() if metadata['is_leaf'])
        non_leaf_count = len(self.node_metadata) - leaf_count
        
        return {
            "total_nodes": len(self.node_feature_vectors),
            "leaf_nodes": leaf_count,
            "non_leaf_nodes": non_leaf_count,
            "feature_vector_length": vector_lengths[0] if vector_lengths else 0,
            "feature_breakdown": {
                "structural_features": 3,
                "style_features": 12,  # Updated: font_family, font_size, font_style, font_weight + colors
                "content_features": 1,
                "descendant_count": 1,
                "semantic_features": 384 if self.use_semantic_embeddings else 0
            }
        }

    def print_figma_tree_with_vectors(self, node, depth=0, path=""):
        """
        Print the Figma node tree with feature vectors
        
        Args:
            node: The current node to print
            depth: Current depth in the tree
            path: Current path to the node
        """
        indent = "  " * depth  # 2 spaces per level

        # Extract info
        name = node.get("name", "[no name]")
        tag = node.get("tag", "[no tag]")
        node_id = node.get("node_id", "")
        
        # Handle TEXT nodes with characters
        node_data = node.get("node", {})
        characters = node_data.get("characters", "")
        is_text = tag == "TEXT"
        display_name = characters[:10] + "..." if is_text and characters else name

        # Layout info (if present)
        layout = node_data.get("layoutMode", "NONE")
        layout_str = "ROWS" if layout == "HORIZONTAL" else "COLS" if layout == "VERTICAL" else layout

        # Get the current node path
        current_path = f"{path}/{name}" if path else name
        
        # Get feature vector if available
        vector_str = ""
        if current_path in self.node_feature_vectors:
            vector = self.node_feature_vectors[current_path]
            # Format vector to show first few values and some key features
        if len(vector) > 0:
            vector_str = f" | Vector: [{', '.join(f'{v:.7f}' for v in vector)}] (len={len(vector)})"
        else:
            vector_str = " | Vector: Not found"

        # Print current node info with feature vector
        print(f"{indent}- {display_name} [{tag}] -> {name} {layout_str} ({node_id}){vector_str}")

        # Recursively print children
        for child in node.get("children", []):
            self.print_figma_tree_with_vectors(child, depth + 1, current_path)



            


import json

def rename_names_and_store_mapping(node, counter, mapping):
    if isinstance(node, dict):
        if "name" in node:
            old_name = node["name"]
            new_name = f"name{counter[0]}"
            mapping[new_name] = old_name
            node["name"] = new_name
            counter[0] += 1
        for key in node:
            rename_names_and_store_mapping(node[key], counter, mapping)
    elif isinstance(node, list):
        for item in node:
            rename_names_and_store_mapping(item, counter, mapping)

# Load input JSON
with open("PAGE_109.json", "r") as f:
    data = json.load(f)

# Rename names and save mapping
name_mapping = {}
rename_names_and_store_mapping(data, [0], name_mapping)

# Save modified JSON and mapping
with open("modified.json", "w") as f:
    json.dump(data, f, indent=2)

with open("name_mapping.json", "w") as f:
    json.dump(name_mapping, f, indent=2)





# Initialize the detector
detector = FigmaNodeSimilarityDetector(
    similarity_threshold=0.99999999999,
    use_semantic_embeddings=True
)

# Load your JSON data
with open('modified.json', 'r') as f:
    figma_data = json.load(f)

# Build similarity matrix
similarity_matrix = detector.build_similarity_matrix(figma_data)

# Find similar nodes
similarity_groups = detector.check_similarity()
print(similarity_groups)
# Add node_ids to original JSON
result_json = detector.add_node_ids_to_json(figma_data, similarity_groups)



INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:__main__:Loaded semantic embedding model


{'group_0': ['name0/name1/name2/name3/name5/name6', 'name0/name1/name2/name3/name5/name8', 'name0/name1/name2/name3/name5/name10', 'name0/name1/name2/name3/name5/name12'], 'group_1': ['name0/name1/name2/name19/name20/name24/name25', 'name0/name101/name102/name103/name104/name105', 'name0/name101/name149/name150/name151/name152'], 'group_2': ['name0/name53/name54/name58', 'name0/name83/name89/name93'], 'group_3': ['name0/name53/name54', 'name0/name83/name89'], 'group_4': ['name0/name53/name62/name63', 'name0/name53/name62/name72'], 'group_5': ['name0/name97', 'name0/name196'], 'group_6': ['name0/name101/name102/name103/name104/name107', 'name0/name101/name149/name150/name151/name154'], 'group_7': ['name0/name101/name102/name103/name104', 'name0/name101/name149/name150/name151'], 'group_8': ['name0/name101/name102/name103/name109', 'name0/name101/name149/name150/name156'], 'group_9': ['name0/name101/name102/name103/name111', 'name0/name101/name149/name150/name158'], 'group_10': ['name0/n

In [12]:
import json
import uuid
import re

def generate_unique_id():
    """Generate a unique ID similar to the existing format"""
    return f"unique_{uuid.uuid4().hex[:8]}"

def is_group_node(node_id):
    """Check if node_id matches the group_X pattern"""
    return bool(re.match(r'^group_\d+$', node_id))

def convert_children_ids(node, visited=None):
    """Recursively convert all children node_ids to unique IDs"""
    if visited is None:
        visited = set()
    
    # Avoid infinite recursion
    if id(node) in visited:
        return
    visited.add(id(node))
    
    # Process children if they exist
    if 'children' in node and isinstance(node['children'], list):
        for child in node['children']:
            # Convert child's node_id if it exists
            if 'node_id' in child:
                child['node_id'] = generate_unique_id()
            
            # Recursively process the child's children
            convert_children_ids(child, visited)

def process_json_tree(data):
    """Process the entire JSON tree looking for group nodes"""
    def traverse(node):
        # Check if current node has node_id with group_X pattern
        if isinstance(node, dict) and 'node_id' in node and is_group_node(node['node_id']):
            print(f"Found group node: {node['node_id']}")
            print("Converting all children node_ids to unique IDs...")
            convert_children_ids(node)
        
        # Recursively traverse children
        if isinstance(node, dict) and 'children' in node:
            for child in node['children']:
                traverse(child)
        elif isinstance(node, list):
            for item in node:
                traverse(item)
    
    traverse(data)
    return data

def main():
    # Example usage
    try:
        # Read JSON from file
        with open('modified.json', 'r') as f:
            data = json.load(f)
        
        print("Processing JSON tree...")
        processed_data = process_json_tree(data)
        
        # Save the processed data
        with open('output.json', 'w', encoding='utf-8') as file:
            json.dump(processed_data, file, indent=2)
        
        print("Processing complete! Output saved to 'output.json'")
        
    except FileNotFoundError:
        print("Error: 'paste.txt' file not found")
    except json.JSONDecodeError:
        print("Error: Invalid JSON format")
    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    main()

Processing JSON tree...
Found group node: group_0
Converting all children node_ids to unique IDs...
Found group node: group_0
Converting all children node_ids to unique IDs...
Found group node: group_0
Converting all children node_ids to unique IDs...
Found group node: group_0
Converting all children node_ids to unique IDs...
Found group node: group_1
Converting all children node_ids to unique IDs...
Found group node: group_3
Converting all children node_ids to unique IDs...
Found group node: group_4
Converting all children node_ids to unique IDs...
Found group node: group_4
Converting all children node_ids to unique IDs...
Found group node: group_3
Converting all children node_ids to unique IDs...
Found group node: group_5
Converting all children node_ids to unique IDs...
Found group node: group_19
Converting all children node_ids to unique IDs...
Found group node: group_19
Converting all children node_ids to unique IDs...
Found group node: group_5
Converting all children node_ids to 

In [24]:

with open("modified.json", "w") as f:
    json.dump(result_json, f, indent=2)

import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw
import json
import random
from typing import Dict, List, Any

def visualize_groups_on_svg(svg_file_path: str, json_data: Dict[str, Any], output_dir: str = "./output"):
    """
    Read SVG image and create separate images for each group with rectangles around components.
    
    Args:
        svg_file_path: Path to the SVG file
        json_data: JSON data containing nodes with coordinates and group_ids
        output_dir: Directory to save output images
    """
    import os
    os.makedirs(output_dir, exist_ok=True)
    
    # Convert SVG to PNG first (you'll need to install cairosvg: pip install cairosvg)
    try:
        import cairosvg
        png_data = cairosvg.svg2png(url=svg_file_path)
        base_image = Image.open(io.BytesIO(png_data))
    except ImportError:
        print("cairosvg not installed. Please install it: pip install cairosvg")
        return
    except Exception as e:
        print(f"Error converting SVG: {e}")
        return
    
    # Extract node coordinates and group information
    node_groups = {}
    
    def extract_nodes_recursive(node, path=""):
        if isinstance(node, dict):
            if 'node' in node:
                current_path = f"{path}/{node.get('name', 'unnamed')}" if path else node.get('name', 'root')
                
                # Get coordinates from absoluteBoundingBox
                abs_box = node.get('node', {}).get('absoluteBoundingBox', {})
                if abs_box and 'node_id' in node:
                    x = abs_box.get('x', 0)
                    y = abs_box.get('y', 0)
                    width = abs_box.get('width', 0)
                    height = abs_box.get('height', 0)
                    
                    group_id = node.get('node_id', 'no_group')
                    
                    if group_id not in node_groups:
                        node_groups[group_id] = []
                    
                    node_groups[group_id].append({
                        'name': node.get('name', 'unnamed'),
                        'x': x,
                        'y': y,
                        'width': width,
                        'height': height
                    })
                
                # Process children
                for child in node.get('children', []):
                    extract_nodes_recursive(child, current_path)
    
    extract_nodes_recursive(json_data)
    
    # Generate random colors for each group
    colors = {}
    for group_id in node_groups.keys():
        colors[group_id] = (
            random.randint(0, 255),
            random.randint(0, 255),
            random.randint(0, 255)
        )
    
    # Create image for each group
    for group_id, nodes in node_groups.items():
        if len(nodes) <= 1:  # Skip groups with only one node
            continue
            
        # Create a copy of the base image
        group_image = base_image.copy()
        draw = ImageDraw.Draw(group_image)
        
        # Draw rectangles around nodes in this group
        color = colors[group_id]
        for node in nodes:
            x, y, width, height = node['x'], node['y'], node['width'], node['height']
            
            # Draw rectangle outline
            draw.rectangle(
                [(x, y), (x + width, y + height)],
                outline=color,
                width=3
            )
            
            # Add group label
            draw.text((x, y - 20), f"{group_id}", fill=color)
        
        # Save the image
        output_path = f"{output_dir}/group_{group_id}.png"
        group_image.save(output_path)
        print(f"Saved: {output_path}")
    
    print(f"Created {len([g for g in node_groups.values() if len(g) > 1])} group visualization images")


# Alternative simpler version without SVG conversion (if you have PNG/JPG)
def visualize_groups_on_image(image_path: str, json_data: Dict[str, Any], output_dir: str = "./output"):
    """
    Simpler version that works with PNG/JPG images directly.
    
    Args:
        image_path: Path to the image file (PNG, JPG, etc.)
        json_data: JSON data containing nodes with coordinates and group_ids
        output_dir: Directory to save output images
    """
    import os
    os.makedirs(output_dir, exist_ok=True)
    
    # Load base image
    base_image = Image.open(image_path)
    
    # Extract node coordinates and group information
    node_groups = {}
    
    def extract_nodes_recursive(node, path=""):
        if isinstance(node, dict):
            if 'node' in node:
                current_path = f"{path}/{node.get('name', 'unnamed')}" if path else node.get('name', 'root')
                
                # Get coordinates from absoluteBoundingBox
                node_info = node.get('node', {})
                x = node_info.get('x', 0)
                y = node_info.get('y', 0)
                width = node_info.get('width', 0)
                height = node_info.get('height', 0)
                
                group_id = node.get('node_id', 'no_group')
                
                if group_id not in node_groups:
                    node_groups[group_id] = []
                
                node_groups[group_id].append({
                    'name': node.get('name', 'unnamed'),
                    'x': x,
                    'y': y,
                    'width': width,
                    'height': height
                })
                
                # Process children
                for child in node.get('children', []):
                    extract_nodes_recursive(child, current_path)
    
    extract_nodes_recursive(json_data)
    
    # Generate colors for each group
    colors = [
        (255, 0, 0),    # Red
        (0, 255, 0),    # Green
        (0, 0, 255),    # Blue
        (255, 255, 0),  # Yellow
        (255, 0, 255),  # Magenta
        (0, 255, 255),  # Cyan
        (255, 165, 0),  # Orange
        (128, 0, 128),  # Purple
    ]
    
    color_index = 0
    group_colors = {}
    
    # Create image for each group
    for group_id, nodes in node_groups.items():
        if len(nodes) <= 1:  # Skip groups with only one node
            continue
            
        # Assign color to group
        if group_id not in group_colors:
            group_colors[group_id] = colors[color_index % len(colors)]
            color_index += 1
        
        # Create a copy of the base image
        group_image = base_image.copy()
        draw = ImageDraw.Draw(group_image)
        
        # Draw rectangles around nodes in this group
        color = group_colors[group_id]
        for node in nodes:
            x, y, width, height = int(node['x']), int(node['y']), int(node['width']), int(node['height'])
            
            # Draw rectangle outline
            draw.rectangle(
                [(x, y), (x + width, y + height)],
                outline=color,
                width=4
            )
            
            # Add group label
            draw.text((x, max(0, y - 25)), f"{group_id}", fill=color)
        
        # Save the image
        output_path = f"{output_dir}/group_{group_id}.png"
        group_image.save(output_path)
        print(f"Saved: {output_path}")
    
    print(f"Total groups found: {len(node_groups)}")
    print(f"Created {len([g for g in node_groups.values() if len(g) > 1])} group visualization images")



# with open('output.json', 'r') as f:
#     json_data = json.load(f)

# For PNG/JPG images (simpler):
visualize_groups_on_image('PAGE_109.png', json_data)

# For SVG images (requires cairosvg):
# visualize_groups_on_svg('your_image.svg', json_data)

Saved: ./output/group_group_0.png
Saved: ./output/group_no_group.png
Saved: ./output/group_group_2.png
Saved: ./output/group_group_1.png
Saved: ./output/group_group_3.png
Saved: ./output/group_group_4.png
Saved: ./output/group_group_14.png
Saved: ./output/group_group_13.png
Saved: ./output/group_group_8.png
Saved: ./output/group_group_6.png
Saved: ./output/group_group_5.png
Saved: ./output/group_group_7.png
Saved: ./output/group_group_9.png
Saved: ./output/group_group_11.png
Saved: ./output/group_group_10.png
Saved: ./output/group_group_12.png
Saved: ./output/group_group_18.png
Saved: ./output/group_group_16.png
Saved: ./output/group_group_15.png
Saved: ./output/group_group_17.png
Saved: ./output/group_group_19.png
Saved: ./output/group_group_22.png
Saved: ./output/group_group_21.png
Saved: ./output/group_group_20.png
Saved: ./output/group_group_23.png
Saved: ./output/group_group_24.png
Saved: ./output/group_group_25.png
Total groups found: 215
Created 27 group visualization images
