In [None]:
# import json
# import numpy as np
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.cluster import DBSCAN
# from sentence_transformers import SentenceTransformer
# import hashlib
# from typing import Dict, List, Any, Tuple
# from dataclasses import dataclass
# import logging

# # Configure logging
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# @dataclass
# class NodeFeatures:
#     """Data class to hold extracted features from a Figma node"""
#     structural_features: Dict[str, Any]
#     style_features: Dict[str, Any]
#     content_features: Dict[str, Any]
#     semantic_features: List[float]

# class FigmaNodeSimilarityDetector:
#     """
#     A flexible similarity detection system for Figma nodes using multiple ML approaches.
#     Designed to be easily extensible with different similarity algorithms.
#     """
    
#     def __init__(self, similarity_threshold: float = 0.8, use_semantic_embeddings: bool = True):
#         """
#         Initialize the similarity detector
        
#         Args:
#             similarity_threshold: Threshold for considering nodes similar (0-1)
#             use_semantic_embeddings: Whether to use semantic embeddings for text content
#         """
#         self.similarity_threshold = similarity_threshold
#         self.use_semantic_embeddings = use_semantic_embeddings
#         self.node_features = {}
#         self.similarity_matrix = None
#         self.clusters = None
        
#         # Initialize semantic model if needed
#         if use_semantic_embeddings:
#             try:
#                 self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
#                 logger.info("Loaded semantic embedding model")
#             except Exception as e:
#                 logger.warning(f"Could not load semantic model: {e}")
#                 self.use_semantic_embeddings = False
    
#     def extract_node_features(self, node_data: Dict[str, Any], node_path: str = "") -> NodeFeatures:
#         """
#         Extract comprehensive features from a Figma node
        
#         Args:
#             node_data: The node data dictionary
#             node_path: Path to the node in the tree
            
#         Returns:
#             NodeFeatures object containing all extracted features
#         """
#         node_info = node_data.get('node', {})
        
#         # 1. Structural Features
#         structural_features = {
#             'type': node_info.get('type', ''),
#             'tag': node_data.get('tag', ''),
#             'has_children': len(node_data.get('children', [])) > 0,
#             'num_children': len(node_data.get('children', [])),
#             # 'width': node_info.get('width', 0),
#             # 'height': node_info.get('height', 0),
#             # 'x': node_info.get('x', 0),
#             # 'y': node_info.get('y', 0),
#         }
        
#         # 2. Style Features
#         style_features = {
#             'font_family': node_info.get('fontName', {}).get('family', ''),
#             'font_style': node_info.get('fontName', {}).get('style', ''),
#             'font_size': node_info.get('fontSize', 0),
#             'flex_direction': node_info.get('flexDirection', ''),
#             'stroke_weight': node_info.get('StrokeWeight', 0),
#             'border_radius': {
#                 'top_left': node_info.get('topLeftRadius', 0),
#                 'top_right': node_info.get('topRightRadius', 0),
#                 'bottom_left': node_info.get('bottomLeftRadius', 0),
#                 'bottom_right': node_info.get('bottomRightRadius', 0),
#             }
#         }
        
#         # Extract fill colors
#         fills = node_info.get('fills', [])
#         if fills:
#             primary_fill = fills[0]
#             color = primary_fill.get('color', {})
#             style_features.update({
#                 'fill_type': primary_fill.get('type', ''),
#                 'fill_color_r': color.get('r', 0),
#                 'fill_color_g': color.get('g', 0),
#                 'fill_color_b': color.get('b', 0),
#                 'fill_color_a': color.get('a', 1),
#             })
        
#         # Extract stroke colors
#         strokes = node_info.get('strokes', [])
#         if strokes:
#             primary_stroke = strokes[0]
#             stroke_color = primary_stroke.get('color', {})
#             style_features.update({
#                 'stroke_type': primary_stroke.get('type', ''),
#                 'stroke_color_r': stroke_color.get('r', 0),
#                 'stroke_color_g': stroke_color.get('g', 0),
#                 'stroke_color_b': stroke_color.get('b', 0),
#                 'stroke_color_a': stroke_color.get('a', 1),
#             })
        
#         # 3. Content Features
#         content_features = {
#             'text_content': node_info.get('characters', ''),
#             'name': node_data.get('name', ''),
#             'has_text': bool(node_info.get('characters', '')),
#             # 'text_length': len(node_info.get('characters', '')),
#         }
        
#         # 4. Semantic Features (using embeddings)
#         semantic_features = []
#         if self.use_semantic_embeddings:
#             text_for_embedding = f"{content_features['name']} {content_features['text_content']} {structural_features['tag']} {structural_features['type']}"
#             if text_for_embedding.strip():
#                 try:
#                     semantic_features = self.semantic_model.encode([text_for_embedding])[0].tolist()
#                 except:
#                     semantic_features = [0.0] * 384  # Default embedding size
        
#         return NodeFeatures(
#             structural_features=structural_features,
#             style_features=style_features,
#             content_features=content_features,
#             semantic_features=semantic_features
#         )
    
#     def build_similarity_matrix(self, nodes_data: Dict[str, Any]) -> np.ndarray:
#         """
#         Build similarity matrix between all nodes.
#         This method can be easily modified to try different similarity approaches.
        
#         Args:
#             nodes_data: Dictionary containing all node data
            
#         Returns:
#             Similarity matrix as numpy array
#         """
#         # Extract features for all nodes
#         all_nodes = self._flatten_nodes(nodes_data)
#         self.node_features = {}
        
#         for i, (node_path, node_data) in enumerate(all_nodes):
#             features = self.extract_node_features(node_data, node_path)
#             self.node_features[node_path] = features
        
#         # Create feature vectors for similarity calculation
#         feature_vectors = []
#         self.node_paths = list(self.node_features.keys())
        
#         for node_path in self.node_paths:
#             features = self.node_features[node_path]
#             vector = self._create_feature_vector(features)
#             feature_vectors.append(vector)
        
#         feature_vectors = np.array(feature_vectors)
        
#         # Calculate similarity matrix using cosine similarity
#         self.similarity_matrix = cosine_similarity(feature_vectors)
        
#         return self.similarity_matrix
    
#     def _create_feature_vector(self, features: NodeFeatures) -> np.ndarray:
#         """
#         Create a numerical feature vector from NodeFeatures.
#         This method can be modified to emphasize different aspects.
        
#         Args:
#             features: NodeFeatures object
            
#         Returns:
#             Numerical feature vector
#         """
#         vector = []
        
#         # Structural features (normalized)
#         struct = features.structural_features
#         vector.extend([
#             hash(struct['type']) % 1000 / 1000.0,  # Normalize hash
#             hash(struct['tag']) % 1000 / 1000.0,
#             float(struct['has_children']),
#             min(struct['num_children'] / 10.0, 1.0),  # Normalize to 0-1
#             # min(struct['width'] / 1000.0, 1.0),
#             # min(struct['height'] / 1000.0, 1.0),
#         ])
        
#         # Style features
#         style = features.style_features
#         vector.extend([
#             hash(style['font_family']) % 1000 / 1000.0 if style['font_family'] else 0,
#             min(style['font_size'] / 100.0, 1.0) if style['font_size'] else 0,
#             style.get('fill_color_r', 0),
#             style.get('fill_color_g', 0),
#             style.get('fill_color_b', 0),
#             style.get('fill_color_a', 1),
#         ])
        
#         # Content features
#         content = features.content_features
#         vector.extend([
#             float(content['has_text']),
#             # min(content['text_length'] / 100.0, 1.0),
#         ])
        
#         # Semantic features (if available)
#         if features.semantic_features:
#             # Use the full embedding (up to 384 dimensions, the default embedding size)
#             vector.extend(features.semantic_features[:384])
#         else:
#             vector.extend([0.0] * 384)
        
#         return np.array(vector)
    
#     def check_similarity(self, threshold: float = None) -> Dict[str, List[str]]:
#         """
#         Check similarity between nodes and return groups of similar nodes.
#         This method can be easily modified to use different clustering approaches.
        
#         Args:
#             threshold: Similarity threshold (uses instance threshold if None)
            
#         Returns:
#             Dictionary mapping group_id to list of similar node paths
#         """
#         if self.similarity_matrix is None:
#             raise ValueError("Must build similarity matrix first")
        
#         threshold = threshold or self.similarity_threshold
        
#         # Method 1: Simple threshold-based grouping
#         similarity_groups = self._threshold_based_grouping(threshold)
        
#         # Method 2: DBSCAN clustering (alternative approach)
#         # similarity_groups = self._dbscan_clustering()
        
#         return similarity_groups
    
#     def _threshold_based_grouping(self, threshold: float) -> Dict[str, List[str]]:
#         """Threshold-based similarity grouping"""
#         groups = {}
#         assigned = set()
#         group_counter = 0
        
#         for i in range(len(self.node_paths)):
#             if self.node_paths[i] in assigned:
#                 continue
                
#             # Find all nodes similar to current node
#             similar_indices = np.where(self.similarity_matrix[i] >= threshold)[0]
#             similar_nodes = [self.node_paths[j] for j in similar_indices if j != i]
            
#             if similar_nodes:  # If we found similar nodes
#                 group_id = f"group_{group_counter}"
#                 groups[group_id] = [self.node_paths[i]] + similar_nodes
#                 assigned.update(groups[group_id])
#                 group_counter += 1
        
#         return groups
    
#     def _dbscan_clustering(self, eps: float = 0.3, min_samples: int = 2) -> Dict[str, List[str]]:
#         """DBSCAN-based clustering (alternative approach)"""
#         # Convert similarity to distance matrix
#         distance_matrix = 1 - self.similarity_matrix
        
#         clustering = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
#         cluster_labels = clustering.fit_predict(distance_matrix)
        
#         groups = {}
#         for i, label in enumerate(cluster_labels):
#             if label != -1:  # -1 is noise in DBSCAN
#                 group_id = f"cluster_{label}"
#                 if group_id not in groups:
#                     groups[group_id] = []
#                 groups[group_id].append(self.node_paths[i])
        
#         return groups
    
#     def _flatten_nodes(self, data: Dict[str, Any], path: str = "") -> List[Tuple[str, Dict[str, Any]]]:
#         """Recursively flatten the node tree into a list"""
#         nodes = []
        
#         if isinstance(data, dict):
#             if 'node' in data:  # This is a node
#                 current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
#                 nodes.append((current_path, data))
            
#             if 'children' in data:
#                 for i, child in enumerate(data['children']):
#                     child_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
#                     nodes.extend(self._flatten_nodes(child, child_path))
        
#         return nodes
    
#     def add_node_ids_to_json(self, original_data: Dict[str, Any], similarity_groups: Dict[str, List[str]]) -> Dict[str, Any]:
#         """
#         Add node_id to each node in the original JSON based on similarity groups
        
#         Args:
#             original_data: Original JSON data
#             similarity_groups: Groups of similar nodes
            
#         Returns:
#             Modified JSON with node_id added to each node
#         """
#         # Create mapping from node path to group id
#         path_to_group = {}
#         for group_id, node_paths in similarity_groups.items():
#             for node_path in node_paths:
#                 path_to_group[node_path] = group_id
        
#         # Add node_ids recursively
#         modified_data = self._add_node_ids_recursive(original_data, path_to_group)
        
#         return modified_data
    
#     def _add_node_ids_recursive(self, data: Dict[str, Any], path_to_group: Dict[str, str], path: str = "") -> Dict[str, Any]:
#         """Recursively add node_ids to the JSON structure"""
#         if isinstance(data, dict):
#             result = data.copy()
            
#             if 'node' in data:  # This is a node
#                 current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                
#                 # Add node_id if this node is in a similarity group
#                 if current_path in path_to_group:
#                     result['node_id'] = path_to_group[current_path]
#                 else:
#                     # Generate unique ID for non-grouped nodes
#                     result['node_id'] = f"unique_{hashlib.md5(current_path.encode()).hexdigest()[:8]}"
            
#             if 'children' in data:
#                 child_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
#                 result['children'] = [
#                     self._add_node_ids_recursive(child, path_to_group, child_path)
#                     for child in data['children']
#                 ]
            
#             return result
        
#         return data

# def main():
#     """Example usage of the FigmaNodeSimilarityDetector"""
    
#     # Example JSON data (your provided data)
#     sample_data = {
#         "children": [
#             {
#                 "children": [
#                     {
#                         "children": [
#                             {
#                                 "children": [],
#                                 "name": "ICON",
#                                 "node": {
#                                     "type": "ELLIPSE",
#                                     "width": 32.0,
#                                     "height": 32.0,
#                                     "fills": [{"type": "SOLID", "color": {"r": 0.77, "g": 0.77, "b": 0.77, "a": 1.0}}]
#                                 },
#                                 "tag": "ICON"
#                             }
#                         ]
#                     }
#                 ]
#             }
#         ]
#     }
    
#     # Initialize detector
#     detector = FigmaNodeSimilarityDetector(
#         similarity_threshold=0.8,
#         use_semantic_embeddings=True
#     )
    
#     try:
#         # Build similarity matrix
#         logger.info("Building similarity matrix...")
#         similarity_matrix = detector.build_similarity_matrix(sample_data)
#         logger.info(f"Similarity matrix shape: {similarity_matrix.shape}")
        
#         # Check for similar nodes
#         logger.info("Checking for similar nodes...")
#         similarity_groups = detector.check_similarity()
#         logger.info(f"Found {len(similarity_groups)} similarity groups")
        
#         # Add node IDs to original JSON
#         logger.info("Adding node IDs to JSON...")
#         result_json = detector.add_node_ids_to_json(sample_data, similarity_groups)
        
#         # Print results
#         print("Similarity Groups:")
#         for group_id, nodes in similarity_groups.items():
#             print(f"  {group_id}: {nodes}")
        
#         print("\nJSON with node_ids:")
#         print(json.dumps(result_json, indent=2))
        
#     except Exception as e:
#         logger.error(f"Error in main execution: {e}")
#         # Fallback without semantic embeddings
#         detector_simple = FigmaNodeSimilarityDetector(
#             similarity_threshold=0.8,
#             use_semantic_embeddings=False
#         )
#         similarity_matrix = detector_simple.build_similarity_matrix(sample_data)
#         similarity_groups = detector_simple.check_similarity()
#         result_json = detector_simple.add_node_ids_to_json(sample_data, similarity_groups)
#         print(json.dumps(result_json, indent=2))

# if __name__ == "__main__":
#     main()

In [None]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer
import hashlib
from typing import Dict, List, Any, Tuple
from dataclasses import dataclass
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class FigmaNodeSimilarityDetector:
    """
    A flexible similarity detection system for Figma nodes using bottom-up approach.
    Feature vectors are extracted directly from leaf nodes and propagated upward.
    """
    
    def __init__(self, similarity_threshold: float = 0.8, use_semantic_embeddings: bool = True):
        """
        Initialize the similarity detector
        
        Args:
            similarity_threshold: Threshold for considering nodes similar (0-1)
            use_semantic_embeddings: Whether to use semantic embeddings for text content
        """
        self.similarity_threshold = similarity_threshold
        self.use_semantic_embeddings = use_semantic_embeddings
        self.node_feature_vectors = {}  # Store feature vectors directly
        self.node_metadata = {}  # Store metadata (is_leaf, etc.)
        self.similarity_matrix = None
        self.clusters = None
        self.node_tree = {}  # Store hierarchical structure
        
        # Initialize semantic model if needed
        if use_semantic_embeddings:
            try:
                self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
                logger.info("Loaded semantic embedding model")
            except Exception as e:
                logger.warning(f"Could not load semantic model: {e}")
                self.use_semantic_embeddings = False
    
    def extract_leaf_node_feature_vector(self, node_data: Dict[str, Any], node_path: str = "") -> np.ndarray:
        """
        Extract feature vector directly from a leaf Figma node

        Args:
            node_data: The node data dictionary
            node_path: Path to the node in the tree

        Returns:
            Numpy array containing the feature vector
        """
        node_info = node_data.get('node', {})
        vector = []

        # 1. Structural Features (normalized)
        node_type = node_info.get('type', '')
        tag = node_data.get('tag', '')
        has_children = len(node_data.get('children', [])) > 0
        num_children = len(node_data.get('children', []))

        vector.extend([
            hash(node_type) % 1000 / 1000.0,  # Normalize hash
            # hash(tag) % 1000 / 1000.0,
            float(has_children),
            min(num_children / 10000.0, 1.0),  # Normalize to 0-1
        ])

        # 2. Style Features
        # Extract font information from textStyle
        text_style = node_info.get('textStyle', {})
        font_family = text_style.get('fontFamily', '')
        font_size = text_style.get('fontSize', 0)
        font_style = text_style.get('fontStyle', '')
        font_weight = text_style.get('fontWeight', 0)

        # Extract fill colors
        fills = node_info.get('fills', [])
        fill_r = fill_g = fill_b = fill_a = 0
        if fills:
            primary_fill = fills[0]
            color = primary_fill.get('color', {})
            fill_r = color.get('r', 0)
            fill_g = color.get('g', 0)
            fill_b = color.get('b', 0)
            fill_a = color.get('a', 1)

        # Extract stroke colors
        strokes = node_info.get('strokes', [])
        stroke_r = stroke_g = stroke_b = stroke_a = 0
        if strokes:
            primary_stroke = strokes[0]
            stroke_color = primary_stroke.get('color', {})
            stroke_r = stroke_color.get('r', 0)
            stroke_g = stroke_color.get('g', 0)
            stroke_b = stroke_color.get('b', 0)
            stroke_a = stroke_color.get('a', 1)

        vector.extend([
            hash(font_family) % 1000 / 1000.0 if font_family else 0,
            min(font_size / 10000.0, 1.0) if font_size else 0,  # Normalize font size to 0-1 (assuming max ~100px)
            hash(font_style) % 1000 / 1000.0 if font_style else 0,
            min(font_weight / 10000.0, 1.0) if font_weight else 0,  # Normalize font weight (max ~900-1000)
            fill_r, fill_g, fill_b, fill_a,
            stroke_r, stroke_g, stroke_b, stroke_a,
        ])

        # 3. Content Features
        text_content = node_info.get('characters', '')
        name = node_data.get('name', '')
        has_text = bool(text_content)

        vector.extend([
            float(has_text),
        ])

        # 4. Semantic Features (using embeddings) - commented out for now
        # if self.use_semantic_embeddings:
        #     text_for_embedding = f"{name} {node_type}"
        #     if text_for_embedding.strip():
        #         try:
        #             semantic_features = self.semantic_model.encode([text_for_embedding])[0].tolist()
        #             vector.extend(semantic_features)
        #         except:
        #             vector.extend([0.0] * 384)  # Default embedding size
        #     else:
        #         vector.extend([0.0] * 384)
        # else:
        #     vector.extend([0.0] * 384)

        # 5. Descendant Count Feature (for leaf nodes, this is 0)
        total_descendants = 0  # Leaf nodes have no descendants
        vector.extend([
            total_descendants
        ])

        return np.array(vector)

    def create_parent_feature_vector(self, children_vectors: List[np.ndarray], node_data: Dict[str, Any]) -> np.ndarray:
        """
        Create parent node feature vector by averaging children feature vectors

        Args:
            children_vectors: List of children feature vectors
            node_data: Parent node data

        Returns:
            Feature vector for parent node
        """
        if not children_vectors:
            # If no children vectors, create a default vector
            default_size = 0 + 17  # Updated size: structural(3) + style(12) + content(1) + descendant(1)
            return np.zeros(default_size)

        # Average all children feature vectors
        parent_vector = np.mean(children_vectors, axis=0)

        # Override some structural features specific to the parent
        node_info = node_data.get('node', {})
        node_type = node_info.get('type', '')
        tag = node_data.get('tag', '')
        num_children = len(children_vectors)

        # Update structural features (first 3 elements)
        parent_vector[0] = hash(node_type) % 1000 / 1000.0
        # parent_vector[1] = hash(tag) % 1000 / 1000.0
        parent_vector[1] = 1.0  # Parent always has children
        parent_vector[2] = min(num_children / 10000.0, 1.0)

        # Calculate total descendants
        total_descendants = num_children
        for child_vector in children_vectors:
            # The descendant count is the last feature in the vector
            child_descendants = int(child_vector[-1] * 10000)  # Denormalize
            total_descendants += child_descendants

        # Update the descendant count feature (last element)
        parent_vector[-1] = min(total_descendants / 10000.0, 1.0)

        return parent_vector

    def get_total_descendants_count(self, node_vector: np.ndarray) -> int:
        """
        Extract the total descendants count from a node's feature vector

        Args:
            node_vector: The feature vector of a node

        Returns:
            Total number of descendants for this node
        """
        # The descendant count is the last feature, denormalized
        return int(node_vector[-1] * 10000)
    
    def build_similarity_matrix(self, nodes_data: Dict[str, Any]) -> np.ndarray:
        """
        Build similarity matrix using bottom-up approach.
        Feature vectors are extracted from leaves and propagated upward.
        
        Args:
            nodes_data: Dictionary containing all node data
            
        Returns:
            Similarity matrix as numpy array
        """
        # Step 1: Build the tree structure and identify leaf nodes
        self.node_tree = self._build_node_tree(nodes_data)
        
        # Step 2: Extract feature vectors bottom-up
        self.node_feature_vectors = {}
        self.node_metadata = {}
        self._extract_feature_vectors_bottom_up(nodes_data)
        
        # Step 3: Filter out leaf nodes for similarity calculation (only compare non-leaf nodes)
        non_leaf_paths = [path for path, metadata in self.node_metadata.items() if not metadata['is_leaf']]
        self.node_paths = non_leaf_paths
        
        if not self.node_paths:
            logger.warning("No non-leaf nodes found for similarity comparison")
            return np.array([])
        
        # Step 4: Get feature vectors for non-leaf nodes only
        feature_vectors = []
        for node_path in self.node_paths:
            vector = self.node_feature_vectors[node_path]
            feature_vectors.append(vector)
        
        feature_vectors = np.array(feature_vectors)
        
        # Step 5: Calculate similarity matrix using cosine similarity
        self.similarity_matrix = cosine_similarity(feature_vectors)
        
        return self.similarity_matrix
    
    def _build_node_tree(self, data: Dict[str, Any], path: str = "", parent_path: str = None) -> Dict[str, Dict]:
        """Build a tree structure mapping node paths to their metadata"""
        tree = {}
        
        if isinstance(data, dict):
            if 'node' in data:  # This is a node
                current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                
                tree[current_path] = {
                    'data': data,
                    'parent': parent_path,
                    'children': [],
                    'is_leaf': len(data.get('children', [])) == 0
                }
                
                if 'children' in data:
                    for child in data['children']:
                        child_tree = self._build_node_tree(child, current_path, current_path)
                        tree.update(child_tree)
                        # Add child paths to current node
                        for child_path in child_tree.keys():
                            if child_tree[child_path]['parent'] == current_path:
                                tree[current_path]['children'].append(child_path)
        
        return tree
    
    def _extract_feature_vectors_bottom_up(self, data: Dict[str, Any], path: str = ""):
        """Extract feature vectors using bottom-up approach"""
        if isinstance(data, dict):
            if 'node' in data:  # This is a node
                current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                
                # First, process all children
                children_vectors = []
                if 'children' in data:
                    for child in data['children']:
                        child_path = f"{current_path}"
                        self._extract_feature_vectors_bottom_up(child, child_path)
                        
                        # Get child path and vector
                        child_node_path = f"{current_path}/{child.get('name', 'unnamed')}"
                        if child_node_path in self.node_feature_vectors:
                            children_vectors.append(self.node_feature_vectors[child_node_path])
                
                # Extract feature vector for current node
                is_leaf = len(data.get('children', [])) == 0
                
                if is_leaf:  # Leaf node
                    self.node_feature_vectors[current_path] = self.extract_leaf_node_feature_vector(data, current_path)
                else:  # Parent node - aggregate children vectors
                    self.node_feature_vectors[current_path] = self.create_parent_feature_vector(children_vectors, data)
                
                # Store metadata
                self.node_metadata[current_path] = {
                    'is_leaf': is_leaf,
                    'num_children': len(children_vectors),
                    'node_type': data.get('node', {}).get('type', ''),
                    'name': data.get('name', '')
                }
    
    def check_similarity(self, threshold: float = None) -> Dict[str, List[str]]:
        """
        Check similarity between non-leaf nodes and return groups of similar nodes.
        Leaf nodes are excluded from grouping.
        
        Args:
            threshold: Similarity threshold (uses instance threshold if None)
            
        Returns:
            Dictionary mapping group_id to list of similar node paths (non-leaf nodes only)
        """
        if self.similarity_matrix is None:
            raise ValueError("Must build similarity matrix first")
        
        if len(self.similarity_matrix) == 0:
            logger.warning("No non-leaf nodes available for similarity comparison")
            return {}
        
        threshold = threshold or self.similarity_threshold
        
        # Method 1: Simple threshold-based grouping (only for non-leaf nodes)
        similarity_groups = self._threshold_based_grouping(threshold)
        
        return similarity_groups
    
    def _threshold_based_grouping(self, threshold: float) -> Dict[str, List[str]]:
        """Threshold-based similarity grouping for non-leaf nodes only"""
        groups = {}
        assigned = set()
        group_counter = 0
        
        for i in range(len(self.node_paths)):
            if self.node_paths[i] in assigned:
                continue
                
            # Find all nodes similar to current node
            similar_indices = np.where(self.similarity_matrix[i] >= threshold)[0]
            similar_nodes = [self.node_paths[j] for j in similar_indices if j != i]
            
            if similar_nodes:  # If we found similar nodes
                group_id = f"group_{group_counter}"
                groups[group_id] = [self.node_paths[i]] + similar_nodes
                assigned.update(groups[group_id])
                group_counter += 1
        
        return groups
    
    def _dbscan_clustering(self, eps: float = 0.3, min_samples: int = 2) -> Dict[str, List[str]]:
        """
        DBSCAN-based clustering (alternative approach) for non-leaf nodes only.

        Args:
            eps: Maximum distance (1 - similarity) between two nodes for them
                to be considered neighbours
            min_samples: Minimum neighbourhood size for a core point

        Returns:
            Dictionary mapping "cluster_<label>" to the list of member node
            paths; nodes DBSCAN labels as noise (-1) are omitted
        """
        # Convert similarity to a distance matrix. Cosine similarities can
        # exceed 1.0 by a rounding error, which would make the distance
        # slightly negative; clip at zero because a precomputed distance
        # matrix is expected to be non-negative.
        distance_matrix = np.clip(1 - self.similarity_matrix, 0.0, None)

        clustering = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
        cluster_labels = clustering.fit_predict(distance_matrix)

        groups = {}
        for i, label in enumerate(cluster_labels):
            if label == -1:  # -1 is noise in DBSCAN
                continue
            groups.setdefault(f"cluster_{label}", []).append(self.node_paths[i])

        return groups
    
    def add_node_ids_to_json(self, original_data: Dict[str, Any], similarity_groups: Dict[str, List[str]]) -> Dict[str, Any]:
        """
        Annotate the JSON tree with a node_id per node, driven by similarity groups.

        The groups are first inverted into a path -> group_id lookup (if a path
        is listed in several groups, the group iterated last wins), then the
        tree is annotated by ``_add_node_ids_recursive``.

        Args:
            original_data: Original JSON data
            similarity_groups: Mapping of group_id to the node paths in that group

        Returns:
            Whatever ``_add_node_ids_recursive`` returns for ``original_data``
        """
        group_of_path = {
            member_path: group_id
            for group_id, member_paths in similarity_groups.items()
            for member_path in member_paths
        }
        return self._add_node_ids_recursive(original_data, group_of_path)
    
    def _add_node_ids_recursive(self, data: Dict[str, Any], path_to_group: Dict[str, str], path: str = "") -> Dict[str, Any]:
        """Recursively add node_ids to the JSON structure"""
        if isinstance(data, dict):
            result = data.copy()
            
            if 'node' in data:  # This is a node
                current_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                
                # Check if this is a leaf node
                is_leaf = len(data.get('children', [])) == 0
                
                if is_leaf:
                    # Leaf nodes get unique IDs
                    result['node_id'] = f"leaf_{hashlib.md5(current_path.encode()).hexdigest()[:8]}"
                else:
                    # Non-leaf nodes get group IDs if they're in a similarity group
                    if current_path in path_to_group:
                        result['node_id'] = path_to_group[current_path]
                    else:
                        # Generate unique ID for non-grouped non-leaf nodes
                        result['node_id'] = f"unique_{hashlib.md5(current_path.encode()).hexdigest()[:8]}"
            
            if 'children' in data:
                child_path = f"{path}/{data.get('name', 'unnamed')}" if path else data.get('name', 'root')
                result['children'] = [
                    self._add_node_ids_recursive(child, path_to_group, child_path)
                    for child in data['children']
                ]
            
            return result
        
        return data

    def get_feature_vector_info(self) -> Dict[str, Any]:
        """
        Summarise the extracted feature vectors for debugging/analysis

        Returns:
            ``{"error": ...}`` when no vectors have been extracted; otherwise a
            dictionary with node counts (total / leaf / non-leaf), the length of
            the first stored vector, and a fixed per-category feature breakdown.
        """
        vectors = self.node_feature_vectors
        if not vectors:
            return {"error": "No feature vectors extracted yet"}

        lengths = list(map(len, vectors.values()))
        leaf_total = len([meta for meta in self.node_metadata.values() if meta['is_leaf']])

        # Category sizes are fixed constants; only the semantic part depends
        # on whether embeddings are enabled.
        breakdown = {
            "structural_features": 3,
            "style_features": 12,  # font_family, font_size, font_style, font_weight + colors
            "content_features": 1,
            "descendant_count": 1,
            "semantic_features": 384 if self.use_semantic_embeddings else 0,
        }

        summary = {
            "total_nodes": len(vectors),
            "leaf_nodes": leaf_total,
            "non_leaf_nodes": len(self.node_metadata) - leaf_total,
            "feature_vector_length": lengths[0] if lengths else 0,
            "feature_breakdown": breakdown,
        }
        return summary

    def print_figma_tree_with_vectors(self, node, depth=0, path=""):
        """
        Print the Figma node tree with feature vectors

        One line is printed per node, indented two spaces per level, showing a
        display name, tag, name, layout, node_id and the node's feature vector.
        Nodes that have no stored vector (or an empty one) are marked
        "Vector: Not found".

        Args:
            node: The current node to print
            depth: Current depth in the tree
            path: Path of the parent node ("" for the root)
        """
        indent = "  " * depth  # 2 spaces per level

        # Extract info
        name = node.get("name", "[no name]")
        tag = node.get("tag", "[no tag]")
        node_id = node.get("node_id", "")

        # TEXT nodes are shown by (truncated) text content rather than name
        node_data = node.get("node", {})
        characters = node_data.get("characters", "")
        is_text = tag == "TEXT"
        display_name = characters[:10] + "..." if is_text and characters else name

        # Layout info (if present)
        layout = node_data.get("layoutMode", "NONE")
        layout_str = "ROWS" if layout == "HORIZONTAL" else "COLS" if layout == "VERTICAL" else layout

        # Vectors are looked up by slash-joined node path
        current_path = f"{path}/{name}" if path else name

        # Look the vector up once. Nodes without a stored vector must not reach
        # the formatting branch: previously that raised UnboundLocalError.
        vector = self.node_feature_vectors.get(current_path)
        if vector is not None and len(vector) > 0:
            vector_str = f" | Vector: [{', '.join(f'{v:.7f}' for v in vector)}] (len={len(vector)})"
        else:
            vector_str = " | Vector: Not found"

        # Print current node info with feature vector
        print(f"{indent}- {display_name} [{tag}] -> {name} {layout_str} ({node_id}){vector_str}")

        # Recursively print children
        for child in node.get("children", []):
            self.print_figma_tree_with_vectors(child, depth + 1, current_path)

    def print_detailed_vector_info(self, node_path: str):
        """
        Print detailed breakdown of a specific node's feature vector

        The breakdown follows the fixed layout this method indexes: 3 structural
        values, 12 style values, 1 content value and 1 descendant-count value
        (17 in total), optionally followed by semantic embedding values.

        Args:
            node_path: Path to the node whose vector to analyze
        """
        if node_path not in self.node_feature_vectors:
            print(f"No feature vector found for path: {node_path}")
            return

        vector = self.node_feature_vectors[node_path]
        metadata = self.node_metadata.get(node_path, {})

        print(f"\nDetailed vector breakdown for: {node_path}")
        print(f"Node type: {metadata.get('node_type', 'unknown')}")
        print(f"Is leaf: {metadata.get('is_leaf', 'unknown')}")
        print(f"Children count: {metadata.get('num_children', 'unknown')}")
        print(f"Vector length: {len(vector)}")

        # The breakdown below indexes positions 0..16; bail out cleanly instead
        # of failing with IndexError halfway through the printout.
        fixed_feature_count = 17
        if len(vector) < fixed_feature_count:
            print(f"\nVector has fewer than {fixed_feature_count} values; skipping feature breakdown")
            return

        def denormalize(value):
            # Recover the pre-normalisation integer, assuming a 10000 scale
            # factor (NOTE(review): confirm against the feature extractor).
            # Round rather than truncate: a product such as 28.999999999999996
            # must read as 29, not 28.
            return int(round(float(value) * 10000))

        # Break down the vector components
        print("\nFeature breakdown:")
        idx = 0

        # Structural features (3)
        print("  Structural features [0-2]:")
        print(f"    Node type hash: {vector[idx]:.6f}")
        idx += 1
        print(f"    Has children: {vector[idx]:.6f}")
        idx += 1
        print(f"    Children count (norm): {vector[idx]:.6f} (actual: {denormalize(vector[idx])})")
        idx += 1

        # Style features (12)
        print("  Style features [3-14]:")
        print(f"    Font family hash: {vector[idx]:.6f}")
        idx += 1
        print(f"    Font size (norm): {vector[idx]:.6f} (actual: {denormalize(vector[idx])}px)")
        idx += 1
        print(f"    Font style hash: {vector[idx]:.6f}")
        idx += 1
        print(f"    Font weight (norm): {vector[idx]:.6f} (actual: {denormalize(vector[idx])})")
        idx += 1
        print(f"    Fill RGBA: ({vector[idx]:.3f}, {vector[idx+1]:.3f}, {vector[idx+2]:.3f}, {vector[idx+3]:.3f})")
        idx += 4
        print(f"    Stroke RGBA: ({vector[idx]:.3f}, {vector[idx+1]:.3f}, {vector[idx+2]:.3f}, {vector[idx+3]:.3f})")
        idx += 4

        # Content features (1)
        print("  Content features [15]:")
        print(f"    Has text: {vector[idx]:.6f}")
        idx += 1

        # Descendant count (1)
        print("  Descendant count [16]:")
        print(f"    Total descendants (norm): {vector[idx]:.6f} (actual: {denormalize(vector[idx])})")
        idx += 1

        # Anything past the 17 fixed values is reported as semantic features
        if self.use_semantic_embeddings and len(vector) > 17:
            print(f"  Semantic features [17-{len(vector)-1}]: {len(vector) - 17} dimensions")
            print(f"    First 5 semantic values: {vector[17:22]}")
            print(f"    Last 5 semantic values: {vector[-5:]}")

  from .autonotebook import tqdm as notebook_tqdm





In [59]:
import json

def rename_names_and_store_mapping(node, counter, mapping):
    """Replace every "name" value in a nested structure with a placeholder.

    Walks dicts and lists depth-first. Each dict that has a "name" key gets the
    value replaced by ``name<N>``, where N is the running count in
    ``counter[0]``; the placeholder -> original value pair is recorded in
    ``mapping``. ``node``, ``counter`` and ``mapping`` are all mutated in place.

    Args:
        node: Parsed JSON value (dict, list or scalar).
        counter: Single-element list holding the next placeholder index.
        mapping: Dict that receives placeholder -> original name entries.
    """
    if isinstance(node, dict):
        if "name" in node:
            placeholder = f"name{counter[0]}"
            mapping[placeholder] = node["name"]
            node["name"] = placeholder
            counter[0] += 1
        children = node.values()
    elif isinstance(node, list):
        children = node
    else:
        # Scalars carry no names and have nothing to descend into.
        return

    for child in children:
        rename_names_and_store_mapping(child, counter, mapping)

# Load input JSON
# JSON interchange files are UTF-8; name the encoding explicitly so reading an
# export with non-ASCII text does not depend on the platform's default encoding.
with open("PAGE_115.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Rename names and save mapping (placeholder -> original name)
name_mapping = {}
rename_names_and_store_mapping(data, [0], name_mapping)

# Save modified JSON and mapping
with open("modified.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)

with open("name_mapping.json", "w", encoding="utf-8") as f:
    json.dump(name_mapping, f, indent=2)


In [71]:
# Initialize the detector
# The threshold is set just below 1.0, so only nodes whose similarity score is
# (numerically) almost exactly 1 end up in the same group.
detector = FigmaNodeSimilarityDetector(
    similarity_threshold=0.99999999999,
    use_semantic_embeddings=True
)

# Load your JSON data (the renamed tree written to modified.json)
with open('modified.json', 'r') as f:
    figma_data = json.load(f)

# Build similarity matrix (must happen before check_similarity, which raises
# ValueError when no matrix has been built)
similarity_matrix = detector.build_similarity_matrix(figma_data)

# Find similar nodes; with no argument the instance threshold above is used
similarity_groups = detector.check_similarity()
print(similarity_groups)
# Add node_ids to original JSON
result_json = detector.add_node_ids_to_json(figma_data, similarity_groups)


# Overwrite modified.json in place with the node_id-annotated tree
with open("modified.json", "w") as f:
    json.dump(result_json, f, indent=2)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:__main__:Loaded semantic embedding model


{'group_0': ['name0/name1/name2/name3/name4', 'name0/name1/name2/name3/name6', 'name0/name1/name2/name8/name11/name12', 'name0/name1/name2/name8/name11/name14', 'name0/name1/name2/name8/name11/name16'], 'group_1': ['name0/name1/name18/name22/name23', 'name0/name1/name32/name33/name35/name36/name39/name40/name41', 'name0/name1/name32/name52/name53/name54/name57/name58/name59', 'name0/name1/name32/name71/name73/name74/name77/name78/name79', 'name0/name1/name32/name90/name91/name92/name95/name96/name97'], 'group_2': ['name0/name1/name18/name22', 'name0/name1/name32/name33/name35/name36/name39/name40', 'name0/name1/name32/name52/name53/name54/name57/name58', 'name0/name1/name32/name71/name73/name74/name77/name78', 'name0/name1/name32/name90/name91/name92/name95/name96'], 'group_3': ['name0/name1/name18/name26', 'name0/name1/name32/name33/name35/name36/name39/name45', 'name0/name1/name32/name52/name53/name54/name57/name63', 'name0/name1/name32/name71/name73/name74/name77/name83', 'name0/nam

In [72]:
def restore_names_from_mapping(node, mapping):
    """Swap placeholder "name" values back to their originals, in place.

    Walks dicts and lists depth-first. A dict's "name" is replaced by
    ``mapping[name]`` whenever the current value is a key of ``mapping``;
    names not present in ``mapping`` are left untouched.

    Args:
        node: Parsed JSON value (dict, list or scalar); mutated in place.
        mapping: Dict of placeholder name -> original name.
    """
    if isinstance(node, list):
        for element in node:
            restore_names_from_mapping(element, mapping)
        return

    if not isinstance(node, dict):
        return

    if "name" in node:
        current = node["name"]
        if current in mapping:
            node["name"] = mapping[current]

    for value in node.values():
        restore_names_from_mapping(value, mapping)

# Load modified JSON and name mapping
# (modified.json holds the tree with placeholder names; name_mapping.json maps
# each placeholder back to the original name)
with open("modified.json", "r") as f:
    modified_data = json.load(f)

with open("name_mapping.json", "r") as f:
    name_mapping = json.load(f)

# Restore original names (mutates modified_data in place)
restore_names_from_mapping(modified_data, name_mapping)

# Save restored JSON
with open("restored.json", "w") as f:
    json.dump(modified_data, f, indent=2)


In [75]:
# Print the annotated tree, one line per node, including each node's feature vector.
detector.print_figma_tree_with_vectors(result_json)

- name0 [DIV] -> name0 NONE (unique_ad6f6c09) | Vector: [0.0500000, 1.0000000, 0.0001000, 0.5193714, 0.0014182, 0.1343375, 0.0307381, 0.3121440, 0.3349568, 0.3656767, 0.8095238, 0.0151225, 0.0174615, 0.0195413, 0.0455357, 0.7095238, 0.0113000] (len=17)
  - name1 [DIV] -> name1 NONE (unique_917b8603) | Vector: [0.2020000, 1.0000000, 0.0005000, 0.5193714, 0.0014182, 0.1343375, 0.0307381, 0.3121440, 0.3349568, 0.3656767, 0.8095238, 0.0151225, 0.0174615, 0.0195413, 0.0455357, 0.7095238, 0.0112000] (len=17)
    - name2 [NAVBAR] -> name2 NONE (unique_a21c5b2f) | Vector: [0.2020000, 1.0000000, 0.0002000, 0.5490000, 0.0016500, 0.1320000, 0.0300000, 0.2666667, 0.3215686, 0.3725490, 1.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.7500000, 0.0014000] (len=17)
      - name3 [LIST] -> name3 NONE (unique_831dabf8) | Vector: [0.2020000, 1.0000000, 0.0002000, 0.7320000, 0.0022000, 0.1760000, 0.0400000, 0.2666667, 0.3215686, 0.3725490, 1.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 1.0

In [4]:
# Persist the node_id-annotated tree as indented JSON.
with open('PAGE_1_OUTPUT.json', 'w') as f:
    json.dump(result_json, f, indent=2)


In [42]:
# Dump the annotated tree as indented JSON for visual inspection.
print(json.dumps(result_json, indent=2))


{
  "children": [
    {
      "children": [
        {
          "children": [
            {
              "children": [
                {
                  "children": [
                    {
                      "children": [],
                      "name": "Travel",
                      "node": {
                        "StrokeWeight": 0.0,
                        "bottomLeftRadius": 0.0,
                        "bottomRightRadius": 0.0,
                        "characters": "Travel",
                        "fills": [
                          {
                            "blendMode": "NORMAL",
                            "color": {
                              "a": 1.0,
                              "b": 0.37254902720451355,
                              "g": 0.32156863808631897,
                              "r": 0.2666666805744171
                            },
                            "imageRef": "",
                            "type": "SOLID"
                          }


In [76]:
def print_figma_node(node, depth=0):
    """Print a node and its descendants as an indented outline.

    Each line has the form
    ``<indent>- <display> [<tag>] -> <name> <layout> (<node_id>)`` with two
    spaces of indent per level. TEXT-tagged nodes with non-empty characters are
    displayed by the first 10 characters followed by "..."; all other nodes by
    name. The layout mode is shown as ROWS for HORIZONTAL, COLS for VERTICAL,
    and verbatim otherwise (NONE when absent).

    Args:
        node: Node dict; missing keys fall back to placeholder values.
        depth: Nesting level of ``node`` (0 for the root).
    """
    payload = node.get("node", {})

    label = node.get("name", "[no name]")
    kind = node.get("tag", "[no tag]")
    identifier = node.get("node_id", "")

    # TEXT nodes are shown by (truncated) content instead of their name
    text = payload.get("characters", "")
    if kind == "TEXT" and text:
        shown = text[:10] + "..."
    else:
        shown = label

    # Translate the layout mode into row/column wording
    mode = payload.get("layoutMode", "NONE")
    if mode == "HORIZONTAL":
        layout_label = "ROWS"
    elif mode == "VERTICAL":
        layout_label = "COLS"
    else:
        layout_label = mode

    padding = "  " * depth  # 2 spaces per level
    print(f"{padding}- {shown} [{kind}] -> {label} {layout_label} ({identifier})")

    next_depth = depth + 1
    for child in node.get("children", []):
        print_figma_node(child, next_depth)



# Load the tree with original names restored and print it as an indented outline.
with open("restored.json", "r") as f:
    restored_data = json.load(f)
print_figma_node(restored_data)

- TREE BUILDER GROUP [DIV] -> TREE BUILDER GROUP NONE (unique_ad6f6c09)
  - PAGE_115 [DIV] -> PAGE_115 NONE (unique_917b8603)
    - NAVBAR [NAVBAR] -> NAVBAR NONE (unique_a21c5b2f)
      - LIST [LIST] -> LIST NONE (unique_831dabf8)
        - LI [LI] -> LI NONE (group_0)
          - Travel [P] -> Travel NONE (leaf_1d6736ed)
        - LI [LI] -> LI NONE (group_0)
          - Travel [P] -> Travel NONE (leaf_538018bc)
      - TREE BUILDER GROUP [DIV] -> TREE BUILDER GROUP NONE (unique_4145b98c)
        - ICON [ICON] -> ICON NONE (unique_a9ddecc4)
          - Vector [DIV] -> Vector NONE (leaf_77e8672a)
        - LIST [LIST] -> LIST NONE (unique_6aefeabf)
          - LI [LI] -> LI NONE (group_0)
            - A [A] -> A NONE (leaf_9134004b)
          - LI [LI] -> LI NONE (group_0)
            - A [A] -> A NONE (leaf_54c1e5d7)
          - LI [LI] -> LI NONE (group_0)
            - A [A] -> A NONE (leaf_89f588e2)
    - TREE BUILDER GROUP [DIV] -> TREE BUILDER GROUP NONE (unique_c3ea6ea6)
     