# LDA-Based Taxonomy Tree for n8n Node Schemas

這個 notebook 針對 n8n node schemas 進行 LDA 分析：
1. 解析 node_schemas/ 目錄中的 JSON 檔案
2. 先透過 LDA 偵測出不同的 topics
3. 輸出 topic, words, 對應的 node schema 檔
4. 利用這些 topics 建立 taxonomy tree
5. 專注於 n8n nodes 的功能分類

In [1]:
import sys
import os
from pathlib import Path
import json
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import seaborn as sns

# 設定專案路徑
project_root = Path().absolute()
node_schemas_path = project_root / "node_schemas"

print(f"Project root: {project_root}")
print(f"Working directory: {Path().absolute()}")
print(f"Node schemas path: {node_schemas_path}")

# 檢查 node_schemas 目錄是否存在
if not node_schemas_path.exists():
    print(f"Error: {node_schemas_path} does not exist!")
else:
    schema_files = list(node_schemas_path.glob("*.json"))
    print(f"Found {len(schema_files)} JSON schema files")

Project root: /Users/yu/Desktop/projects/gss_cai/n8n_AI_Agent
Working directory: /Users/yu/Desktop/projects/gss_cai/n8n_AI_Agent
Node schemas path: /Users/yu/Desktop/projects/gss_cai/n8n_AI_Agent/node_schemas
Found 792 JSON schema files


In [2]:
class NodeSchemaParser:
    def __init__(self, schemas_dir):
        self.schemas_dir = Path(schemas_dir)
    
    def extract_text_from_json(self, data, exclude_keys=None):
        if exclude_keys is None:
            exclude_keys = {'type', 'required', 'default', 'noDataExpression'}
        
        texts = []
        
        if isinstance(data, dict):
            for key, value in data.items():
                if key in exclude_keys:
                    continue
                if isinstance(value, str) and len(value.strip()) > 1:
                    texts.append(value.strip())
                elif isinstance(value, (dict, list)):
                    texts.extend(self.extract_text_from_json(value, exclude_keys))
        elif isinstance(data, list):
            for item in data:
                texts.extend(self.extract_text_from_json(item, exclude_keys))
        elif isinstance(data, str) and len(data.strip()) > 1:
            texts.append(data.strip())
        
        return texts
    
    def parse_schema_file(self, file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                schema_data = json.load(f)
            
            name = schema_data.get('name', file_path.stem)
            display_name = schema_data.get('displayName', '')
            description = schema_data.get('description', '')
            title = display_name if display_name else name
            
            all_texts = self.extract_text_from_json(schema_data)
            content = ' '.join(all_texts)
            
            return {
                'file_name': file_path.name,
                'file_path': str(file_path),
                'name': name,
                'display_name': display_name,
                'description': description,
                'title': title,
                'content': content,
                'raw_data': schema_data
            }
        
        except Exception as e:
            print(f"Error parsing {file_path}: {e}")
            return None
    
    def parse_all_schemas(self):
        schemas = {}
        schema_files = list(self.schemas_dir.glob("*.json"))
        
        for file_path in schema_files:
            schema_info = self.parse_schema_file(file_path)
            if schema_info:
                schemas[schema_info['file_name']] = schema_info
        
        return schemas

print("NodeSchemaParser class created successfully!")

NodeSchemaParser class created successfully!


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import silhouette_score
import re

class TopicModeler:
    def __init__(self, n_topics=10, random_state=42):
        self.n_topics = n_topics
        self.random_state = random_state
        self.vectorizer = None
        self.lda_model = None
        self.feature_names = None
    
    def preprocess_text(self, text):
        text = re.sub(r'[^\w\s\u4e00-\u9fff]', ' ', text)
        text = text.lower()
        text = ' '.join(text.split())
        return text
    
    def fit_transform(self, documents):
        processed_docs = [self.preprocess_text(doc) for doc in documents]
        processed_docs = [doc for doc in processed_docs if len(doc.strip()) > 0]
        
        self.vectorizer = CountVectorizer(
            max_features=1000,
            min_df=2,
            max_df=0.8,
            stop_words='english',
            ngram_range=(1, 2)
        )
        
        doc_term_matrix = self.vectorizer.fit_transform(processed_docs)
        self.feature_names = self.vectorizer.get_feature_names_out()
        
        self.lda_model = LatentDirichletAllocation(
            n_components=self.n_topics,
            random_state=self.random_state,
            max_iter=10,
            learning_method='online'
        )
        
        doc_topic_probs = self.lda_model.fit_transform(doc_term_matrix)
        perplexity = self.lda_model.perplexity(doc_term_matrix)
        
        results = self._generate_results(doc_topic_probs)
        results['model_perplexity'] = perplexity
        
        return results
    
    def _generate_results(self, doc_topic_probs):
        results = {
            'topics': [],
            'document_topics': [],
            'topic_similarities': []
        }
        
        for topic_idx in range(self.n_topics):
            topic_words = []
            topic_weights = self.lda_model.components_[topic_idx]
            top_indices = topic_weights.argsort()[-10:][::-1]
            
            for idx in top_indices:
                topic_words.append({
                    'word': self.feature_names[idx],
                    'weight': float(topic_weights[idx])
                })
            
            top_words = [w['word'] for w in topic_words[:3]]
            label = ' + '.join(top_words).title()
            coherence = np.mean([w['weight'] for w in topic_words[:5]])
            
            results['topics'].append({
                'id': topic_idx,
                'label': label,
                'top_words': topic_words,
                'coherence': float(coherence)
            })
        
        for doc_idx, topic_probs in enumerate(doc_topic_probs):
            dominant_topic = int(np.argmax(topic_probs))
            dominant_prob = float(np.max(topic_probs))
            
            results['document_topics'].append({
                'document_index': doc_idx,
                'dominant_topic': dominant_topic,
                'dominant_topic_probability': dominant_prob,
                'all_topic_probabilities': topic_probs.tolist()
            })
        
        topic_similarities = np.corrcoef(self.lda_model.components_)
        results['topic_similarities'] = topic_similarities.tolist()
        
        return results

print("TopicModeler class created successfully!")

TopicModeler class created successfully!


## Step 1: 載入和解析 Node Schema 檔案

In [4]:
# 解析 Node Schema 檔案
parser = NodeSchemaParser(node_schemas_path)
schemas = parser.parse_all_schemas()

print(f"Successfully parsed {len(schemas)} schema files")
print(f"First few files: {list(schemas.keys())[:5]}")

# 顯示範例 schema 內容
if schemas:
    first_schema = next(iter(schemas.values()))
    print(f"\nExample schema: {first_schema['file_name']}")
    print(f"Display name: {first_schema['display_name']}")
    print(f"Description: {first_schema['description'][:100]}...")
    print(f"Content length: {len(first_schema['content'])} characters")
    print(f"Title: {first_schema['title']}")

Successfully parsed 792 schema files
First few files: ['onfleet.json', 'lemlistTool.json', 'chat.json', 'wait.json', 'crowdDevTool.json']

Example schema: onfleet.json
Display name: Onfleet
Description: Consume Onfleet API...
Content length: 22165 characters
Title: Onfleet


## Step 2: 準備文件內容進行 LDA 分析

In [5]:
# 準備文件內容
documents = []
file_mapping = []

for file_path, schema_data in schemas.items():
    content = schema_data["content"]
    if len(content.strip()) > 50:
        documents.append(content)
        file_mapping.append({
            "file_path": schema_data["file_path"],
            "file_name": schema_data["file_name"],
            "title": schema_data["title"],
            "display_name": schema_data["display_name"],
            "description": schema_data["description"]
        })

print(f"Prepared {len(documents)} documents for LDA analysis")

content_lengths = [len(doc) for doc in documents]
print(f"Content length - Min: {min(content_lengths)}, Max: {max(content_lengths)}, Mean: {np.mean(content_lengths):.0f}")

Prepared 792 documents for LDA analysis
Content length - Min: 131, Max: 136209, Mean: 6732


## Step 3: 執行 LDA Topic Modeling

In [6]:
# 執行 LDA
topic_modeler = TopicModeler()
lda_results = topic_modeler.fit_transform(documents)

print(f"LDA analysis completed!")
print(f"Generated {len(lda_results['topics'])} topics")
print(f"Model perplexity: {lda_results.get('model_perplexity', 'N/A'):.2f}")

LDA analysis completed!
Generated 10 topics
Model perplexity: 264.39


## Step 4: 顯示 Topics 和對應的檔案

In [7]:
def display_topic_details(topic_id, min_probability=0.2):
    topic_info = lda_results['topics'][topic_id]
    
    print(f"\n=== Topic {topic_id}: {topic_info['label']} ===")
    print(f"Coherence Score: {topic_info['coherence']:.3f}")
    
    print("\nTop Words:")
    for word_info in topic_info['top_words'][:10]:
        print(f"  {word_info['word']: <20} {word_info['weight']:.4f}")
    
    topic_files = []
    for doc_idx, doc_topic in enumerate(lda_results['document_topics']):
        if doc_topic['dominant_topic'] == topic_id and doc_topic['dominant_topic_probability'] >= min_probability:
            file_info = file_mapping[doc_idx]
            topic_files.append({
                **file_info,
                'probability': doc_topic['dominant_topic_probability']
            })
    
    topic_files.sort(key=lambda x: x['probability'], reverse=True)
    
    print(f"\nFiles in this topic ({len(topic_files)} files with prob >= {min_probability}):")
    for file_info in topic_files[:15]:
        print(f"  {file_info['file_name']: <30} ({file_info['probability']:.3f})")
    
    if len(topic_files) > 15:
        print(f"  ... and {len(topic_files) - 15} more files")
    
    return topic_files

# 顯示所有 topics
all_topic_files = {}
for i in range(len(lda_results['topics'])):
    topic_files = display_topic_details(i)
    all_topic_files[i] = topic_files


=== Topic 0: Contact + Id + Field ===
Coherence Score: 1137.054

Top Words:
  contact              1751.1399
  id                   1062.6179
  field                1012.8356
  dollar               1007.0941
  email                851.5806
  company              807.1024
  custom               779.6133
  fields               731.5947
  address              652.3607
  update               643.7257

Files in this topic (38 files with prob >= 0.2):
  agileCrm.json                  (0.988)
  clearbit.json                  (0.965)
  zohoCrm.json                   (0.906)
  zohoCrmTool.json               (0.905)
  activeCampaign.json            (0.876)
  agileCrmTool.json              (0.867)
  drift.json                     (0.859)
  uplead.json                    (0.857)
  activeCampaignTool.json        (0.840)
  mailcheck.json                 (0.726)
  sendInBlueTrigger.json         (0.714)
  googleContacts.json            (0.656)
  sort.json                      (0.638)
  driftTool.json

## Step 5: 建立 LDA-Based Taxonomy Tree

In [8]:
class LDABasedTaxonomyBuilder:
    def __init__(self, lda_results, file_mapping, min_probability=0.15):
        self.lda_results = lda_results
        self.file_mapping = file_mapping
        self.min_probability = min_probability
        self.taxonomy_tree = {}
    
    def build_taxonomy(self):
        self.taxonomy_tree = {
            "metadata": {
                "method": "LDA_Based_Taxonomy_n8n_Nodes",
                "data_source": "n8n_node_schemas",
                "n_topics": len(self.lda_results['topics']),
                "min_probability": self.min_probability,
                "total_files": len(self.file_mapping),
                "model_perplexity": self.lda_results.get('model_perplexity', 'N/A')
            },
            "topics": {},
            "unclassified": {
                "files": [],
                "description": "Files that don't meet the minimum probability threshold"
            },
            "statistics": {}
        }
        
        classified_files = set()
        
        for topic_id, topic_info in enumerate(self.lda_results['topics']):
            topic_key = f"topic_{topic_id}"
            topic_files = []
            
            for doc_idx, doc_topic in enumerate(self.lda_results['document_topics']):
                if (doc_topic['dominant_topic'] == topic_id and 
                    doc_topic['dominant_topic_probability'] >= self.min_probability):
                    
                    file_info = self.file_mapping[doc_idx]
                    topic_files.append({
                        "file_path": file_info['file_path'],
                        "file_name": file_info['file_name'],
                        "display_name": file_info['display_name'],
                        "description": file_info['description'],
                        "title": file_info['title'],
                        "probability": doc_topic['dominant_topic_probability'],
                        "all_topic_probs": doc_topic['all_topic_probabilities']
                    })
                    
                    classified_files.add(doc_idx)
            
            topic_files.sort(key=lambda x: x['probability'], reverse=True)
            
            self.taxonomy_tree["topics"][topic_key] = {
                "topic_id": topic_id,
                "label": topic_info['label'],
                "description": f"n8n nodes strongly associated with {topic_info['label']}",
                "files": topic_files,
                "top_words": [w['word'] for w in topic_info['top_words'][:10]],
                "coherence": topic_info['coherence'],
                "file_count": len(topic_files),
                "avg_probability": np.mean([f['probability'] for f in topic_files]) if topic_files else 0
            }
        
        unclassified_files = []
        for doc_idx, file_info in enumerate(self.file_mapping):
            if doc_idx not in classified_files:
                doc_topic = self.lda_results['document_topics'][doc_idx]
                unclassified_files.append({
                    "file_path": file_info['file_path'],
                    "file_name": file_info['file_name'],
                    "display_name": file_info['display_name'],
                    "description": file_info['description'],
                    "title": file_info['title'],
                    "max_probability": max(doc_topic['all_topic_probabilities']),
                    "dominant_topic": doc_topic['dominant_topic'],
                    "reason": "Below minimum probability threshold"
                })
        
        self.taxonomy_tree["unclassified"]["files"] = unclassified_files
        self._generate_statistics()
        
        return self.taxonomy_tree
    
    def _generate_statistics(self):
        total_files = len(self.file_mapping)
        classified_files = sum(topic['file_count'] for topic in self.taxonomy_tree['topics'].values())
        unclassified_files = len(self.taxonomy_tree['unclassified']['files'])
        
        self.taxonomy_tree['statistics'] = {
            "total_files": total_files,
            "classified_files": classified_files,
            "unclassified_files": unclassified_files,
            "classification_rate": (classified_files / total_files) * 100,
            "topics_count": len(self.taxonomy_tree['topics']),
            "avg_files_per_topic": classified_files / len(self.taxonomy_tree['topics']) if self.taxonomy_tree['topics'] else 0
        }

# 建立 LDA-based taxonomy
taxonomy_builder = LDABasedTaxonomyBuilder(lda_results, file_mapping, min_probability=0.15)
lda_taxonomy = taxonomy_builder.build_taxonomy()

print("LDA-Based Taxonomy Tree created successfully!")
print(f"Classification Statistics:")
for key, value in lda_taxonomy['statistics'].items():
    if isinstance(value, float):
        print(f"  {key}: {value:.2f}")
    else:
        print(f"  {key}: {value}")

LDA-Based Taxonomy Tree created successfully!
Classification Statistics:
  total_files: 792
  classified_files: 792
  unclassified_files: 0
  classification_rate: 100.00
  topics_count: 10
  avg_files_per_topic: 79.20


## Step 6: 匯出結果

In [9]:
# 匯出結果
output_dir = project_root / "data" / "output"
output_dir.mkdir(parents=True, exist_ok=True)

def make_json_serializable(obj):
    if isinstance(obj, dict):
        return {k: make_json_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [make_json_serializable(item) for item in obj]
    elif isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64, np.float32)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, (bool, int, float, str)) or obj is None:
        return obj
    else:
        return str(obj)

final_taxonomy = make_json_serializable(lda_taxonomy)

# 匯出完整的 taxonomy tree
taxonomy_output_path = output_dir / "n8n_node_taxonomy_tree.json"
with open(taxonomy_output_path, 'w', encoding='utf-8') as f:
    json.dump(final_taxonomy, f, ensure_ascii=False, indent=2)

print(f"✅ n8n node taxonomy tree exported to: {taxonomy_output_path}")

# 匯出簡化版本
simplified_taxonomy = {
    "summary": final_taxonomy['statistics'],
    "metadata": final_taxonomy['metadata'],
    "topics": {
        topic_key: {
            "label": topic_data['label'],
            "file_count": topic_data['file_count'],
            "top_words": topic_data['top_words'][:5],
            "top_files": [f['file_name'] for f in topic_data['files'][:5]]
        }
        for topic_key, topic_data in final_taxonomy['topics'].items()
    }
}

simplified_output_path = output_dir / "n8n_node_taxonomy_summary.json"
with open(simplified_output_path, 'w', encoding='utf-8') as f:
    json.dump(simplified_taxonomy, f, ensure_ascii=False, indent=2)

print(f"📄 Simplified taxonomy summary exported to: {simplified_output_path}")

# 匯出 CSV 對應表
topic_file_mapping = []
for topic_key, topic_data in final_taxonomy['topics'].items():
    for file_info in topic_data['files']:
        topic_file_mapping.append({
            'topic_id': topic_data['topic_id'],
            'topic_label': topic_data['label'],
            'file_name': file_info['file_name'],
            'display_name': file_info['display_name'],
            'probability': file_info['probability']
        })

df_mapping = pd.DataFrame(topic_file_mapping)
csv_output_path = output_dir / "n8n_node_topic_mapping.csv"
df_mapping.to_csv(csv_output_path, index=False, encoding='utf-8')
print(f"📊 Topic-file mapping exported to: {csv_output_path}")

print(f"\n✅ n8n Node Schema LDA Analysis Complete!")
print(f"📊 Generated {final_taxonomy['statistics']['topics_count']} topic-based categories")
print(f"📁 Classified {final_taxonomy['statistics']['classified_files']}/{final_taxonomy['statistics']['total_files']} node schemas ({final_taxonomy['statistics']['classification_rate']:.1f}%)")
print(f"📈 Average node schemas per topic: {final_taxonomy['statistics']['avg_files_per_topic']:.1f}")

✅ n8n node taxonomy tree exported to: /Users/yu/Desktop/projects/gss_cai/n8n_AI_Agent/data/output/n8n_node_taxonomy_tree.json
📄 Simplified taxonomy summary exported to: /Users/yu/Desktop/projects/gss_cai/n8n_AI_Agent/data/output/n8n_node_taxonomy_summary.json
📊 Topic-file mapping exported to: /Users/yu/Desktop/projects/gss_cai/n8n_AI_Agent/data/output/n8n_node_topic_mapping.csv

✅ n8n Node Schema LDA Analysis Complete!
📊 Generated 10 topic-based categories
📁 Classified 792/792 node schemas (100.0%)
📈 Average node schemas per topic: 79.2
