diff --git a/plugins/core/src/graph/duplicate.ts b/plugins/core/src/graph/duplicate.ts index 9301226..87a5501 100644 --- a/plugins/core/src/graph/duplicate.ts +++ b/plugins/core/src/graph/duplicate.ts @@ -21,18 +21,35 @@ interface DuplicateCluster { export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) { const collapse = options.collapse !== false; // Default to true const threshold = options.simhashThreshold ?? 3; - - const exactClusters: DuplicateCluster[] = []; - const nearClusters: DuplicateCluster[] = []; - const nodes = graph.getNodes(); + let clusterCounter = 1; // Phase 1 & 2: Exact Duplicate Detection + const { exactClusters, nearCandidates, nextId: nextId1 } = findExactDuplicates(nodes, clusterCounter); + clusterCounter = nextId1; + + // Phase 3: Near Duplicate Detection + const { nearClusters } = findNearDuplicates(nearCandidates, threshold, clusterCounter); + + const allClusters = [...exactClusters, ...nearClusters]; + + // Phase 4, 5, 6: Process Clusters (Template-Heavy, Canonical, Representative) + processClusters(allClusters, graph, collapse); + + // Final Edge Transfer if Collapsing + if (collapse) { + collapseEdges(graph); + } +} + +function findExactDuplicates(nodes: GraphNode[], startId: number): { exactClusters: DuplicateCluster[], nearCandidates: GraphNode[], nextId: number } { + const exactClusters: DuplicateCluster[] = []; + const nearCandidates: GraphNode[] = []; const exactMap = new Map(); + let clusterCounter = startId; + for (const node of nodes) { if (!node.contentHash || node.status !== 200) continue; - - // Safety check: if there's no soft404 signal (soft404 is handled elsewhere, but just filter 200 OKs) let arr = exactMap.get(node.contentHash); if (!arr) { arr = []; @@ -41,15 +58,10 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) { arr.push(node); } - // Nodes that are NOT part of an exact duplicate group are candidates for near duplicate checks - const nearCandidates: 
GraphNode[] = []; - let clusterCounter = 1; - for (const [_hash, group] of exactMap.entries()) { if (group.length > 1) { const id = `cluster_exact_${clusterCounter++}`; exactClusters.push({ id, type: 'exact', nodes: group }); - // Mark nodes for (const n of group) { n.duplicateClusterId = id; n.duplicateType = 'exact'; @@ -59,8 +71,13 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) { } } - // Phase 3: Near Duplicate Detection (SimHash with Bands) - // 64-bit simhash -> split into 4 bands of 16 bits. + return { exactClusters, nearCandidates, nextId: clusterCounter }; +} + +function findNearDuplicates(candidates: GraphNode[], threshold: number, startId: number): { nearClusters: DuplicateCluster[], nextId: number } { + const nearClusters: DuplicateCluster[] = []; + let clusterCounter = startId; + const bandsMaps = [ new Map(), new Map(), @@ -68,11 +85,10 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) { new Map() ]; - for (const node of nearCandidates) { + for (const node of candidates) { if (!node.simhash) continue; const simhash = BigInt(node.simhash); - // Extract 16 bit bands const b0 = Number(simhash & 0xFFFFn); const b1 = Number((simhash >> 16n) & 0xFFFFn); const b2 = Number((simhash >> 32n) & 0xFFFFn); @@ -89,21 +105,18 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) { } } - // Find candidate pairs - const nearGroupMap = new Map<string, Set<GraphNode>>(); // node.url -> cluster set + const nearGroupMap = new Map<string, Set<GraphNode>>(); const checkedPairs = new Set(); for (let i = 0; i < 4; i++) { for (const [_bandVal, bucketNodes] of bandsMaps[i].entries()) { - if (bucketNodes.length < 2) continue; // nothing to compare + if (bucketNodes.length < 2) continue; - // Compare all nodes in this bucket for (let j = 0; j < bucketNodes.length; j++) { for (let k = j + 1; k < bucketNodes.length; k++) { const n1 = bucketNodes[j]; const n2 = bucketNodes[k]; - // Ensure n1 < n2 lexicographically to avoid duplicate 
pairs const [a, b] = n1.url < n2.url ? [n1, n2] : [n2, n1]; const pairKey = Graph.getEdgeKey(a.url, b.url); @@ -112,8 +125,6 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) { const dist = SimHash.hammingDistance(BigInt(a.simhash!), BigInt(b.simhash!)); if (dist <= threshold) { - // They are near duplicates. - // Find or create their cluster set using union-find or reference propagation const setA = nearGroupMap.get(a.url); const setB = nearGroupMap.get(b.url); @@ -128,7 +139,6 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) { setB.add(a); nearGroupMap.set(a.url, setB); } else if (setA && setB && setA !== setB) { - // Merge sets for (const node of setB) { setA.add(node); nearGroupMap.set(node.url, setA); @@ -140,7 +150,6 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) { } } - // Compile near duplicate clusters (deduplicated by Set reference) const uniqueNearSets = new Set<Set<GraphNode>>(); for (const group of nearGroupMap.values()) { uniqueNearSets.add(group); @@ -158,11 +167,12 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) { } } - const allClusters = [...exactClusters, ...nearClusters]; + return { nearClusters, nextId: clusterCounter }; +} +function processClusters(clusters: DuplicateCluster[], graph: Graph, collapse: boolean) { // Phase 4: Template-Heavy Detection - // Mark classes as 'template_heavy' if ratio < 0.3 - for (const cluster of allClusters) { + for (const cluster of clusters) { const avgRatio = cluster.nodes.reduce((sum, n) => sum + (n.uniqueTokenRatio || 0), 0) / cluster.nodes.length; if (avgRatio < 0.3) { cluster.type = 'template_heavy'; @@ -171,13 +181,12 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) { } // Phase 5: Canonical Conflict & Representative Selection - for (const cluster of allClusters) { + for (const cluster of clusters) { const canonicals = new Set(); let hasMissing = false; for 
(const n of cluster.nodes) { if (!n.canonical) hasMissing = true; - // We compare full absolute canonical URLs (assuming they are normalized during crawl) else canonicals.add(n.canonical); } @@ -190,18 +199,12 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) { } // Phase 6: Select Representative - // 1. Valid Canonical target in cluster - // 2. Highest internal in-degree - // 3. Shortest URL - // 4. First discovered (relying on array order, which is from BFS map roughly) let representativeNode = cluster.nodes[0]; - - // Evaluate best rep const urlsInCluster = new Set(cluster.nodes.map(n => n.url)); const validCanonicals = cluster.nodes.filter(n => n.canonical && urlsInCluster.has(n.canonical) && n.url === n.canonical); if (validCanonicals.length > 0) { - representativeNode = validCanonicals[0]; // If multiple, just pick first matching self + representativeNode = validCanonicals[0]; } else { representativeNode = cluster.nodes.reduce((best, current) => { if (current.inLinks > best.inLinks) return current; @@ -215,11 +218,10 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) { cluster.nodes.forEach(n => { n.isClusterPrimary = n.url === representativeNode.url; - n.isCollapsed = false; // default for JSON + n.isCollapsed = false; n.collapseInto = undefined; }); - // Push to Graph's final cluster list graph.duplicateClusters.push({ id: cluster.id, type: cluster.type, @@ -238,49 +240,40 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) { } } } +} - // Final Edge Transfer if Collapsing - if (collapse) { - const edges = graph.getEdges(); - const updatedEdges = new Map(); - - for (const edge of edges) { - const sourceNode = graph.nodes.get(edge.source); - const targetNode = graph.nodes.get(edge.target); +function collapseEdges(graph: Graph) { + const edges = graph.getEdges(); + const updatedEdges = new Map(); - if (!sourceNode || !targetNode) continue; + for (const edge of edges) 
{ + const sourceNode = graph.nodes.get(edge.source); + const targetNode = graph.nodes.get(edge.target); - // We do NOT modify source structure for out-bound edges of collapsed nodes? - // Spec: "Ignore edges from collapsed nodes. Transfer inbound edges to representative." - // Actually, if a node links TO a collapsed node, we repoint the edge to the representative. - // If a collapsed node links to X, we ignore it (PageRank will filter it out). + if (!sourceNode || !targetNode) continue; - const actualSource = edge.source; - // repoint target - const actualTarget = targetNode.isCollapsed && targetNode.collapseInto ? targetNode.collapseInto : edge.target; + const actualSource = edge.source; + const actualTarget = targetNode.isCollapsed && targetNode.collapseInto ? targetNode.collapseInto : edge.target; - // Skip self-referential edges caused by repointing - if (actualSource === actualTarget) continue; + if (actualSource === actualTarget) continue; - const edgeKey = Graph.getEdgeKey(actualSource, actualTarget); - const existingWeight = updatedEdges.get(edgeKey) || 0; - updatedEdges.set(edgeKey, Math.max(existingWeight, edge.weight)); // deduplicate - } + const edgeKey = Graph.getEdgeKey(actualSource, actualTarget); + const existingWeight = updatedEdges.get(edgeKey) || 0; + updatedEdges.set(edgeKey, Math.max(existingWeight, edge.weight)); + } - // Update graph edges in-place - graph.edges = updatedEdges; + graph.edges = updatedEdges; - // Re-calculate inLinks and outLinks based on collapsed edges - for (const node of graph.getNodes()) { - node.inLinks = 0; - node.outLinks = 0; - } - for (const [edgeKey, _weight] of updatedEdges.entries()) { - const { source: src, target: tgt } = Graph.parseEdgeKey(edgeKey); - const sn = graph.nodes.get(src); - const tn = graph.nodes.get(tgt); - if (sn) sn.outLinks++; - if (tn) tn.inLinks++; - } + // Re-calculate inLinks and outLinks based on collapsed edges + for (const node of graph.getNodes()) { + node.inLinks = 0; + node.outLinks = 
0; + } + for (const [edgeKey, _weight] of updatedEdges.entries()) { + const { source: src, target: tgt } = Graph.parseEdgeKey(edgeKey); + const sn = graph.nodes.get(src); + const tn = graph.nodes.get(tgt); + if (sn) sn.outLinks++; + if (tn) tn.inLinks++; } }