Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 67 additions & 74 deletions plugins/core/src/graph/duplicate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,35 @@ interface DuplicateCluster {
export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
const collapse = options.collapse !== false; // Default to true
const threshold = options.simhashThreshold ?? 3;

const exactClusters: DuplicateCluster[] = [];
const nearClusters: DuplicateCluster[] = [];

const nodes = graph.getNodes();
let clusterCounter = 1;

// Phase 1 & 2: Exact Duplicate Detection
const { exactClusters, nearCandidates, nextId: nextId1 } = findExactDuplicates(nodes, clusterCounter);
clusterCounter = nextId1;

// Phase 3: Near Duplicate Detection
const { nearClusters } = findNearDuplicates(nearCandidates, threshold, clusterCounter);

const allClusters = [...exactClusters, ...nearClusters];

// Phase 4, 5, 6: Process Clusters (Template-Heavy, Canonical, Representative)
processClusters(allClusters, graph, collapse);

// Final Edge Transfer if Collapsing
if (collapse) {
collapseEdges(graph);
}
}

function findExactDuplicates(nodes: GraphNode[], startId: number): { exactClusters: DuplicateCluster[], nearCandidates: GraphNode[], nextId: number } {
const exactClusters: DuplicateCluster[] = [];
const nearCandidates: GraphNode[] = [];
const exactMap = new Map<string, GraphNode[]>();
let clusterCounter = startId;

for (const node of nodes) {
if (!node.contentHash || node.status !== 200) continue;

// Safety check: if there's no soft404 signal (soft404 is handled elsewhere, but just filter 200 OKs)
let arr = exactMap.get(node.contentHash);
if (!arr) {
arr = [];
Expand All @@ -41,15 +58,10 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
arr.push(node);
}

// Nodes that are NOT part of an exact duplicate group are candidates for near duplicate checks
const nearCandidates: GraphNode[] = [];
let clusterCounter = 1;

for (const [_hash, group] of exactMap.entries()) {
if (group.length > 1) {
const id = `cluster_exact_${clusterCounter++}`;
exactClusters.push({ id, type: 'exact', nodes: group });
// Mark nodes
for (const n of group) {
n.duplicateClusterId = id;
n.duplicateType = 'exact';
Expand All @@ -59,20 +71,24 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
}
}

// Phase 3: Near Duplicate Detection (SimHash with Bands)
// 64-bit simhash -> split into 4 bands of 16 bits.
return { exactClusters, nearCandidates, nextId: clusterCounter };
}

function findNearDuplicates(candidates: GraphNode[], threshold: number, startId: number): { nearClusters: DuplicateCluster[], nextId: number } {
const nearClusters: DuplicateCluster[] = [];
let clusterCounter = startId;

const bandsMaps = [
new Map<number, GraphNode[]>(),
new Map<number, GraphNode[]>(),
new Map<number, GraphNode[]>(),
new Map<number, GraphNode[]>()
];

for (const node of nearCandidates) {
for (const node of candidates) {
if (!node.simhash) continue;
const simhash = BigInt(node.simhash);

// Extract 16 bit bands
const b0 = Number(simhash & 0xFFFFn);
const b1 = Number((simhash >> 16n) & 0xFFFFn);
const b2 = Number((simhash >> 32n) & 0xFFFFn);
Expand All @@ -89,21 +105,18 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
}
}

// Find candidate pairs
const nearGroupMap = new Map<string, Set<GraphNode>>(); // node.url -> cluster set
const nearGroupMap = new Map<string, Set<GraphNode>>();
const checkedPairs = new Set<string>();

for (let i = 0; i < 4; i++) {
for (const [_bandVal, bucketNodes] of bandsMaps[i].entries()) {
if (bucketNodes.length < 2) continue; // nothing to compare
if (bucketNodes.length < 2) continue;

// Compare all nodes in this bucket
for (let j = 0; j < bucketNodes.length; j++) {
for (let k = j + 1; k < bucketNodes.length; k++) {
const n1 = bucketNodes[j];
const n2 = bucketNodes[k];

// Ensure n1 < n2 lexicographically to avoid duplicate pairs
const [a, b] = n1.url < n2.url ? [n1, n2] : [n2, n1];
const pairKey = Graph.getEdgeKey(a.url, b.url);

Expand All @@ -112,8 +125,6 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {

const dist = SimHash.hammingDistance(BigInt(a.simhash!), BigInt(b.simhash!));
if (dist <= threshold) {
// They are near duplicates.
// Find or create their cluster set using union-find or reference propagation
const setA = nearGroupMap.get(a.url);
const setB = nearGroupMap.get(b.url);

Expand All @@ -128,7 +139,6 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
setB.add(a);
nearGroupMap.set(a.url, setB);
} else if (setA && setB && setA !== setB) {
// Merge sets
for (const node of setB) {
setA.add(node);
nearGroupMap.set(node.url, setA);
Expand All @@ -140,7 +150,6 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
}
}

// Compile near duplicate clusters (deduplicated by Set reference)
const uniqueNearSets = new Set<Set<GraphNode>>();
for (const group of nearGroupMap.values()) {
uniqueNearSets.add(group);
Expand All @@ -158,11 +167,12 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
}
}

const allClusters = [...exactClusters, ...nearClusters];
return { nearClusters, nextId: clusterCounter };
}

function processClusters(clusters: DuplicateCluster[], graph: Graph, collapse: boolean) {
// Phase 4: Template-Heavy Detection
// Mark classes as 'template_heavy' if ratio < 0.3
for (const cluster of allClusters) {
for (const cluster of clusters) {
const avgRatio = cluster.nodes.reduce((sum, n) => sum + (n.uniqueTokenRatio || 0), 0) / cluster.nodes.length;
if (avgRatio < 0.3) {
cluster.type = 'template_heavy';
Expand All @@ -171,13 +181,12 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
}

// Phase 5: Canonical Conflict & Representative Selection
for (const cluster of allClusters) {
for (const cluster of clusters) {
const canonicals = new Set<string>();
let hasMissing = false;

for (const n of cluster.nodes) {
if (!n.canonical) hasMissing = true;
// We compare full absolute canonical URLs (assuming they are normalized during crawl)
else canonicals.add(n.canonical);
}

Expand All @@ -190,18 +199,12 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
}

// Phase 6: Select Representative
// 1. Valid Canonical target in cluster
// 2. Highest internal in-degree
// 3. Shortest URL
// 4. First discovered (relying on array order, which is from BFS map roughly)
let representativeNode = cluster.nodes[0];

// Evaluate best rep
const urlsInCluster = new Set(cluster.nodes.map(n => n.url));
const validCanonicals = cluster.nodes.filter(n => n.canonical && urlsInCluster.has(n.canonical) && n.url === n.canonical);

if (validCanonicals.length > 0) {
representativeNode = validCanonicals[0]; // If multiple, just pick first matching self
representativeNode = validCanonicals[0];
} else {
representativeNode = cluster.nodes.reduce((best, current) => {
if (current.inLinks > best.inLinks) return current;
Expand All @@ -215,11 +218,10 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {

cluster.nodes.forEach(n => {
n.isClusterPrimary = n.url === representativeNode.url;
n.isCollapsed = false; // default for JSON
n.isCollapsed = false;
n.collapseInto = undefined;
});

// Push to Graph's final cluster list
graph.duplicateClusters.push({
id: cluster.id,
type: cluster.type,
Expand All @@ -238,49 +240,40 @@ export function detectDuplicates(graph: Graph, options: DuplicateOptions = {}) {
}
}
}
}

// Final Edge Transfer if Collapsing
if (collapse) {
const edges = graph.getEdges();
const updatedEdges = new Map<string, number>();

for (const edge of edges) {
const sourceNode = graph.nodes.get(edge.source);
const targetNode = graph.nodes.get(edge.target);
function collapseEdges(graph: Graph) {
const edges = graph.getEdges();
const updatedEdges = new Map<string, number>();

if (!sourceNode || !targetNode) continue;
for (const edge of edges) {
const sourceNode = graph.nodes.get(edge.source);
const targetNode = graph.nodes.get(edge.target);

// We do NOT modify source structure for out-bound edges of collapsed nodes?
// Spec: "Ignore edges from collapsed nodes. Transfer inbound edges to representative."
// Actually, if a node links TO a collapsed node, we repoint the edge to the representative.
// If a collapsed node links to X, we ignore it (PageRank will filter it out).
if (!sourceNode || !targetNode) continue;

const actualSource = edge.source;
// repoint target
const actualTarget = targetNode.isCollapsed && targetNode.collapseInto ? targetNode.collapseInto : edge.target;
const actualSource = edge.source;
const actualTarget = targetNode.isCollapsed && targetNode.collapseInto ? targetNode.collapseInto : edge.target;

// Skip self-referential edges caused by repointing
if (actualSource === actualTarget) continue;
if (actualSource === actualTarget) continue;

const edgeKey = Graph.getEdgeKey(actualSource, actualTarget);
const existingWeight = updatedEdges.get(edgeKey) || 0;
updatedEdges.set(edgeKey, Math.max(existingWeight, edge.weight)); // deduplicate
}
const edgeKey = Graph.getEdgeKey(actualSource, actualTarget);
const existingWeight = updatedEdges.get(edgeKey) || 0;
updatedEdges.set(edgeKey, Math.max(existingWeight, edge.weight));
}

// Update graph edges in-place
graph.edges = updatedEdges;
graph.edges = updatedEdges;

// Re-calculate inLinks and outLinks based on collapsed edges
for (const node of graph.getNodes()) {
node.inLinks = 0;
node.outLinks = 0;
}
for (const [edgeKey, _weight] of updatedEdges.entries()) {
const { source: src, target: tgt } = Graph.parseEdgeKey(edgeKey);
const sn = graph.nodes.get(src);
const tn = graph.nodes.get(tgt);
if (sn) sn.outLinks++;
if (tn) tn.inLinks++;
}
// Re-calculate inLinks and outLinks based on collapsed edges
for (const node of graph.getNodes()) {
node.inLinks = 0;
node.outLinks = 0;
}
for (const [edgeKey, _weight] of updatedEdges.entries()) {
const { source: src, target: tgt } = Graph.parseEdgeKey(edgeKey);
const sn = graph.nodes.get(src);
const tn = graph.nodes.get(tgt);
if (sn) sn.outLinks++;
if (tn) tn.inLinks++;
}
}
Loading