From c9497dfabff240787aa0f5ac7a8f4ad70117ea72 Mon Sep 17 00:00:00 2001 From: Bruno Roustant <33934988+bruno-roustant@users.noreply.github.com> Date: Tue, 1 Aug 2023 12:31:10 +0200 Subject: [PATCH] New KTypePgmIndex that learns a compact index on sorted keys and supports range search. (#39) Co-authored-by: Dawid Weiss --- build.gradle | 2 +- hppc-benchmarks/build.gradle | 6 + .../carrotsearch/hppc/benchmarks/Library.java | 12 + .../implementations/PgmIntSetOps.java | 62 ++ .../com/carrotsearch/hppc/Intrinsics.java | 34 + .../hppc/generator/TemplateProcessor.java | 2 + .../hppc/generator/intrinsics/Numeric.java | 31 + .../carrotsearch/hppc/IntGrowableArray.java | 59 ++ .../com/carrotsearch/hppc/PgmIndexUtil.java | 122 ++++ .../java/com/carrotsearch/hppc/PlaModel.java | 406 ++++++++++++ .../carrotsearch/hppc/RamUsageEstimator.java | 46 +- .../com/carrotsearch/hppc/KTypePgmIndex.java | 589 ++++++++++++++++++ .../carrotsearch/hppc/KTypePgmIndexTest.java | 199 ++++++ versions.lock | 10 +- versions.props | 2 +- 15 files changed, 1538 insertions(+), 44 deletions(-) create mode 100644 hppc-benchmarks/src/jmh/java/com/carrotsearch/hppc/benchmarks/implementations/PgmIntSetOps.java create mode 100644 hppc-template-processor/src/main/java/com/carrotsearch/hppc/generator/intrinsics/Numeric.java create mode 100644 hppc/src/main/java/com/carrotsearch/hppc/IntGrowableArray.java create mode 100644 hppc/src/main/java/com/carrotsearch/hppc/PgmIndexUtil.java create mode 100644 hppc/src/main/java/com/carrotsearch/hppc/PlaModel.java create mode 100644 hppc/src/main/templates/com/carrotsearch/hppc/KTypePgmIndex.java create mode 100644 hppc/src/test/templates/com/carrotsearch/hppc/KTypePgmIndexTest.java diff --git a/build.gradle b/build.gradle index dc0366393..07548220c 100644 --- a/build.gradle +++ b/build.gradle @@ -8,7 +8,7 @@ plugins { id 'de.thetaphi.forbiddenapis' version '3.2' apply false - id "me.champeau.gradle.jmh" version "0.5.0" apply false + id "me.champeau.gradle.jmh" version 
"0.5.3" apply false } rootProject.version = '0.10.0-SNAPSHOT' diff --git a/hppc-benchmarks/build.gradle b/hppc-benchmarks/build.gradle index 0c95f6fe5..b3d4f2c55 100644 --- a/hppc-benchmarks/build.gradle +++ b/hppc-benchmarks/build.gradle @@ -18,6 +18,12 @@ jmh { duplicateClassesStrategy = DuplicatesStrategy.WARN } +jmhJar { + duplicatesStrategy = DuplicatesStrategy.WARN + exclude 'LICENSE' + exclude 'THIRD-PARTY' +} + task benchmark() { dependsOn jmhJar diff --git a/hppc-benchmarks/src/jmh/java/com/carrotsearch/hppc/benchmarks/Library.java b/hppc-benchmarks/src/jmh/java/com/carrotsearch/hppc/benchmarks/Library.java index 4cd80a857..8d0d3edb5 100644 --- a/hppc-benchmarks/src/jmh/java/com/carrotsearch/hppc/benchmarks/Library.java +++ b/hppc-benchmarks/src/jmh/java/com/carrotsearch/hppc/benchmarks/Library.java @@ -37,6 +37,18 @@ public IntIntMapOps newIntIntMap(int expectedElements, double loadFactor) { } }, + PGM { + @Override + public IntSetOps newIntSet(int expectedElements, double loadFactor) { + return new PgmIntSetOps(64, 32); + } + + @Override + public IntIntMapOps newIntIntMap(int expectedElements, double loadFactor) { + throw new UnsupportedOperationException(); + } + }, + FASTUTIL { @Override public IntSetOps newIntSet(int expectedElements, double loadFactor) { diff --git a/hppc-benchmarks/src/jmh/java/com/carrotsearch/hppc/benchmarks/implementations/PgmIntSetOps.java b/hppc-benchmarks/src/jmh/java/com/carrotsearch/hppc/benchmarks/implementations/PgmIntSetOps.java new file mode 100644 index 000000000..9d232baf2 --- /dev/null +++ b/hppc-benchmarks/src/jmh/java/com/carrotsearch/hppc/benchmarks/implementations/PgmIntSetOps.java @@ -0,0 +1,62 @@ +/* + * HPPC + * + * Copyright (C) 2010-2022 Carrot Search s.c. + * All rights reserved. 
+ * + * Refer to the full license file "LICENSE.txt": + * https://github.com/carrotsearch/hppc/blob/master/LICENSE.txt + */ +package com.carrotsearch.hppc.benchmarks.implementations; + +import com.carrotsearch.hppc.IntPgmIndex; +import com.carrotsearch.hppc.benchmarks.IntSetOps; +import java.util.Arrays; + +public class PgmIntSetOps implements IntSetOps { + private IntPgmIndex.IntBuilder builder; + private int[] keys; + private IntPgmIndex delegate; + + public PgmIntSetOps(int epsilon, int recursiveEpsilon) { + builder = + new IntPgmIndex.IntBuilder().setEpsilon(epsilon).setEpsilonRecursive(recursiveEpsilon); + } + + @Override + public void add(int key) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean contains(int key) { + return delegate.contains(key); + } + + @Override + public void bulkAdd(int[] keys) { + if (this.keys != null) { + throw new UnsupportedOperationException("bulkAdd() can be called only once"); + } + this.keys = keys; + Arrays.sort(keys); + delegate = builder.setSortedKeys(keys, keys.length).build(); + builder = null; + } + + @Override + public int bulkContains(int[] keys) { + int v = 0; + for (int key : keys) { + if (delegate.contains(key)) { + v++; + } + } + return v; + } + + @Override + public int[] iterationOrderArray() { + return keys; + } +} diff --git a/hppc-template-intrinsics/src/main/java/com/carrotsearch/hppc/Intrinsics.java b/hppc-template-intrinsics/src/main/java/com/carrotsearch/hppc/Intrinsics.java index cf9bdd3ca..10039b919 100644 --- a/hppc-template-intrinsics/src/main/java/com/carrotsearch/hppc/Intrinsics.java +++ b/hppc-template-intrinsics/src/main/java/com/carrotsearch/hppc/Intrinsics.java @@ -137,4 +137,38 @@ public static T add(T op1, T op2) { throw new UnsupportedOperationException("Invalid for arbitrary types: " + op1 + " " + op2); } + + /** + * Returns the numerical value for the argument if it is a primitive template type. 
This intrinsic + * method always returns a {@code double} result for direct calls, but the template preprocessor + * will replace this method invocation with the exact type equal to the template type. So a call + * to: + * + *
+   * {@code Intrinsics.numeric(key)}
+   * 
+ * + * with template type {@code KType} equal to {@code int} will return the raw key value (without + * any type conversion): + * + *
+   * {@code (key)}
+   * 
+ * + *

This intrinsic is used to apply arithmetic operations on keys. It is invalid for generic + * types. + */ + public static double numeric(T e) { + if (e instanceof Byte + | e instanceof Character + | e instanceof Short + | e instanceof Integer + | e instanceof Float + | e instanceof Long + | e instanceof Double) { + return (double) e; + } + + throw new UnsupportedOperationException("Invalid for generic type: " + e); + } } diff --git a/hppc-template-processor/src/main/java/com/carrotsearch/hppc/generator/TemplateProcessor.java b/hppc-template-processor/src/main/java/com/carrotsearch/hppc/generator/TemplateProcessor.java index 052f8a520..a7b55df8d 100644 --- a/hppc-template-processor/src/main/java/com/carrotsearch/hppc/generator/TemplateProcessor.java +++ b/hppc-template-processor/src/main/java/com/carrotsearch/hppc/generator/TemplateProcessor.java @@ -21,6 +21,7 @@ import com.carrotsearch.hppc.generator.intrinsics.Equals; import com.carrotsearch.hppc.generator.intrinsics.IsEmpty; import com.carrotsearch.hppc.generator.intrinsics.NewArray; +import com.carrotsearch.hppc.generator.intrinsics.Numeric; import com.carrotsearch.hppc.generator.parser.SignatureProcessor; import java.io.IOException; import java.io.StringWriter; @@ -58,6 +59,7 @@ public class TemplateProcessor extends Command { intrinsics.put("cast", new Cast()); intrinsics.put("add", new Add()); intrinsics.put("equals", new Equals()); + intrinsics.put("numeric", new Numeric()); } @Parameter(names = {"--incremental"}) diff --git a/hppc-template-processor/src/main/java/com/carrotsearch/hppc/generator/intrinsics/Numeric.java b/hppc-template-processor/src/main/java/com/carrotsearch/hppc/generator/intrinsics/Numeric.java new file mode 100644 index 000000000..c71ec3eef --- /dev/null +++ b/hppc-template-processor/src/main/java/com/carrotsearch/hppc/generator/intrinsics/Numeric.java @@ -0,0 +1,31 @@ +/* + * HPPC + * + * Copyright (C) 2010-2022 Carrot Search s.c. + * All rights reserved. 
+ * + * Refer to the full license file "LICENSE.txt": + * https://github.com/carrotsearch/hppc/blob/master/LICENSE.txt + */ +package com.carrotsearch.hppc.generator.intrinsics; + +import com.carrotsearch.hppc.generator.TemplateOptions; +import com.carrotsearch.hppc.generator.Type; +import java.util.ArrayList; +import java.util.regex.Matcher; + +public class Numeric extends AbstractIntrinsicMethod { + @Override + public void invoke( + Matcher m, + StringBuilder sb, + TemplateOptions templateOptions, + String genericCast, + ArrayList params) { + expectArgumentCount(m, params, 1); + if (inferTemplateType(m, templateOptions, genericCast) == Type.GENERIC) { + throw new RuntimeException("Can't get the numeric value of generic types: " + m.group()); + } + sb.append(params.get(0)); + } +} diff --git a/hppc/src/main/java/com/carrotsearch/hppc/IntGrowableArray.java b/hppc/src/main/java/com/carrotsearch/hppc/IntGrowableArray.java new file mode 100644 index 000000000..7b7a57511 --- /dev/null +++ b/hppc/src/main/java/com/carrotsearch/hppc/IntGrowableArray.java @@ -0,0 +1,59 @@ +/* + * HPPC + * + * Copyright (C) 2010-2022 Carrot Search s.c. + * All rights reserved. + * + * Refer to the full license file "LICENSE.txt": + * https://github.com/carrotsearch/hppc/blob/master/LICENSE.txt + */ +package com.carrotsearch.hppc; + +import java.util.Arrays; + +/** + * Basic growable int array helper for HPPC templates (so before {@code IntArrayList} is generated). + */ +public class IntGrowableArray implements Accountable { + + public int[] buffer; + public int size; + + public IntGrowableArray(int initialCapacity) { + buffer = new int[initialCapacity]; + } + + public void add(int e) { + ensureBufferSpace(1); + buffer[size++] = e; + } + + public int[] toArray() { + return buffer.length == size ? 
buffer : Arrays.copyOf(buffer, size); + } + + private void ensureBufferSpace(int expectedAdditions) { + if (size + expectedAdditions > buffer.length) { + int newSize = + BoundedProportionalArraySizingStrategy.DEFAULT_INSTANCE.grow( + buffer.length, size, expectedAdditions); + buffer = Arrays.copyOf(buffer, newSize); + } + } + + @Override + public long ramBytesAllocated() { + // int: size + return RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + + Integer.BYTES + + RamUsageEstimator.shallowSizeOfArray(buffer); + } + + @Override + public long ramBytesUsed() { + // int: size + return RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + + Integer.BYTES + + RamUsageEstimator.shallowUsedSizeOfArray(buffer, size); + } +} diff --git a/hppc/src/main/java/com/carrotsearch/hppc/PgmIndexUtil.java b/hppc/src/main/java/com/carrotsearch/hppc/PgmIndexUtil.java new file mode 100644 index 000000000..6e4da10a6 --- /dev/null +++ b/hppc/src/main/java/com/carrotsearch/hppc/PgmIndexUtil.java @@ -0,0 +1,122 @@ +/* + * HPPC + * + * Copyright (C) 2010-2022 Carrot Search s.c. + * All rights reserved. + * + * Refer to the full license file "LICENSE.txt": + * https://github.com/carrotsearch/hppc/blob/master/LICENSE.txt + */ +package com.carrotsearch.hppc; + +/** Utility methods for {@code KTypePgmIndex}. */ +class PgmIndexUtil { + + /** Adds the first key of the current segment to the segment data bytes. */ + static void addKey(KType key, IntGrowableArray segmentData) { + throw new UnsupportedOperationException("Invalid for generic type: " + key); + } + + /** Adds the first key of the current segment to the segment data bytes. */ + static void addKey(int key, IntGrowableArray segmentData) { + segmentData.add(key); + } + + /** Adds the first key of the current segment to the segment data bytes. */ + static void addKey(float key, IntGrowableArray segmentData) { + addKey(Float.floatToIntBits(key), segmentData); + } + + /** Adds the first key of the current segment to the segment data bytes. 
*/ + static void addKey(long key, IntGrowableArray segmentData) { + segmentData.add((int) key); + segmentData.add((int) (key >> 32)); + } + + /** Adds the first key of the current segment to the segment data bytes. */ + static void addKey(double key, IntGrowableArray segmentData) { + addKey(Double.doubleToRawLongBits(key), segmentData); + } + + /** Gets the first key of the segment at the given data index. */ + static KType getKey(int segmentDataIndex, int[] segmentData, KType keyType) { + throw new UnsupportedOperationException("Invalid for generic type: " + keyType); + } + + /** Gets the first key of the segment at the given data index. */ + static int getKey(int segmentDataIndex, int[] segmentData, int keyType) { + return segmentData[segmentDataIndex]; + } + + /** Gets the first key of the segment at the given data index. */ + static float getKey(int segmentDataIndex, int[] segmentData, float keyType) { + return Float.intBitsToFloat(getKey(segmentDataIndex, segmentData, 0)); + } + + /** Gets the first key of the segment at the given data index. */ + static long getKey(int segmentDataIndex, int[] segmentData, long keyType) { + return (segmentData[segmentDataIndex] & 0xFFFFFFFFL) + | (((long) segmentData[segmentDataIndex + 1]) << 32); + } + + /** Gets the first key of the segment at the given data index. */ + static double getKey(int segmentDataIndex, int[] segmentData, double keyType) { + return Double.longBitsToDouble(getKey(segmentDataIndex, segmentData, 0L)); + } + + /** + * Adds the intercept of the current segment to the segment data bytes. The intercept is stored as + * an int for a key size equal to 1, otherwise it is stored as a long. + * + * @param keySize The size of the key, measured in {@link Integer#BYTES}.
+ */ + static void addIntercept(long intercept, IntGrowableArray segmentData, int keySize) { + assert keySize >= 1 && keySize <= 2; + if (keySize == 1) { + addKey((int) intercept, segmentData); + } else { + addKey(intercept, segmentData); + } + } + + /** + * Gets the intercept of the segment at the given data index. + * + * @param keySize The size of the key, measured in {@link Integer#BYTES}. + */ + static long getIntercept(int segmentDataIndex, int[] segmentData, int keySize) { + assert keySize >= 1 && keySize <= 2; + if (keySize == 1) { + return getKey(segmentDataIndex, segmentData, 0); + } + return getKey(segmentDataIndex, segmentData, 0L); + } + + /** + * Adds the slope of the current segment to the segment data bytes. The slope is stored as a + * float for a key size equal to 1, otherwise it is stored as a double. + * + * @param keySize The size of the key, measured in {@link Integer#BYTES}. + */ + static void addSlope(double slope, IntGrowableArray segmentData, int keySize) { + assert keySize >= 1 && keySize <= 2; + if (keySize == 1) { + addKey((float) slope, segmentData); + } else { + addKey(slope, segmentData); + } + } + + /** + * Gets the slope of the segment at the given data index. + * + * @param keySize The size of the key, measured in {@link Integer#BYTES}. + */ + static double getSlope(int segmentDataIndex, int[] segmentData, int keySize) { + assert keySize >= 1 && keySize <= 2; + if (keySize == 1) { + return getKey(segmentDataIndex, segmentData, 0f); + } + return getKey(segmentDataIndex, segmentData, 0d); + } +} diff --git a/hppc/src/main/java/com/carrotsearch/hppc/PlaModel.java b/hppc/src/main/java/com/carrotsearch/hppc/PlaModel.java new file mode 100644 index 000000000..2da9b14f1 --- /dev/null +++ b/hppc/src/main/java/com/carrotsearch/hppc/PlaModel.java @@ -0,0 +1,406 @@ +/* + * HPPC + * + * Copyright (C) 2010-2022 Carrot Search s.c. + * All rights reserved.
+ * + * Refer to the full license file "LICENSE.txt": + * https://github.com/carrotsearch/hppc/blob/master/LICENSE.txt + */ +package com.carrotsearch.hppc; + +import java.util.Arrays; + +/** + * Optimal Piecewise Linear Approximation Model for KType keys. + * + *

Learns a mapping that returns a position for a KType key which is at most epsilon + * away from the correct one in a sorted list of keys. It is optimal and piecewise because it learns + * the minimum number of epsilon-approximate segments. + * + *

The PLA-model consists of a sequence of segments. A segment s is a triple (key,slope,intercept) + * that indexes a range of keys through the function fs(k) = k × slope + intercept, which provides + * an epsilon-approximation of the position of the key k. + */ +public class PlaModel implements Accountable { + + /** Initial capacity of the lower and upper point lists. */ + private static final int INITIAL_CAPACITY = 1 << 8; + + /** Epsilon precision of the PLA-model. */ + private int epsilon; + /** First key of the current segment. */ + private double firstKey; + /** Previous key used to check that keys are added in strictly increasing sequence. */ + private double previousKey; + /** Number of points in the convex hull for the current segment. */ + private int numPointsInHull; + /** Enclosing rectangle for the current segment. */ + private final Point[] rect = new Point[4]; + /** + * Ordered list of lower points for the current segment. Inside the list, allocated points are + * re-used. + */ + private final PointList lower = new PointList(INITIAL_CAPACITY); + /** + * Ordered list of upper points for the current segment. Inside the list, allocated points are + * re-used. + */ + private final PointList upper = new PointList(INITIAL_CAPACITY); + /** Index of the first lower point to compare to. */ + private int lowerStart; + /** Index of the first upper point to compare to. */ + private int upperStart; + + // Re-used mutable points and slopes. + private final Point point1 = new Point(); + private final Point point2 = new Point(); + private final Slope slope1 = new Slope(); + private final Slope slope2 = new Slope(); + private final Slope slopeTmp = new Slope(); + private final Slope slopeMin = new Slope(); + private final Slope slopeMax = new Slope(); + + /** + * Creates an optimal PLA-model with the provided epsilon precision. + * + * @param epsilon must be greater than or equal to 0.
+ */ + public PlaModel(int epsilon) { + setEpsilon(epsilon); + for (int i = 0; i < rect.length; i++) { + rect[i] = new Point(); + } + reset(); + } + + /** Sets epsilon precision which must be greater than or equal to 0. */ + public void setEpsilon(int epsilon) { + if (epsilon < 0) { + throw new IllegalArgumentException("epsilon must be >= 0"); + } + this.epsilon = epsilon; + } + + private void reset() { + previousKey = Double.NEGATIVE_INFINITY; + numPointsInHull = 0; + lower.clear(); + upper.clear(); + } + + /** + * Adds a key to this PLA-model. The keys must be provided in a strictly increasing sequence. That + * is, the key must be greater than the previous key. + * + * @param index The index of the key in the sorted key list. + * @param segmentConsumer The consumer to call when a new segment is built in the PLA-model. + */ + public void addKey(double key, int index, SegmentConsumer segmentConsumer) { + if (key <= previousKey) { + throw new IllegalArgumentException("Keys must be increasing"); + } + previousKey = key; + point1.set(key, addEpsilon(index)); + point2.set(key, subtractEpsilon(index)); + + if (numPointsInHull > 1) { + slope1.set(rect[0], rect[2]); + slope2.set(rect[1], rect[3]); + boolean outside_line1 = slopeTmp.set(rect[2], point1).isLessThan(slope1); + boolean outside_line2 = slopeTmp.set(rect[3], point2).isGreaterThan(slope2); + if (outside_line1 || outside_line2) { + produceSegment(segmentConsumer); + numPointsInHull = 0; + } + } + if (numPointsInHull == 0) { + firstKey = key; + rect[0].set(point1); + rect[1].set(point2); + upper.clear(); + lower.clear(); + upper.add(point1); + lower.add(point2); + upperStart = lowerStart = 0; + numPointsInHull++; + return; + } + if (numPointsInHull == 1) { + rect[2].set(point2); + rect[3].set(point1); + upper.add(point1); + lower.add(point2); + numPointsInHull++; + return; + } + + if (slopeTmp.set(rect[1], point1).isLessThan(slope2)) { + // Find extreme slope. 
+ slopeMin.set(point1, lower.get(lowerStart)); + int min_i = lowerStart; + for (int i = lowerStart + 1; i < lower.size(); i++) { + slopeTmp.set(point1, lower.get(i)); + if (slopeTmp.isGreaterThan(slopeMin)) { + break; + } + slopeMin.set(slopeTmp); + min_i = i; + } + rect[1].set(lower.get(min_i)); + rect[3].set(point1); + lowerStart = min_i; + + // Hull update. + int end = upper.size(); + while (end >= upperStart + 2 && cross(upper.get(end - 2), upper.get(end - 1), point1) <= 0) { + end--; + } + upper.clearFrom(end); + upper.add(point1); + } + + if (slopeTmp.set(rect[0], point2).isGreaterThan(slope1)) { + // Find extreme slope. + slopeMax.set(point2, upper.get(upperStart)); + int max_i = upperStart; + for (int i = upperStart + 1; i < upper.size(); i++) { + slopeTmp.set(point2, upper.get(i)); + if (slopeTmp.isLessThan(slopeMax)) { + break; + } + slopeMax.set(slopeTmp); + max_i = i; + } + rect[0].set(upper.get(max_i)); + rect[2].set(point2); + upperStart = max_i; + + // Hull update. + int end = lower.size(); + while (end >= lowerStart + 2 && cross(lower.get(end - 2), lower.get(end - 1), point2) >= 0) { + end--; + } + lower.clearFrom(end); + lower.add(point2); + } + + numPointsInHull++; + } + + private void produceSegment(SegmentConsumer segmentConsumer) { + double slope; + long intercept; + + if (numPointsInHull == 1) { + slope = 0d; + intercept = ((long) rect[0].y + rect[1].y) >>> 1; + + } else { + Point p0 = rect[0]; + Point p1 = rect[1]; + Point p2 = rect[2]; + Point p3 = rect[3]; + + // Compute the slope intersection point. + double intersectX; + double intersectY; + slope1.set(p0, p2); + slope2.set(p1, p3); + if (slope1.isEqual(slope2)) { + intersectX = p0.x; + intersectY = p0.y; + } else { + slopeTmp.set(p0, p1); + double a = slope1.dx * slope2.dy - slope1.dy * slope2.dx; + double b = (slopeTmp.dx * slope2.dy - slopeTmp.dy * slope2.dx) / a; + intersectX = p0.x + b * slope1.dx; + intersectY = p0.y + b * slope1.dy; + } + + // Compute the slope range. 
+ double minSlope = Slope.asDouble(p0, p2); + double maxSlope = Slope.asDouble(p1, p3); + + // Compute the segment slope and intercept. + slope = (minSlope + maxSlope) / 2d; + intercept = (long) (intersectY - (intersectX - firstKey) * slope); + } + + segmentConsumer.accept(firstKey, slope, intercept); + } + + /** + * Finishes the PLA-model construction. Declares that no additional keys will be added. Builds the + * last segment and calls the provided {@link SegmentConsumer}. + */ + public void finish(SegmentConsumer segmentConsumer) { + produceSegment(segmentConsumer); + reset(); + } + + @Override + public long ramBytesAllocated() { + // int: epsilon, numPointsInHull, lowerStart, upperStart + // double: firstKey, previousKey + // Point: rect[4], point1, point2 + // Slope: slope1, slope2, slopeTmp, slopeMin, slopeMax + return RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + + 4 * Integer.BYTES + + 2 * Double.BYTES + + 6L * Point.RAM_BYTES_ALLOCATED + + lower.ramBytesAllocated() + + upper.ramBytesAllocated() + + 5L * Slope.RAM_BYTES_ALLOCATED; + } + + @Override + public long ramBytesUsed() { + return ramBytesAllocated(); + } + + private int addEpsilon(int index) { + try { + return Math.addExact(index, epsilon); + } catch (ArithmeticException e) { + return Integer.MAX_VALUE; + } + } + + private int subtractEpsilon(int index) { + try { + return Math.subtractExact(index, epsilon); + } catch (ArithmeticException e) { + return Integer.MIN_VALUE; + } + } + + private static double cross(Point o, Point a, Point b) { + return (a.x - o.x) * (b.y - o.y) - (a.y - o.y) * (b.x - o.x); + } + + /** Consumer notified when a new segment is built by the {@link PlaModel}. */ + public interface SegmentConsumer { + + /** + * Consumes a new segment. The segment is defined by the epsilon-approximation function fs(k) = + * k × slope + intercept. + * + * @param firstKey The first key of the segment. + * @param slope The segment slope. + * @param intercept The segment intercept. 
+ */ + void accept(double firstKey, double slope, long intercept); + } + + /** Re-usable mutable (x,y) point. */ + private static class Point { + + static final int RAM_BYTES_ALLOCATED = + RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + Double.BYTES + Long.BYTES; + + double x; + long y; + + Point set(double x, long y) { + this.x = x; + this.y = y; + return this; + } + + Point set(Point p) { + return set(p.x, p.y); + } + } + + /** List of mutable {@link Point}. Re-uses allocated points instead of creating new instances. */ + private static class PointList implements Accountable { + + Point[] points; + int size; + int numAllocated; + + PointList(int initialCapacity) { + points = new Point[initialCapacity]; + } + + void add(Point point) { + if (size == points.length) { + int newSize = + BoundedProportionalArraySizingStrategy.DEFAULT_INSTANCE.grow(points.length, size, 1); + points = Arrays.copyOf(points, newSize); + } + if (size == numAllocated) { + points[numAllocated++] = new Point(); + } + points[size++].set(point); + } + + Point get(int index) { + return points[index]; + } + + int size() { + return size; + } + + void clear() { + size = 0; + } + + void clearFrom(int end) { + size = end; + } + + @Override + public long ramBytesAllocated() { + // int: size, numAllocated + return RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + + 2 * Integer.BYTES + + RamUsageEstimator.shallowSizeOfArray(points) + + (long) numAllocated * Point.RAM_BYTES_ALLOCATED; + } + + @Override + public long ramBytesUsed() { + return ramBytesAllocated(); + } + } + + /** Re-usable mutable (dx,dy) slope. 
*/ + private static class Slope { + + static final int RAM_BYTES_ALLOCATED = + RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + Double.BYTES + Long.BYTES; + + double dx; + long dy; + + void set(Slope s) { + dx = s.dx; + dy = s.dy; + } + + Slope set(Point p1, Point p2) { + dx = p2.x - p1.x; + dy = p2.y - p1.y; + return this; + } + + boolean isLessThan(Slope s) { + return dy * s.dx < dx * s.dy; + } + + boolean isGreaterThan(Slope s) { + return dy * s.dx > dx * s.dy; + } + + boolean isEqual(Slope s) { + return Double.doubleToLongBits(dy * s.dx) == Double.doubleToLongBits(dx * s.dy); + } + + static double asDouble(Point p1, Point p2) { + return (double) (p2.y - p1.y) / (p2.x - p1.x); + } + } +} diff --git a/hppc/src/main/java/com/carrotsearch/hppc/RamUsageEstimator.java b/hppc/src/main/java/com/carrotsearch/hppc/RamUsageEstimator.java index f93fd9284..5484caf1e 100644 --- a/hppc/src/main/java/com/carrotsearch/hppc/RamUsageEstimator.java +++ b/hppc/src/main/java/com/carrotsearch/hppc/RamUsageEstimator.java @@ -10,7 +10,6 @@ package com.carrotsearch.hppc; import java.lang.reflect.Array; -import java.lang.reflect.Field; import java.lang.reflect.Method; import java.util.Collections; import java.util.IdentityHashMap; @@ -59,12 +58,12 @@ private RamUsageEstimator() {} Map, Integer> primitiveSizesMap = new IdentityHashMap<>(); primitiveSizesMap.put(boolean.class, 1); primitiveSizesMap.put(byte.class, 1); - primitiveSizesMap.put(char.class, Integer.valueOf(Character.BYTES)); - primitiveSizesMap.put(short.class, Integer.valueOf(Short.BYTES)); - primitiveSizesMap.put(int.class, Integer.valueOf(Integer.BYTES)); - primitiveSizesMap.put(float.class, Integer.valueOf(Float.BYTES)); - primitiveSizesMap.put(double.class, Integer.valueOf(Double.BYTES)); - primitiveSizesMap.put(long.class, Integer.valueOf(Long.BYTES)); + primitiveSizesMap.put(char.class, Character.BYTES); + primitiveSizesMap.put(short.class, Short.BYTES); + primitiveSizesMap.put(int.class, Integer.BYTES); + 
primitiveSizesMap.put(float.class, Float.BYTES); + primitiveSizesMap.put(double.class, Double.BYTES); + primitiveSizesMap.put(long.class, Long.BYTES); primitiveSizes = Collections.unmodifiableMap(primitiveSizesMap); } @@ -76,7 +75,7 @@ private RamUsageEstimator() {} static final String OS_ARCH = System.getProperty("os.arch"); - /** Initialize constants and try to collect information about the JVM internals. */ + // Initialize constants and try to collect information about the JVM internals. static { boolean is64Bit = false; String datamodel = null; @@ -88,11 +87,7 @@ private RamUsageEstimator() {} } catch (SecurityException ex) { } if (datamodel == null) { - if (OS_ARCH != null && OS_ARCH.contains("64")) { - is64Bit = true; - } else { - is64Bit = false; - } + is64Bit = OS_ARCH != null && OS_ARCH.contains("64"); } JRE_IS_64BIT = is64Bit; if (JRE_IS_64BIT) { @@ -171,29 +166,6 @@ public static long shallowUsedSizeOfArray(Object array, int usedSize) { /** Return shallow size of any array. */ public static long shallowSizeOfArray(Object array) { - long size = NUM_BYTES_ARRAY_HEADER; - final int len = Array.getLength(array); - if (len > 0) { - Class arrayElementClazz = array.getClass().getComponentType(); - if (arrayElementClazz.isPrimitive()) { - size += (long) len * primitiveSizes.get(arrayElementClazz); - } else { - size += (long) NUM_BYTES_OBJECT_REF * len; - } - } - return alignObjectSize(size); - } - - /** - * This method returns the maximum representation size of an object. sizeSoFar is the - * object's size measured so far. f is the field being probed. - * - *

The returned offset will be the maximum of whatever was measured so far and f - * field's offset and representation size (unaligned). - */ - static long adjustForField(long sizeSoFar, final Field f) { - final Class type = f.getType(); - final int fsize = type.isPrimitive() ? primitiveSizes.get(type) : NUM_BYTES_OBJECT_REF; - return sizeSoFar + fsize; + return shallowUsedSizeOfArray(array, Array.getLength(array)); } } diff --git a/hppc/src/main/templates/com/carrotsearch/hppc/KTypePgmIndex.java b/hppc/src/main/templates/com/carrotsearch/hppc/KTypePgmIndex.java new file mode 100644 index 000000000..53c42ae01 --- /dev/null +++ b/hppc/src/main/templates/com/carrotsearch/hppc/KTypePgmIndex.java @@ -0,0 +1,589 @@ +/*! #set($TemplateOptions.ignored = ($TemplateOptions.isKTypeAnyOf("GENERIC", "BYTE", "SHORT", "CHAR"))) !*/ +package com.carrotsearch.hppc; + +import com.carrotsearch.hppc.cursors.KTypeCursor; +import com.carrotsearch.hppc.procedures.KTypeProcedure; + +import java.util.Arrays; +import java.util.Iterator; + +/** + * Space-efficient index that enables fast rank/range search operations on a sorted sequence + * of KType. + *

Implementation of the PGM-Index described at + * https://pgm.di.unipi.it/, based on the paper + *

+ *   Paolo Ferragina and Giorgio Vinciguerra.
+ *   The PGM-index: a fully-dynamic compressed learned index with provable worst-case bounds.
+ *   PVLDB, 13(8): 1162-1175, 2020.
+ * 
+ * It provides {@code rank} and {@code range} search operations. + * {@code indexOf()} is faster than B+Tree, and the index is much more compact. + * {@code contains()} is between 4x and 7x slower than {@code IntHashSet#contains()}, but + * between 2.5x and 3x faster than {@link Arrays#binarySearch}. + *

+ * Its compactness (40KB for 200MB of keys) makes it efficient for very large collections, + * the index fitting easily in the L2 cache. The {@code epsilon} parameter should be set + * according to the desired space-time trade-off. A smaller value makes the estimation more + * precise and the range smaller but at the cost of increased space usage. In practice, + * {@code epsilon} 64 is a good sweet spot. + *

+ * Internally the index uses an optimal piecewise linear mapping from keys to their position + * in the sorted order. This mapping is represented as a sequence of linear models (segments) + * which are themselves recursively indexed by other piecewise linear mappings. + */ +/*! ${TemplateOptions.generatedAnnotation} !*/ +/*! #if ($templateonly) !*/ @SuppressWarnings({"rawtypes", "unchecked"}) /*! #end !*/ +public class KTypePgmIndex implements Accountable { + + /** Empty immutable KTypePgmIndex. */ + public static final KTypePgmIndex EMPTY = new KTypeEmptyPgmIndex(); + + /** + * Epsilon approximation range when searching the list of keys. + * Controls the size of the returned search range, strictly greater than 0. + * It should be set according to the desired space-time trade-off. A smaller value makes the + * estimation more precise and the range smaller but at the cost of increased space usage. + *

+ * With EPSILON=64 the benchmark with 200MB of keys shows that this PGM index requires + * only 2% additional memory on average (40KB). It depends on the distribution of the keys. + * This epsilon value is good even for 2MB of keys. + * With EPSILON=32: +5% speed, but 4x space (160KB). + */ + public static final int EPSILON = 64; + /** + * Epsilon approximation range for the segments layers. + * Controls the size of the search range in the hierarchical segment lists, strictly greater than 0. + */ + public static final int EPSILON_RECURSIVE = 32; + /** Size of a key, measured in {@link Integer#BYTES} because the key is stored in an int[]. */ + public static final int KEY_SIZE = RamUsageEstimator.primitiveSizes + .get(/*! #if ($TemplateOptions.KTypeGeneric) !*/ Object /*! #else KType #end !*/.class) / Integer.BYTES; + /** 2x {@link #KEY_SIZE}. */ + public static final int DOUBLE_KEY_SIZE = KEY_SIZE * 2; + /** Data size of a segment, measured in {@link Integer#BYTES}, because segments are stored in an int[]. */ + public static final int SEGMENT_DATA_SIZE = KEY_SIZE * 3; + /** Initial value of the exponential jump when scanning out of the epsilon range. */ + public static final int BEYOND_EPSILON_JUMP = 16; + + /** The list of keys for which this index is built. It is sorted and may contain duplicate elements. */ + public final KTypeArrayList keys; + /** The size of the key set. That is, the number of distinct elements in {@link #keys}. */ + public final int size; + /** The lowest key in {@link #keys}. */ + public final KType firstKey; + /** The highest key in {@link #keys}. */ + public final KType lastKey; + /** The epsilon range used to build this index. */ + public final int epsilon; + /** The recursive epsilon range used to build this index. */ + public final int epsilonRecursive; + /** The offsets in {@link #segmentData} of the first segment of each segment level. */ + public final int[] levelOffsets; + /** The index data. 
It contains all the segments for all the levels. */ + public final int[] segmentData; + + private KTypePgmIndex(KTypeArrayList keys, + int size, + int epsilon, + int epsilonRecursive, + int[] levelOffsets, + int[] segmentData) { + assert keys.size() > 0; + assert size > 0 && size <= keys.size(); + assert epsilon > 0; + assert epsilonRecursive > 0; + this.keys = keys; + this.size = size; + firstKey = keys.get(0); + lastKey = keys.get(keys.size() - 1); + this.epsilon = epsilon; + this.epsilonRecursive = epsilonRecursive; + this.levelOffsets = levelOffsets; + this.segmentData = segmentData; + } + + /** Empty set constructor. */ + private KTypePgmIndex() { + keys = new KTypeArrayList(0); + size = 0; + firstKey = Intrinsics.empty(); + lastKey = Intrinsics.empty(); + epsilon = 0; + epsilonRecursive = 0; + levelOffsets = new int[0]; + segmentData = levelOffsets; + } + + /** Returns the size of the key set. That is, the number of distinct elements in {@link #keys}. */ + public int size() { + return size; + } + + /** Returns whether this key set is empty. */ + public boolean isEmpty() { + return size() == 0; + } + + /** Returns whether this key set contains the given key. */ + public boolean contains(KType key) { + return indexOf(key) >= 0; + } + + /** + * Searches the specified key, and returns its index in the element list. + * If multiple elements are equal to the specified key, there is no + * guarantee which one will be found. + * @return The index of the searched key if it is present; + * otherwise, {@code (-(insertion point) - 1)}. The + * insertion point is defined as the point at which the + * key would be inserted into the list: the index of the first + * element greater than the key, or {@link #keys}#{@code size()} + * if all the elements are less than the specified key. Note that + * this guarantees that the return value will be >= 0 if and + * only if the key is found. 
+ */ + public int indexOf(KType key) { + if (Intrinsics.numeric(key) < Intrinsics.numeric(firstKey)) { + return -1; + } + if (Intrinsics.numeric(key) > Intrinsics.numeric(lastKey)) { + return -keys.size() - 1; + } + final int[] segmentData = this.segmentData; + int segmentDataIndex = findSegment(key); + int nextIntercept = (int) getIntercept(segmentDataIndex + SEGMENT_DATA_SIZE, segmentData); + int index = Math.min(approximateIndex(key, segmentDataIndex, segmentData), Math.min(nextIntercept, keys.size() - 1)); + assert index >= 0 && index < keys.size(); + KType k = keys.get(index); + if (Intrinsics.numeric(key) < Intrinsics.numeric(k)) { + // Scan sequentially before the approximated index, within epsilon range. + final int fromIndex = Math.max(index - epsilon - 1, 0); + while (--index >= fromIndex) { + k = keys.get(index); + if (Intrinsics.numeric(key) > Intrinsics.numeric(k)) { + return -index - 2; + } + if (Intrinsics.equals(key, k)) { + return index; + } + } + // Continue scanning out of the epsilon range. + // This might happen in rare cases of precision error during the approximation + // computation for longs (we don't have long double 128 bits in Java). + // This might also happen in rare corner cases of large duplicate elements + // sequence at the epsilon range boundary. + index++; + int jump = BEYOND_EPSILON_JUMP; + do { + int loIndex = Math.max(index - jump, 0); + if (Intrinsics.numeric(key) >= Intrinsics.numeric(keys.get(loIndex))) { + return Arrays.binarySearch(keys.buffer, loIndex, index, key); + } + index = loIndex; + jump <<= 1; + } while (index > 0); + return -1; + } else if (Intrinsics.equals(key, k)) { + return index; + } else { + // Scan sequentially after the approximated index, within epsilon range. 
+ final int toIndex = Math.min(index + epsilon + 3, keys.size()); + while (++index < toIndex) { + k = keys.get(index); + if (Intrinsics.numeric(key) < Intrinsics.numeric(k)) { + return -index - 1; + } + if (Intrinsics.equals(key, k)) { + return index; + } + } + // Continue scanning out of the epsilon range. + int jump = BEYOND_EPSILON_JUMP; + do { + int hiIndex = Math.min(index + jump, keys.size()); + if (Intrinsics.numeric(key) <= Intrinsics.numeric(keys.get(hiIndex))) { + return Arrays.binarySearch(keys.buffer, index, hiIndex, key); + } + index = hiIndex; + jump <<= 1; + } while (index < keys.size()); + return -keys.size() - 1; + } + } + + /** + * Returns, for any value {@code x}, the number of keys in the sorted list + * which are smaller than {@code x}. + * It is equal to {@link #indexOf} if {@code x} belongs to the list, + * or -{@link #indexOf}-1 otherwise. + * + *

If multiple elements are equal to the specified key, there is no + * guarantee which one will be found. + * + * @return The index of the searched key if it is present; + * otherwise, the {@code insertion point}. The + * insertion point is defined as the point at which the + * key would be inserted into the list: the index of the first + * element greater than the key, or {@link #keys}#{@code size()} + * if all the elements are less than the specified key. Note that + * this method always returns a value >= 0. + */ + public int rank(KType x) { + int index = indexOf(x); + return index >= 0 ? index : -index - 1; + } + + /** + * Returns the number of keys in the list that are greater than or equal to + * {@code minKey} (inclusive), and less than or equal to {@code maxKey} (inclusive). + */ + public int rangeCardinality(KType minKey, KType maxKey) { + int fromIndex = rank(minKey); + int maxIndex = indexOf(maxKey); + int toIndex = maxIndex >= 0 ? maxIndex + 1 : -maxIndex - 1; + return Math.max(toIndex - fromIndex, 0); + } + + /** + * Returns an iterator over the keys in the list that are greater than or equal to + * {@code minKey} (inclusive), and less than or equal to {@code maxKey} (inclusive). + */ + public Iterator> rangeIterator(KType minKey, KType maxKey) { + int fromIndex = rank(minKey); + return new RangeIterator(keys, fromIndex, maxKey); + } + + /** + * Applies {@code procedure} to the keys in the list that are greater than or equal + * to {@code minKey} (inclusive), and less than or equal to {@code maxKey} (inclusive). + */ + public > T forEachInRange(T procedure, KType minKey, KType maxKey) { + final KType [] buffer = Intrinsics.cast(keys.buffer); + KType k; + for (int i = rank(minKey), size = keys.size(); i < size && Intrinsics.numeric((k = buffer[i])) <= Intrinsics.numeric(maxKey); i++) { + procedure.apply(k); + } + return procedure; + } + + /** + * Estimates the allocated memory. 
+ * It does not count the memory for the list of keys, only for the index itself. + */ + @Override + public long ramBytesAllocated() { + // int: size, epsilon, epsilonRecursive + // KType: firstKey, lastKey + return RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 3 * Integer.BYTES + + 2L * KEY_SIZE * Integer.BYTES + //+ keys.ramBytesAllocated() + + RamUsageEstimator.shallowSizeOfArray(levelOffsets) + + RamUsageEstimator.shallowSizeOfArray(segmentData); + } + + /** + * Estimates the bytes that are actually used. + * It does not count the memory for the list of keys, only for the index itself. + */ + @Override + public long ramBytesUsed() { + // int: size, epsilon, epsilonRecursive + // KType: firstKey, lastKey + return RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 3 * Integer.BYTES + + 2L * KEY_SIZE * Integer.BYTES + //+ keys.ramBytesUsed() + + RamUsageEstimator.shallowSizeOfArray(levelOffsets) + + RamUsageEstimator.shallowSizeOfArray(segmentData); + } + + /** + * Finds the segment responsible for a given key, that is, + * the rightmost segment having its first key <= the searched key. + * @return the segment data index; or -1 if none. 
+ */ + private int findSegment(KType key) { + assert Intrinsics.numeric(key) >= Intrinsics.numeric(firstKey) && Intrinsics.numeric(key) <= Intrinsics.numeric(lastKey); + final int epsilonRecursive = this.epsilonRecursive; + final int[] levelOffsets = this.levelOffsets; + final int[] segmentData = this.segmentData; + int level = levelOffsets.length - 1; + int segmentDataIndex = levelOffsets[level] * SEGMENT_DATA_SIZE; + while (--level >= 0) { + int nextIntercept = (int) getIntercept(segmentDataIndex + SEGMENT_DATA_SIZE, segmentData); + int index = Math.min(approximateIndex(key, segmentDataIndex, segmentData), nextIntercept); + assert index >= 0 && index <= levelOffsets[level + 1] - levelOffsets[level] - 1; + int sdIndex = (levelOffsets[level] + index) * SEGMENT_DATA_SIZE; + if (Intrinsics.numeric(getKey(sdIndex, segmentData)) <= Intrinsics.numeric(key)) { + // Scan sequentially segments after the approximated index, within the epsilon range. + final int levelNumSegments = levelOffsets[level + 1] - levelOffsets[level] - 1; + final int toIndex = Math.min(index + epsilonRecursive + 3, levelNumSegments); + while (index++ < toIndex + && Intrinsics.numeric(getKey(sdIndex + SEGMENT_DATA_SIZE, segmentData)) <= Intrinsics.numeric(key)) { + sdIndex += SEGMENT_DATA_SIZE; + } + } else { + // Scan sequentially segments before the approximated index, within the epsilon range. 
+ final int fromIndex = Math.max(index - epsilonRecursive - 1, 0); + while (index-- > fromIndex) { + sdIndex -= SEGMENT_DATA_SIZE; + if (Intrinsics.numeric(getKey(sdIndex, segmentData)) <= Intrinsics.numeric(key)) { + break; + } + } + } + segmentDataIndex = sdIndex; + } + assert segmentDataIndex >= 0; + return segmentDataIndex; + } + + private int approximateIndex(KType key, int segmentDataIndex, int[] segmentData) { + long intercept = getIntercept(segmentDataIndex, segmentData); + KType sKey = getKey(segmentDataIndex, segmentData); + double slope = getSlope(segmentDataIndex, segmentData); + int index = (int) (slope * ((double) Intrinsics.numeric(key) - Intrinsics.numeric(sKey)) + intercept); + return Math.max(index, 0); + } + + private static long getIntercept(int segmentDataIndex, int[] segmentData) { + return PgmIndexUtil.getIntercept(segmentDataIndex, segmentData, KEY_SIZE); + } + + private KType getKey(int segmentDataIndex, int[] segmentData) { + return PgmIndexUtil.getKey(segmentDataIndex + KEY_SIZE, segmentData, Intrinsics.empty()); + } + + private static double getSlope(int segmentDataIndex, int[] segmentData) { + return PgmIndexUtil.getSlope(segmentDataIndex + DOUBLE_KEY_SIZE, segmentData, KEY_SIZE); + } + + /** Empty immutable PGM Index. */ + private static class KTypeEmptyPgmIndex extends KTypePgmIndex { + + private final Iterator> emptyIterator = new KTypeEmptyIterator(); + + @Override + public int indexOf(KType key) { + return -1; + } + + @Override + public Iterator> rangeIterator(KType minKey, KType maxKey) { + return emptyIterator; + } + + @Override + public > T forEachInRange(T procedure, KType minKey, KType maxKey) { + return procedure; + } + + private static class KTypeEmptyIterator extends AbstractIterator> { + @Override + protected KTypeCursor fetch() { + return done(); + } + } + } + + /** + * Iterator over a range of elements in a sorted array. 
+ */ + protected static class RangeIterator extends AbstractIterator> { + private final KType[] buffer; + private final int size; + private final KTypeCursor cursor; + private final KType maxKey; + + /** + * Range iterator from {@code fromIndex} (inclusive) to {@code maxKey} (inclusive). + */ + protected RangeIterator(KTypeArrayList keys, int fromIndex, KType maxKey) { + this.buffer = Intrinsics.cast(keys.buffer); + this.size = keys.size(); + this.cursor = new KTypeCursor(); + this.cursor.index = fromIndex; + this.maxKey = maxKey; + } + + @Override + protected KTypeCursor fetch() { + if (cursor.index >= size) { + return done(); + } + cursor.value = buffer[cursor.index++]; + if (Intrinsics.numeric(cursor.value) > Intrinsics.numeric(maxKey)) { + cursor.index = size; + return done(); + } + return cursor; + } + } + + /** Builds a {@link KTypePgmIndex} on a provided sorted list of keys. */ + /*! #if ($templateonly) !*/ @SuppressWarnings({"unchecked"}) /*! #end !*/ + public static class KTypeBuilder implements PlaModel.SegmentConsumer, Accountable { + + protected KTypeArrayList keys; + protected int epsilon = EPSILON; + protected int epsilonRecursive = EPSILON_RECURSIVE; + protected PlaModel plam; + protected int size; + protected IntGrowableArray segmentData; + protected int numSegments; + + /** Sets the sorted list of keys to build the index for; duplicate elements are allowed. */ + public KTypeBuilder setSortedKeys(KTypeArrayList keys) { + this.keys = keys; + return this; + } + + /** Sets the sorted array of keys to build the index for; duplicate elements are allowed. */ + public KTypeBuilder setSortedKeys(KType[] keys, int length) { + KTypeArrayList keyList = new KTypeArrayList(0); + keyList.buffer = keys; + keyList.elementsCount = length; + return setSortedKeys(keyList); + } + + /** Sets the epsilon range to use when learning the segments for the list of keys. 
*/ + public KTypeBuilder setEpsilon(int epsilon) { + if (epsilon <= 0) { + throw new IllegalArgumentException("epsilon must be > 0"); + } + this.epsilon = epsilon; + return this; + } + + /** Sets the recursive epsilon range to use when learning the segments for the segment levels. */ + public KTypeBuilder setEpsilonRecursive(int epsilonRecursive) { + if (epsilonRecursive <= 0) { + throw new IllegalArgumentException("epsilonRecursive must be > 0"); + } + this.epsilonRecursive = epsilonRecursive; + return this; + } + + /** Builds the {@link KTypePgmIndex}; or {@link #EMPTY} if there are no keys in the list. */ + public KTypePgmIndex build() { + if (keys == null || keys.size() == 0) { + return (KTypePgmIndex) EMPTY; + } + plam = new PlaModel(epsilon); + + int segmentsInitialCapacity = Math.min(Math.max(keys.size() / (2 * epsilon * epsilon) * SEGMENT_DATA_SIZE, 16), 1 << 19); + segmentData = new IntGrowableArray(segmentsInitialCapacity); + IntGrowableArray levelOffsets = new IntGrowableArray(16); + + int levelOffset = 0; + levelOffsets.add(levelOffset); + int levelNumSegments = buildFirstLevel(); + while (levelNumSegments > 1) { + int nextLevelOffset = numSegments; + levelOffsets.add(nextLevelOffset); + levelNumSegments = buildUpperLevel(levelOffset, levelNumSegments); + levelOffset = nextLevelOffset; + } + + int[] segmentDataFinal = segmentData.toArray(); + int[] levelOffsetsFinal = levelOffsets.toArray(); + return new KTypePgmIndex(keys, + size, + epsilon, + epsilonRecursive, + levelOffsetsFinal, + segmentDataFinal); + } + + private int buildFirstLevel() { + assert numSegments == 0; + int numKeys = keys.size(); + int size = 0; + KType key = keys.get(0); + size++; + plam.addKey(Intrinsics.numeric(key), 0, this); + for (int i = 1; i < numKeys; i++) { + KType nextKey = keys.get(i); + if (!Intrinsics.equals(nextKey, key)) { + key = nextKey; + plam.addKey(Intrinsics.numeric(key), i, this); + size++; + } + } + plam.finish(this); + addSentinelSegment(numKeys); + this.size = 
size; + return numSegments - 1; + } + + private int buildUpperLevel(int levelOffset, int levelNumSegments) { + plam.setEpsilon(epsilonRecursive); + assert numSegments > 0; + int initialNumSegments = numSegments; + int segmentDataIndex = levelOffset * SEGMENT_DATA_SIZE; + KType key = getKey(segmentDataIndex, segmentData.buffer); + plam.addKey(Intrinsics.numeric(key), 0, this); + for (int i = 1; i < levelNumSegments; i++) { + segmentDataIndex += SEGMENT_DATA_SIZE; + KType nextKey = getKey(segmentDataIndex, segmentData.buffer); + if (!Intrinsics.equals(nextKey, key)) { + key = nextKey; + plam.addKey(Intrinsics.numeric(key), i, this); + } + } + plam.finish(this); + addSentinelSegment(levelNumSegments); + return numSegments - initialNumSegments - 1; + } + + private KType getKey(int segmentDataIndex, int[] segmentData) { + return PgmIndexUtil.getKey(segmentDataIndex + KEY_SIZE, segmentData, Intrinsics.empty()); + } + + /** + * Adds a sentinel segment that is used to give a limit for the position approximation, + * but does not count in the number of segments per level. + */ + private void addSentinelSegment(int endIndex) { + // This sentinel segment is used in findSegment(). + accept(Double.MAX_VALUE, 0d, endIndex); + } + + @Override + public void accept(double firstKey, double slope, long intercept) { + PgmIndexUtil.addIntercept(intercept, segmentData, KEY_SIZE); + PgmIndexUtil.addKey((KType)Intrinsics.cast(firstKey), segmentData); + PgmIndexUtil.addSlope(slope, segmentData, KEY_SIZE); + numSegments++; + assert segmentData.size == numSegments * SEGMENT_DATA_SIZE; + } + + /** + * Estimates the allocated memory. + * It does not count the memory for the list of keys, only for the builder itself. 
+ */ + @Override + public long ramBytesAllocated() { + // int: epsilon, epsilonRecursive, size, numSegments + return RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + + 4 * Integer.BYTES + //+ keys.ramBytesAllocated() + + plam.ramBytesAllocated() + + segmentData.ramBytesAllocated(); + } + + /** + * Estimates the bytes that are actually used. + * It does not count the memory for the list of keys, only for the builder itself. + */ + @Override + public long ramBytesUsed() { + // int: epsilon, epsilonRecursive, size, numSegments + return RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + + 4 * Integer.BYTES + //+ keys.ramBytesUsed() + + plam.ramBytesUsed() + + segmentData.ramBytesUsed(); + } + } +} \ No newline at end of file diff --git a/hppc/src/test/templates/com/carrotsearch/hppc/KTypePgmIndexTest.java b/hppc/src/test/templates/com/carrotsearch/hppc/KTypePgmIndexTest.java new file mode 100644 index 000000000..1892e40ce --- /dev/null +++ b/hppc/src/test/templates/com/carrotsearch/hppc/KTypePgmIndexTest.java @@ -0,0 +1,199 @@ +/*! #set($TemplateOptions.ignored = ($TemplateOptions.isKTypeAnyOf("GENERIC", "BYTE", "SHORT", "CHAR"))) !*/ +package com.carrotsearch.hppc; + +import com.carrotsearch.hppc.cursors.KTypeCursor; +import com.carrotsearch.hppc.procedures.KTypeProcedure; +import com.carrotsearch.randomizedtesting.RandomizedTest; +import org.junit.Ignore; +import org.junit.Test; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Random; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +/** + * Tests {@link KTypePgmIndex}. + */ +/*! ${TemplateOptions.generatedAnnotation} !*/ +public class KTypePgmIndexTest extends AbstractKTypeTest { + + /*! #if ($TemplateOptions.KTypeGeneric) !*/ @Ignore /*! 
#end !*/ + @Test + public void testSanityOneSegmentLevel() { + KType[] keys = asArray(2, 12, 115, 118, 123, 1024, 1129, 1191, 1201, 4034); + KTypeArrayList keyList = new KTypeArrayList<>(); + keyList.add(keys, 0, keys.length); + KTypePgmIndex.KTypeBuilder builder = new KTypePgmIndex.KTypeBuilder() + .setSortedKeys(keyList) + .setEpsilon(4) + .setEpsilonRecursive(2); + KTypePgmIndex pgmIndex = builder.build(); + assertEquals(keys.length, pgmIndex.size()); + for (KType key : keys) { + assertTrue(pgmIndex.contains(key)); + } + assertFalse(pgmIndex.contains(cast(1))); + assertFalse(pgmIndex.contains(cast(116))); + assertFalse(pgmIndex.contains(cast(120))); + assertFalse(pgmIndex.contains(cast(1190))); + assertFalse(pgmIndex.contains(cast(1192))); + assertFalse(pgmIndex.contains(cast(1200))); + assertFalse(pgmIndex.contains(cast(2000))); + assertFalse(pgmIndex.contains(cast(4031))); + System.out.println("pgmIndex.ramBytesAllocated() = " + pgmIndex.ramBytesAllocated() + " B"); + System.out.println("pgmIndex.ramBytesUsed() = " + pgmIndex.ramBytesUsed() + " B"); + System.out.println("builder.ramBytesAllocated() = " + builder.ramBytesAllocated() + " B"); + } + + /*! #if ($TemplateOptions.KTypeGeneric) !*/ @Ignore /*! #end !*/ + @Test + public void testSanityTwoSegmentLevels() { + KType[] keys = asArray(2, 12, 115, 118, 123, 1024, 1129, 1191, 1201, 4034, 4035, 4036, 4037, 4039, 4900); + KTypePgmIndex.KTypeBuilder builder = new KTypePgmIndex.KTypeBuilder() + .setSortedKeys(keys, keys.length) + .setEpsilon(1) + .setEpsilonRecursive(1); + KTypePgmIndex pgmIndex = builder.build(); + assertEquals(keys.length, pgmIndex.size()); + for (KType key : keys) { + assertTrue(pgmIndex.contains(key)); + } + System.out.println("pgmIndex.ramBytesAllocated() = " + pgmIndex.ramBytesAllocated() + " B"); + System.out.println("pgmIndex.ramBytesUsed() = " + pgmIndex.ramBytesUsed() + " B"); + System.out.println("builder.ramBytesAllocated() = " + builder.ramBytesAllocated() + " B"); + } + + /*! 
#if ($TemplateOptions.KTypeGeneric) !*/ @Ignore /*! #end !*/ + @Test + public void testRangeIterator() { + KType[] keys = asArray(2, 12, 115, 118, 123, 1024, 1129, 1191, 1201, 4034, 4035, 4036, 4037, 4039, 4900); + KTypePgmIndex.KTypeBuilder builder = new KTypePgmIndex.KTypeBuilder() + .setSortedKeys(keys, keys.length) + .setEpsilon(1) + .setEpsilonRecursive(1); + KTypePgmIndex pgmIndex = builder.build(); + assertIterator(123, 1191, pgmIndex, 123, 1024, 1129, 1191); + assertIterator(1100, 1300, pgmIndex, 1129, 1191, 1201); + assertIterator(-1, 100, pgmIndex, 2, 12); + assertIterator(Integer.MIN_VALUE, 100, pgmIndex, 2, 12); + assertIterator(Integer.MIN_VALUE, Integer.MAX_VALUE, pgmIndex, 2, 12, 115, 118, 123, 1024, 1129, 1191, 1201, 4034, 4035, 4036, 4037, 4039, 4900); + assertIterator(4036, Integer.MAX_VALUE, pgmIndex, 4036, 4037, 4039, 4900); + assertIterator(4039, 4500, pgmIndex, 4039); + assertIterator(4040, 4500, pgmIndex); + } + + private void assertIterator(int minKey, int maxKey, KTypePgmIndex pgmIndex, int... expectedKeys) { + Iterator> iterator = pgmIndex.rangeIterator(cast(minKey), cast(maxKey)); + for (int expectedKey : expectedKeys) { + if (randomBoolean()) { + assertTrue(iterator.hasNext()); + } + assertTrue(Intrinsics.equals(cast(expectedKey), iterator.next().value)); + } + assertFalse(iterator.hasNext()); + assertEquals(expectedKeys.length, pgmIndex.rangeCardinality(cast(minKey), cast(maxKey))); + } + + /*! #if ($TemplateOptions.KTypeGeneric) !*/ @Ignore /*! 
#end !*/ + @Test + public void testRangeProcedure() { + KType[] keys = asArray(2, 12, 115, 118, 123, 1024, 1129, 1191, 1201, 4034, 4035, 4036, 4037, 4039, 4900); + KTypePgmIndex.KTypeBuilder builder = new KTypePgmIndex.KTypeBuilder() + .setSortedKeys(keys, keys.length) + .setEpsilon(1) + .setEpsilonRecursive(1); + KTypePgmIndex pgmIndex = builder.build(); + assertProcedure(123, 1191, pgmIndex, 123, 1024, 1129, 1191); + assertProcedure(1100, 1300, pgmIndex, 1129, 1191, 1201); + assertProcedure(-1, 100, pgmIndex, 2, 12); + assertProcedure(Integer.MIN_VALUE, 100, pgmIndex, 2, 12); + assertProcedure(Integer.MIN_VALUE, Integer.MAX_VALUE, pgmIndex, 2, 12, 115, 118, 123, 1024, 1129, 1191, 1201, 4034, 4035, 4036, 4037, 4039, 4900); + assertProcedure(4036, Integer.MAX_VALUE, pgmIndex, 4036, 4037, 4039, 4900); + assertProcedure(4039, 4500, pgmIndex, 4039); + assertProcedure(4040, 4500, pgmIndex); + } + + private void assertProcedure(int minKey, int maxKey, KTypePgmIndex pgmIndex, int... expectedKeys) { + KTypeArrayList processedKeys = new KTypeArrayList(); + KTypeProcedure procedure = new KTypeProcedure() { + @Override + public void apply(KType key) { + processedKeys.add(key); + } + }; + pgmIndex.forEachInRange(procedure, cast(minKey), cast(maxKey)); + assertEquals(KTypeArrayList.from(asArray(expectedKeys)), processedKeys); + } + + /*! #if ($TemplateOptions.KTypeGeneric) !*/ @Ignore /*! #end !*/ + @Test + public void testAgainstHashSet() { + final Random random = RandomizedTest.getRandom(); + for (int i = 0; i < 1; i++) { + //System.out.println("Loop " + i); + + KType[] additions = Intrinsics.newArray(1_000_000); + for (int j = 0; j < additions.length; j++) { + additions[j] = Intrinsics.cast( + /*! #if ($TemplateOptions.isKTypeAnyOf("INT")) !*/ random.nextInt() /*! #end !*/ + /*! #if ($TemplateOptions.isKTypeAnyOf("LONG")) random.nextLong() #end !*/ + /*! #if ($TemplateOptions.isKTypeAnyOf("FLOAT")) random.nextFloat() * random.nextInt() #end !*/ + /*! 
#if ($TemplateOptions.isKTypeAnyOf("DOUBLE")) random.nextDouble() * random.nextLong() #end !*/ + ); + } + Arrays.sort(additions); + // Make sure there is at least one sequence of duplicate keys. + int originalKeyIndex = random.nextInt(100_000); + for (int j = 0, numDups = random.nextInt(1_000) + 1; j < numDups; j++) { + additions[originalKeyIndex + j + 1] = additions[originalKeyIndex]; + } + + KTypePgmIndex.KTypeBuilder builder = + new KTypePgmIndex.KTypeBuilder() + .setSortedKeys(additions, additions.length); + if (random.nextBoolean()) { + builder.setEpsilon(random.nextInt(128) + 1); + builder.setEpsilonRecursive(random.nextInt(16) + 1); + } + KTypePgmIndex pgmIndex = builder.build(); + + Set hashSet = new HashSet<>(); + for (KType addition : additions) { + hashSet.add(addition); + } + + assertEquals(hashSet.size(), pgmIndex.size()); + for (int j = 0; j < additions.length; j++) { + assertTrue(String.valueOf(j), pgmIndex.contains(additions[j])); + assertTrue(Intrinsics.equals(additions[j], additions[pgmIndex.indexOf(additions[j])])); + } + random.ints(1_000_000).forEach((key) -> { + assertEquals(String.valueOf(key), hashSet.contains(cast(key)), pgmIndex.contains(cast(key))); + int index = pgmIndex.indexOf(cast(key)); + if (hashSet.contains(cast(key))) { + assertTrue(Intrinsics.equals(key, additions[index])); + } else { + int insertionIndex = -index - 1; + assertTrue(insertionIndex >= 0); + assertTrue(insertionIndex <= additions.length); + if (insertionIndex < additions.length) { + assertTrue(String.valueOf(key), Intrinsics.numeric(additions[insertionIndex]) > key); + } + if (insertionIndex > 0) { + assertTrue(String.valueOf(key), Intrinsics.numeric(additions[insertionIndex - 1]) < key); + } + } + }); + + System.out.println("pgmIndex.ramBytesAllocated() = " + pgmIndex.ramBytesAllocated() + " B"); + System.out.println("pgmIndex.ramBytesUsed() = " + pgmIndex.ramBytesUsed() + " B"); + System.out.println("builder.ramBytesAllocated() = " + builder.ramBytesAllocated() + 
" B"); + } + } +} diff --git a/versions.lock b/versions.lock index 2fc9fdf31..c39281e00 100644 --- a/versions.lock +++ b/versions.lock @@ -22,9 +22,9 @@ net.sf.jopt-simple:jopt-simple:4.6 (1 constraints: 610a91b7) org.apache.commons:commons-math3:3.2 (1 constraints: 5c0a8ab7) org.assertj:assertj-core:3.21.0 (1 constraints: 38053c3b) org.hamcrest:hamcrest-core:1.3 (1 constraints: cc05fe3f) -org.openjdk.jmh:jmh-core:1.24 (5 constraints: 3847a3da) -org.openjdk.jmh:jmh-generator-annprocess:1.24 (1 constraints: db04f730) -org.openjdk.jmh:jmh-generator-asm:1.24 (1 constraints: 28107098) -org.openjdk.jmh:jmh-generator-bytecode:1.24 (1 constraints: d804f430) -org.openjdk.jmh:jmh-generator-reflection:1.24 (2 constraints: 411e6e63) +org.openjdk.jmh:jmh-core:1.25 (5 constraints: 37454039) +org.openjdk.jmh:jmh-generator-annprocess:1.25 (1 constraints: dc04f830) +org.openjdk.jmh:jmh-generator-asm:1.25 (1 constraints: 29107198) +org.openjdk.jmh:jmh-generator-bytecode:1.25 (1 constraints: dc04f830) +org.openjdk.jmh:jmh-generator-reflection:1.25 (2 constraints: 431e9e63) org.ow2.asm:asm:5.0.3 (1 constraints: 490ea250) diff --git a/versions.props b/versions.props index 32c92206c..fc0beefb7 100644 --- a/versions.props +++ b/versions.props @@ -5,4 +5,4 @@ com.carrotsearch.console:*=1.0.6 com.carrotsearch.randomizedtesting:*=2.7.9 org.assertj:*=3.21.0 -org.openjdk.jmh:*=1.24 \ No newline at end of file +org.openjdk.jmh:*=1.25 \ No newline at end of file