Skip to content

Commit 1c93c16

Browse files
authored
Merge pull request RustPython#5164 from dchiquito/frozenset-hash
Use CPython hash algorithm for frozenset
2 parents ec9f2ce + 074d228 commit 1c93c16

File tree

7 files changed

+42
-28
lines changed

7 files changed

+42
-28
lines changed

Lib/test/test_collections.py

-2
Original file line numberDiff line numberDiff line change
@@ -1832,8 +1832,6 @@ def __repr__(self):
18321832
self.assertTrue(f1 != l1)
18331833
self.assertTrue(f1 != l2)
18341834

1835-
# TODO: RUSTPYTHON
1836-
@unittest.expectedFailure
18371835
def test_Set_hash_matches_frozenset(self):
18381836
sets = [
18391837
{}, {1}, {None}, {-1}, {0.0}, {"abc"}, {1, 2, 3},

Lib/test/test_set.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -707,8 +707,6 @@ def test_hash_caching(self):
707707
f = self.thetype('abcdcda')
708708
self.assertEqual(hash(f), hash(f))
709709

710-
# TODO: RUSTPYTHON
711-
@unittest.expectedFailure
712710
def test_hash_effectiveness(self):
713711
n = 13
714712
hashvalues = set()
@@ -730,7 +728,15 @@ def powerset(s):
730728
for i in range(len(s)+1):
731729
yield from map(frozenset, itertools.combinations(s, i))
732730

733-
for n in range(18):
731+
# TODO: RUSTPYTHON
732+
# The original test has:
733+
# for n in range(18):
734+
# Due to general performance overhead, hashing a frozenset takes
735+
# about 50 times longer than in CPython. This test amplifies that
736+
# exponentially, so the best we can do here reasonably is 13.
737+
# Even if the internal hash function did nothing, it would still be
738+
# about 40 times slower than CPython.
739+
for n in range(13):
734740
t = 2 ** n
735741
mask = t - 1
736742
for nums in (range, zf_range):

benches/microbenchmarks/frozenset.py

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
fs = frozenset(range(0, ITERATIONS))
2+
3+
# ---
4+
5+
hash(fs)

common/src/hash.rs

-14
Original file line numberDiff line numberDiff line change
@@ -130,20 +130,6 @@ pub fn hash_float(value: f64) -> Option<PyHash> {
130130
Some(fix_sentinel(x as PyHash * value.signum() as PyHash))
131131
}
132132

133-
pub fn hash_iter_unordered<'a, T: 'a, I, F, E>(iter: I, hashf: F) -> Result<PyHash, E>
134-
where
135-
I: IntoIterator<Item = &'a T>,
136-
F: Fn(&'a T) -> Result<PyHash, E>,
137-
{
138-
let mut hash: PyHash = 0;
139-
for element in iter {
140-
let item_hash = hashf(element)?;
141-
// xor is commutative and hash should be independent of order
142-
hash ^= item_hash;
143-
}
144-
Ok(fix_sentinel(mod_int(hash)))
145-
}
146-
147133
pub fn hash_bigint(value: &BigInt) -> PyHash {
148134
let ret = match value.to_i64() {
149135
Some(i) => mod_int(i),

vm/src/builtins/set.rs

+22-1
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,28 @@ impl PySetInner {
434434
}
435435

436436
fn hash(&self, vm: &VirtualMachine) -> PyResult<PyHash> {
437-
crate::utils::hash_iter_unordered(self.elements().iter(), vm)
437+
// Work to increase the bit dispersion for closely spaced hash values.
438+
// This is important because some use cases have many combinations of a
439+
// small number of elements with nearby hashes so that many distinct
440+
// combinations collapse to only a handful of distinct hash values.
441+
fn _shuffle_bits(h: u64) -> u64 {
442+
((h ^ 89869747) ^ (h.wrapping_shl(16))).wrapping_mul(3644798167)
443+
}
444+
// Factor in the number of active entries
445+
let mut hash: u64 = (self.elements().len() as u64 + 1).wrapping_mul(1927868237);
446+
// Xor-in shuffled bits from every entry's hash field because xor is
447+
// commutative and a frozenset hash should be independent of order.
448+
for element in self.elements().iter() {
449+
hash ^= _shuffle_bits(element.hash(vm)? as u64);
450+
}
451+
// Disperse patterns arising in nested frozensets
452+
hash ^= (hash >> 11) ^ (hash >> 25);
453+
hash = hash.wrapping_mul(69069).wrapping_add(907133923);
454+
// -1 is reserved as an error code
455+
if hash == u64::MAX {
456+
hash = 590923713;
457+
}
458+
Ok(hash as PyHash)
438459
}
439460

440461
// Run operation, on failure, if item is a set/set subclass, convert it

vm/src/types/slot.rs

+6-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use crate::{
1515
AsObject, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine,
1616
};
1717
use crossbeam_utils::atomic::AtomicCell;
18+
use malachite_bigint::BigInt;
1819
use num_traits::{Signed, ToPrimitive};
1920
use std::{borrow::Borrow, cmp::Ordering, ops::Deref};
2021

@@ -254,7 +255,11 @@ fn hash_wrapper(zelf: &PyObject, vm: &VirtualMachine) -> PyResult<PyHash> {
254255
let py_int = hash_obj
255256
.payload_if_subclass::<PyInt>(vm)
256257
.ok_or_else(|| vm.new_type_error("__hash__ method should return an integer".to_owned()))?;
257-
Ok(rustpython_common::hash::hash_bigint(py_int.as_bigint()))
258+
let big_int = py_int.as_bigint();
259+
let hash: PyHash = big_int
260+
.to_i64()
261+
.unwrap_or_else(|| (big_int % BigInt::from(u64::MAX)).to_i64().unwrap());
262+
Ok(hash)
258263
}
259264

260265
/// Marks a type as unhashable. Similar to PyObject_HashNotImplemented in CPython

vm/src/utils.rs

-7
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,6 @@ pub fn hash_iter<'a, I: IntoIterator<Item = &'a PyObjectRef>>(
1111
vm.state.hash_secret.hash_iter(iter, |obj| obj.hash(vm))
1212
}
1313

14-
pub fn hash_iter_unordered<'a, I: IntoIterator<Item = &'a PyObjectRef>>(
15-
iter: I,
16-
vm: &VirtualMachine,
17-
) -> PyResult<rustpython_common::hash::PyHash> {
18-
rustpython_common::hash::hash_iter_unordered(iter, |obj| obj.hash(vm))
19-
}
20-
2114
impl ToPyObject for std::convert::Infallible {
2215
fn to_pyobject(self, _vm: &VirtualMachine) -> PyObjectRef {
2316
match self {}

0 commit comments

Comments
 (0)