From ab448588542701b3d305da10e94a6069c88ce897 Mon Sep 17 00:00:00 2001
From: tbordaz <tbordaz@redhat.com>
Date: Wed, 22 May 2024 11:29:05 +0200
Subject: [PATCH] Issue 6172 - RFE: improve the performance of evaluation of
 filter component when tested against a large valueset (like group members)
 (#6173)

Bug description:
	Before returning an entry (to a SRCH) the server checks that the entry matches the SRCH filter.
	If a filter component (equality) is testing the value (ava) against a
	large valueset (like uniquemember values), it takes a long time because
	of the large number of values and required normalization of the values.
	This can be improved taking benefit of sorted valueset. Those sorted
	valueset were created to improve updates of large valueset (groups) but
	at that time not implemented in SRCH path.

Fix description:
	In case of LDAP_FILTER_EQUALITY component, the server can get
	benefit of the sorted valuearray.
	To limit the risk of regression, we use the sorted valuearray
	only for the DN syntax attribute. Indeed the sorted valuearray was
	designed for those type of attribute.
	With those two limitations, there is no need of a toggle and
	the call to plugin_call_syntax_filter_ava can be replaced by
	a call to slapi_valueset_find.
	In both cases, sorted valueset and plugin_call_syntax_filter_ava, ava and
	values are normalized.
	In sorted valueset, the values have been normalized to insert the index
	in the sorted array and then comparison is done on normalized values.
	In plugin_call_syntax_filter_ava, all values in valuearray (of valueset) are normalized
	before comparison.

relates: #6172

Reviewed by: Pierre Rogier, Simon Pichugin (Big Thanks !!!)
---
 .../tests/suites/filter/filter_test.py        | 125 ++++++++++++++++++
 ldap/servers/slapd/filterentry.c              |  22 ++-
 2 files changed, 146 insertions(+), 1 deletion(-)

diff --git a/dirsrvtests/tests/suites/filter/filter_test.py b/dirsrvtests/tests/suites/filter/filter_test.py
index d6bfa5a3bc..4baaf04a75 100644
--- a/dirsrvtests/tests/suites/filter/filter_test.py
+++ b/dirsrvtests/tests/suites/filter/filter_test.py
@@ -9,7 +9,11 @@
 import logging
 
 import pytest
+import time
+from lib389.dirsrv_log import DirsrvAccessLog
 from lib389.tasks import *
+from lib389.backend import Backends, Backend
+from lib389.dbgen import dbgen_users, dbgen_groups
 from lib389.topologies import topology_st
 from lib389._constants import PASSWORD, DEFAULT_SUFFIX, DN_DM, SUFFIX
 from lib389.utils import *
@@ -304,6 +308,127 @@ def test_extended_search(topology_st):
     ents = topology_st.standalone.search_s(SUFFIX, ldap.SCOPE_SUBTREE, myfilter)
     assert len(ents) == 1
 
+def test_match_large_valueset(topology_st):
+    """Test that when returning a big number of entries
+    and that we need to match the filter from a large valueset
+    we get benefit to use the sorted valueset
+
+    :id: 7db5aa88-50e0-4c31-85dd-1d2072cb674c
+
+    :setup: Standalone instance
+
+    :steps:
+         1. Create a users and groups backends and tune them
+         2. Generate a test ldif (2k users and 1K groups with all users)
+         3. Import test ldif file using Offline import (ldif2db).
+         4. Prim the 'groups' entrycache with a "fast" search
+         5. Search the 'groups' with a difficult matching value
+         6. check that etime from step 5 is less than a second
+
+    :expectedresults:
+         1. Create a users and groups backends should PASS
+         2. Generate LDIF should PASS.
+         3. Offline import should PASS.
+         4. Priming should PASS.
+         5. Performance search should PASS.
+         6. Etime of performance search should PASS.
+    """
+
+    log.info('Running test_match_large_valueset...')
+    #
+    # Test online/offline LDIF imports
+    #
+    inst = topology_st.standalone
+    inst.start()
+    backends = Backends(inst)
+    users_suffix = "ou=users,%s" % DEFAULT_SUFFIX
+    users_backend = 'users'
+    users_ldif = 'users_import.ldif'
+    groups_suffix = "ou=groups,%s" % DEFAULT_SUFFIX
+    groups_backend = 'groups'
+    groups_ldif = 'groups_import.ldif'
+    groups_entrycache = '200000000'
+    users_number = 2000
+    groups_number = 1000
+
+
+    # For priming the cache we just want to be fast
+    # taking the first value in the valueset is good
+    # whether the valueset is sorted or not
+    priming_user_rdn = "user0001"
+
+    # For performance testing, this is important to use
+    # user1000 rather then user0001
+    # Because user0001 is the first value in the valueset
+    # whether we use the sorted valuearray or non sorted
+    # valuearray the performance will be similar.
+    # With middle value user1000, the performance boost of
+    # the sorted valuearray will make the difference.
+    perf_user_rdn = "user1000"
+
+    # Step 1. Prepare the backends and tune the groups entrycache
+    try:
+        be_users = backends.create(properties={'parent': DEFAULT_SUFFIX, 'nsslapd-suffix': users_suffix, 'name': users_backend})
+        be_groups = backends.create(properties={'parent': DEFAULT_SUFFIX, 'nsslapd-suffix': groups_suffix, 'name': groups_backend})
+
+        # set the entry cache to 200Mb as the 1K groups of 2K users require at least 170Mb
+        be_groups.replace('nsslapd-cachememsize', groups_entrycache)
+    except:
+        raise
+
+    # Step 2. Generate a test ldif (10k users entries)
+    log.info("Generating users LDIF...")
+    ldif_dir = inst.get_ldif_dir()
+    users_import_ldif = "%s/%s" % (ldif_dir, users_ldif)
+    groups_import_ldif = "%s/%s" % (ldif_dir, groups_ldif)
+    dbgen_users(inst, users_number, users_import_ldif, suffix=users_suffix, generic=True, parent=users_suffix)
+
+    # Generate a test ldif (800 groups with 10k members) that fit in 700Mb entry cache
+    props = {
+        "name": "group",
+        "suffix": groups_suffix,
+        "parent": groups_suffix,
+        "number": groups_number,
+        "numMembers": users_number,
+        "createMembers": False,
+        "memberParent": users_suffix,
+        "membershipAttr": "uniquemember",
+    }
+    dbgen_groups(inst, groups_import_ldif, props)
+
+    # Step 3. Do the both offline imports
+    inst.stop()
+    if not inst.ldif2db(users_backend, None, None, None, users_import_ldif):
+        log.fatal('test_basic_import_export: Offline users import failed')
+        assert False
+    if not inst.ldif2db(groups_backend, None, None, None, groups_import_ldif):
+        log.fatal('test_basic_import_export: Offline groups import failed')
+        assert False
+    inst.start()
+
+    # Step 4. first prime the cache
+    # Just request the 'DN'. We are interested by the time of matching not by the time of transfert
+    entries = topology_st.standalone.search_s(groups_suffix, ldap.SCOPE_SUBTREE, "(&(objectclass=groupOfUniqueNames)(uniquemember=uid=%s,%s))" % (priming_user_rdn, users_suffix), ['dn'])
+    assert len(entries) == groups_number
+
+    # Step 5. Now do the real performance checking it should take less than a second
+    # Just request the 'DN'. We are interested by the time of matching not by the time of transfert
+    search_start = time.time()
+    entries = topology_st.standalone.search_s(groups_suffix, ldap.SCOPE_SUBTREE, "(&(objectclass=groupOfUniqueNames)(uniquemember=uid=%s,%s))" % (perf_user_rdn, users_suffix), ['dn'])
+    duration = time.time() - search_start
+    log.info("Duration of the search was %f", duration)
+
+    # Step 6. Gather the etime from the access log
+    inst.stop()
+    access_log = DirsrvAccessLog(inst)
+    search_result = access_log.match(".*RESULT err=0 tag=101 nentries=%s.*" % groups_number)
+    log.info("Found patterns are %s", search_result[0])
+    log.info("Found patterns are %s", search_result[1])
+    etime = float(search_result[1].split('etime=')[1])
+    log.info("Duration of the search from access log was %f", etime)
+    assert len(entries) == groups_number
+    assert (etime < 1)
+
 if __name__ == '__main__':
     # Run isolated
     # -s for DEBUG mode
diff --git a/ldap/servers/slapd/filterentry.c b/ldap/servers/slapd/filterentry.c
index 2a7102828a..4de4aa66ee 100644
--- a/ldap/servers/slapd/filterentry.c
+++ b/ldap/servers/slapd/filterentry.c
@@ -296,7 +296,27 @@ test_ava_filter(
         rc = -1;
         for (; a != NULL; a = a->a_next) {
             if (slapi_attr_type_cmp(ava->ava_type, a->a_type, SLAPI_TYPE_CMP_SUBTYPE) == 0) {
-                rc = plugin_call_syntax_filter_ava(a, ftype, ava);
+                if ((ftype == LDAP_FILTER_EQUALITY) &&
+                    (slapi_attr_is_dn_syntax_type(a->a_type))) {
+                    /* This path is for a performance improvement */
+
+                    /* In case of equality filter we can get benefit of the
+                     * sorted valuearray (from valueset).
+                     * This improvement is limited to DN syntax attributes for
+                     * which the sorted valueset was designed.
+                     */
+                    Slapi_Value *sval = NULL;
+                    sval = slapi_value_new_berval(&ava->ava_value);
+                    if (slapi_valueset_find((const Slapi_Attr *)a, &a->a_present_values, sval)) {
+                        rc = 0;
+                    }
+                    slapi_value_free(&sval);
+                } else {
+                    /* When sorted valuearray optimization cannot be used
+                     * lets filter the value according to its syntax
+                     */
+                    rc = plugin_call_syntax_filter_ava(a, ftype, ava);
+                }
                 if (rc == 0) {
                     break;
                 }