[bugfix, enhancement] Address affinity bug by using threadpoolctl/joblib for n_jobs dispatching #2364

Draft. Wants to merge 51 commits into base: main from dev/njobs_fix.

Changes from 14 commits

Commits (51)
c78859f
Update _n_jobs_support.py
icfaust Mar 17, 2025
e207110
Update test_run_to_run_stability.py
icfaust Mar 17, 2025
043b09d
Update test_n_jobs_support.py
icfaust Mar 17, 2025
66b0b6d
add changes
icfaust Mar 17, 2025
bc66055
add other changes
icfaust Mar 17, 2025
280c0e0
add an affinity test
icfaust Mar 17, 2025
eb0df7f
reduce lines
icfaust Mar 17, 2025
2403e6d
use pylance
icfaust Mar 17, 2025
009348b
further fixes
icfaust Mar 17, 2025
29c318f
better docs
icfaust Mar 17, 2025
ed726de
better docs
icfaust Mar 18, 2025
2b58453
mark
icfaust Mar 18, 2025
78d07bb
Update _n_jobs_support.py
icfaust Mar 18, 2025
8da0891
Update _n_jobs_support.py
icfaust Mar 18, 2025
79ced00
Update test_n_jobs_support.py
icfaust Mar 18, 2025
e021335
Update _n_jobs_support.py
icfaust Mar 18, 2025
30f822a
Update test_n_jobs_support.py
icfaust Mar 18, 2025
6d02aea
Update _n_jobs_support.py
icfaust Mar 18, 2025
84f91ac
Update test_n_jobs_support.py
icfaust Mar 18, 2025
a2a499a
Update _n_jobs_support.py
icfaust Mar 18, 2025
ac16042
Update _n_jobs_support.py
icfaust Mar 18, 2025
e6fdd80
Update incremental_linear.py
icfaust Mar 18, 2025
04075dc
Update incremental_ridge.py
icfaust Mar 18, 2025
dd798fa
Update test_n_jobs_support.py
icfaust Mar 18, 2025
70d613e
Update _n_jobs_support.py
icfaust Mar 19, 2025
1b121a5
Update test_run_to_run_stability.py
icfaust Mar 20, 2025
7bd1fcb
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust Mar 20, 2025
bbb2337
Update test_run_to_run_stability.py
icfaust Mar 21, 2025
a3ceaf3
Update requirements-test.txt
icfaust Mar 23, 2025
97a906f
Update requirements-test.txt
icfaust Mar 23, 2025
1948e7d
Update requirements-test.txt
icfaust Mar 23, 2025
62c7d9f
Update requirements-test.txt
icfaust Mar 23, 2025
ce79ace
Update requirements-test.txt
icfaust Mar 24, 2025
8765c0a
Merge branch 'main' into dev/njobs_fix
icfaust Mar 24, 2025
e2fa126
return values, and reduce test
icfaust Mar 24, 2025
1ca56cd
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust Apr 8, 2025
ab1c1eb
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust Apr 20, 2025
e9b5da5
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust May 1, 2025
f979da3
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust May 26, 2025
b56729e
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust May 31, 2025
2b2749c
Update data_conversion.cpp
icfaust Jun 13, 2025
3f0155b
Update table.cpp
icfaust Jun 13, 2025
385ad80
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust Jun 13, 2025
052bcdd
Update data_conversion.cpp
icfaust Jun 13, 2025
c638473
Update test_memory_usage.py
icfaust Jun 13, 2025
e00feb3
Update _n_jobs_support.py
icfaust Jun 17, 2025
627df75
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust Jun 18, 2025
3bf60b5
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust Jun 20, 2025
df38221
Update test_n_jobs_support.py
icfaust Jun 20, 2025
51ccef3
Update run_test.sh
icfaust Jun 22, 2025
17de438
Update ci.yml
icfaust Jun 23, 2025
104 changes: 42 additions & 62 deletions daal4py/sklearn/_n_jobs_support.py
@@ -19,53 +19,50 @@
import threading
from functools import wraps
from inspect import Parameter, signature
from multiprocessing import cpu_count
from numbers import Integral
from warnings import warn

import threadpoolctl
from joblib import cpu_count

from daal4py import daalinit as set_n_threads
from daal4py import num_threads as get_n_threads
from daal4py import _get__daal_link_version__, daalinit, num_threads

from ._utils import sklearn_check_version

if sklearn_check_version("1.2"):
from sklearn.utils._param_validation import validate_parameter_constraints
else:

def validate_parameter_constraints(n_jobs):
if n_jobs is not None and n_jobs.__class__ != int:
raise TypeError(
f"n_jobs must be an instance of int, not {n_jobs.__class__.__name__}."
)


class oneDALLibController(threadpoolctl.LibController):
user_api = "oneDAL"
internal_api = "oneDAL"

filename_prefixes = ("libonedal_thread", "libonedal")

def get_num_threads(self):
return num_threads()

def set_num_threads(self, nthreads):
david-cortes-intel (Contributor) commented on Mar 24, 2025:

I understand this setting would apply globally, which could lead to race conditions if users call this in parallel, for example through some framework that would parallelize estimator calls.

Could it somehow get a mutex (or use atomic ops) either here or on the oneDAL side?

Also, it would be better to add a warning that the setting is changed at a global level, so that users do not try to call these inside multi-threaded code.

Contributor:

Actually on a further look, it does already have a mutex on the daal side. Still better to document this behavior being global.

icfaust (Contributor, Author):

Sounds good, will do!

daalinit(nthreads)

def get_version(self):
return _get__daal_link_version__


threadpoolctl.register(oneDALLibController)

# Note: the controller is created at module scope to avoid
# the overhead of re-initializing it on every function call
threadpool_controller = threadpoolctl.ThreadpoolController()


def get_suggested_n_threads(n_cpus):
"""
Function to get `n_threads` limit
if `n_jobs` is set in upper parallelization context.
Usually, limit is equal to `n_logical_cpus` // `n_jobs`.
Returns None if limit is not set.
"""
n_threads_map = {
lib_ctl.internal_api: lib_ctl.get_num_threads()
for lib_ctl in threadpool_controller.lib_controllers
if lib_ctl.internal_api != "mkl"
}
# openBLAS is limited to 24, 64 or 128 threads by default
# depending on SW/HW configuration.
# thus, these numbers of threads from openBLAS are uninformative
if "openblas" in n_threads_map and n_threads_map["openblas"] in [24, 64, 128]:
del n_threads_map["openblas"]
# remove default values equal to n_cpus as uninformative
for backend in list(n_threads_map.keys()):
if n_threads_map[backend] == n_cpus:
del n_threads_map[backend]
if len(n_threads_map) > 0:
return min(n_threads_map.values())
else:
return None


def _run_with_n_jobs(method):
"""
Decorator for running methods containing oneDAL kernels with 'n_jobs'.
@@ -79,59 +76,42 @@ def _run_with_n_jobs(method):
@wraps(method)
def n_jobs_wrapper(self, *args, **kwargs):
# threading parallel backend branch
if not isinstance(threading.current_thread(), threading._MainThread):
warn(
"'Threading' parallel backend is not supported by "
"Intel(R) Extension for Scikit-learn*. "
"Falling back to usage of all available threads."
)
result = method(self, *args, **kwargs)
return result
# multiprocess parallel backends branch
# preemptive validation of n_jobs parameter is required
# because '_run_with_n_jobs' decorator is applied on top of method
# where validation takes place
if sklearn_check_version("1.2") and hasattr(self, "_parameter_constraints"):
if sklearn_check_version("1.2"):
validate_parameter_constraints(
parameter_constraints={"n_jobs": self._parameter_constraints["n_jobs"]},
params={"n_jobs": self.n_jobs},
caller_name=self.__class__.__name__,
)
# search for specified n_jobs
n_jobs = self.n_jobs
n_cpus = cpu_count()
else:
validate_parameter_constraints(self.n_jobs)

# receive n_threads limitation from upper parallelism context
# using `threadpoolctl.ThreadpoolController`
n_threads = get_suggested_n_threads(n_cpus)
# get real `n_jobs` number of threads for oneDAL
# using sklearn rules and `n_threads` from upper parallelism context
if n_jobs is None or n_jobs == 0:
if n_threads is None:
# default branch with no setting for n_jobs
return method(self, *args, **kwargs)
else:
n_jobs = n_threads
elif n_jobs < 0:
if n_threads is None:
n_jobs = max(1, n_cpus + n_jobs + 1)
else:
n_jobs = max(1, n_threads + n_jobs + 1)
# branch with set n_jobs
old_n_threads = get_n_threads()
if n_jobs == old_n_threads:
return method(self, *args, **kwargs)

try:
if not self.n_jobs:
n_jobs = cpu_count()
Contributor:

Would this later on get limited to the number of physical cores from oneDAL side?

icfaust (Contributor, Author):

I'll be honest, I'm not 100% sure yet. The default in threading.h in DAAL will set it to the number of CPUs, but with affinity involved I didn't fully track down the default setting there.

Contributor:

Maybe @Alexsandruss could comment here on whether it'd end up limited to number of physical cores somewhere else?

Contributor:

Looks like setting the number of threads like this would not result in that number later getting limited to the number of physical cores. How about passing the argument only_physical_cores=True here?

david-cortes-intel (Contributor) commented on Mar 24, 2025:

Tried adding a line to print this value here: https://github.com/uxlfoundation/oneDAL/blob/31cafec9950f1db352b639dafad5875971ca00fe/cpp/daal/src/threading/threading.cpp#L267
... and from what I see, it is indeed set to the result of cpu_count(only_physical_cores=False).

Contributor:

Although from some further testing, this behavior also appears to be the same in the current main branch.

else:
n_jobs = (
self.n_jobs if self.n_jobs > 0 else max(1, cpu_count() + self.n_jobs + 1)
)

if (old_n_threads := num_threads()) != n_jobs:
logger = logging.getLogger("sklearnex")
cl = self.__class__
logger.debug(
f"{cl.__module__}.{cl.__name__}.{method.__name__}: "
f"setting {n_jobs} threads (previous - {old_n_threads})"
)
set_n_threads(n_jobs)
with threadpool_controller.limit(limits=n_jobs, user_api="oneDAL"):
return method(self, *args, **kwargs)
else:
return method(self, *args, **kwargs)
finally:
set_n_threads(old_n_threads)

return n_jobs_wrapper
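As context for the rewritten wrapper above, here is a minimal, hypothetical sketch (not part of this diff) of the threadpoolctl mechanism the decorator now relies on. It assumes sklearnex/daal4py has been imported so that the oneDALLibController registration shown earlier has run; everything else is illustrative.

import threadpoolctl

import sklearnex  # noqa: F401  (assumption: importing sklearnex loads libonedal and registers the controller)

controller = threadpoolctl.ThreadpoolController()

# Report the current oneDAL thread count as seen through the registered controller.
print([i["num_threads"] for i in controller.info() if i["user_api"] == "oneDAL"])

# Temporarily cap oneDAL to 2 threads; the previous value is restored when the
# with-block exits, which is what the decorator uses instead of calling
# daalinit/set_n_threads and restoring manually in a finally clause.
with controller.limit(limits=2, user_api="oneDAL"):
    pass  # an estimator method containing oneDAL kernels would run here

# After the block, the oneDAL thread count is back to its previous value.
print([i["num_threads"] for i in controller.info() if i["user_api"] == "oneDAL"])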

4 changes: 4 additions & 0 deletions daal4py/sklearn/ensemble/AdaBoostClassifier.py
@@ -38,6 +38,10 @@

@control_n_jobs(decorated_methods=["fit", "predict"])
class AdaBoostClassifier(ClassifierMixin, BaseEstimator):

if sklearn_check_version("1.2"):
_parameter_constraints = {}

def __init__(
self,
split_criterion="gini",
4 changes: 4 additions & 0 deletions daal4py/sklearn/ensemble/GBTDAAL.py
@@ -38,6 +38,10 @@


class GBTDAALBase(BaseEstimator, d4p.mb.GBTDAALBaseModel):

if sklearn_check_version("1.2"):
_parameter_constraints = {}

def __init__(
self,
split_method="inexact",
38 changes: 30 additions & 8 deletions sklearnex/tests/test_n_jobs_support.py
@@ -16,19 +16,15 @@

import inspect
import logging
from multiprocessing import cpu_count
import os

import pytest
from joblib import cpu_count
from sklearn.datasets import make_classification
from sklearn.exceptions import NotFittedError
from threadpoolctl import threadpool_info

from sklearnex.tests.utils import (
PATCHED_MODELS,
SPECIAL_INSTANCES,
call_method,
gen_dataset,
gen_models_info,
)
from sklearnex.tests.utils import PATCHED_MODELS, SPECIAL_INSTANCES, call_method

_X, _Y = make_classification(n_samples=40, n_features=4, random_state=42)

@@ -106,3 +102,29 @@ def test_n_jobs_support(estimator, n_jobs, caplog):

messages = [msg.message for msg in caplog.records]
assert _check_n_jobs_entry_in_logs(messages, method_name, n_jobs)


@pytest.mark.skipif(
not hasattr(os, "sched_setaffinity") or len(os.sched_getaffinity(0)) < 2,
reason="python CPU affinity control unavailable or too few threads",
)
@pytest.mark.parametrize("estimator", {**PATCHED_MODELS, **SPECIAL_INSTANCES}.keys())
def test_n_jobs_affinity(estimator, caplog):
# verify that n_jobs 1) starts at default value of cpu_count
# 2) respects os.sched_setaffinity on supported machines
n_t = next(i for i in threadpool_info() if i["user_api"] == "oneDAL")["num_threads"]

# get affinity mask of calling process
mask = os.sched_getaffinity(0)
# by default, oneDAL should match the number of threads made available to the sklearnex pytest suite
assert len(mask) == n_t

try:
# use half of the available threads
newmask = set(list(mask)[: len(mask) // 2])
os.sched_setaffinity(0, newmask)
test_n_jobs_support(estimator, None, caplog)

finally:
# reset affinity mask no matter what
os.sched_setaffinity(0, mask)
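For readers unfamiliar with the motivation behind the joblib import swap above, the following is a hedged illustration (not part of the test file) of the difference the test relies on: joblib.cpu_count() honors the process affinity mask and can count only physical cores, while multiprocessing.cpu_count() does not. The parameter name below is joblib's; the rest is illustrative.

import os
from multiprocessing import cpu_count as mp_cpu_count

from joblib import cpu_count as joblib_cpu_count

if hasattr(os, "sched_getaffinity"):
    print("affinity mask size:", len(os.sched_getaffinity(0)))
    # multiprocessing reports every logical CPU on the machine, ignoring affinity
    print("multiprocessing.cpu_count():", mp_cpu_count())
    # joblib reports only the CPUs this process may actually use, so it shrinks
    # after os.sched_setaffinity() restricts the mask (as in the test above)
    print("joblib.cpu_count():", joblib_cpu_count())
    # the reviewer's suggestion from the discussion above: count physical cores only
    print("joblib.cpu_count(only_physical_cores=True):", joblib_cpu_count(only_physical_cores=True))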
3 changes: 0 additions & 3 deletions sklearnex/tests/test_run_to_run_stability.py
@@ -55,9 +55,6 @@
sklearn_clone_dict,
)

# to reproduce errors even in CI
d4p.daalinit(nthreads=100)
icfaust (Contributor, Author):

Removing this causes all sorts of memory-leak-check test failures, not just on Windows and not just with pandas.


_dataset_dict = {
"classification": [
partial(load_iris, return_X_y=True),
Expand Down