[bugfix, enhancement] Address affinity bug by using threadpoolctl/joblib for n_jobs dispatching #2364


Draft: icfaust wants to merge 51 commits into main (base) from dev/njobs_fix
Commits (51)
c78859f
Update _n_jobs_support.py
icfaust Mar 17, 2025
e207110
Update test_run_to_run_stability.py
icfaust Mar 17, 2025
043b09d
Update test_n_jobs_support.py
icfaust Mar 17, 2025
66b0b6d
add changes
icfaust Mar 17, 2025
bc66055
add other changes
icfaust Mar 17, 2025
280c0e0
add an affinity test
icfaust Mar 17, 2025
eb0df7f
reduce lines
icfaust Mar 17, 2025
2403e6d
use pylance
icfaust Mar 17, 2025
009348b
further fixes
icfaust Mar 17, 2025
29c318f
better docs
icfaust Mar 17, 2025
ed726de
better docs
icfaust Mar 18, 2025
2b58453
mark
icfaust Mar 18, 2025
78d07bb
Update _n_jobs_support.py
icfaust Mar 18, 2025
8da0891
Update _n_jobs_support.py
icfaust Mar 18, 2025
79ced00
Update test_n_jobs_support.py
icfaust Mar 18, 2025
e021335
Update _n_jobs_support.py
icfaust Mar 18, 2025
30f822a
Update test_n_jobs_support.py
icfaust Mar 18, 2025
6d02aea
Update _n_jobs_support.py
icfaust Mar 18, 2025
84f91ac
Update test_n_jobs_support.py
icfaust Mar 18, 2025
a2a499a
Update _n_jobs_support.py
icfaust Mar 18, 2025
ac16042
Update _n_jobs_support.py
icfaust Mar 18, 2025
e6fdd80
Update incremental_linear.py
icfaust Mar 18, 2025
04075dc
Update incremental_ridge.py
icfaust Mar 18, 2025
dd798fa
Update test_n_jobs_support.py
icfaust Mar 18, 2025
70d613e
Update _n_jobs_support.py
icfaust Mar 19, 2025
1b121a5
Update test_run_to_run_stability.py
icfaust Mar 20, 2025
7bd1fcb
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust Mar 20, 2025
bbb2337
Update test_run_to_run_stability.py
icfaust Mar 21, 2025
a3ceaf3
Update requirements-test.txt
icfaust Mar 23, 2025
97a906f
Update requirements-test.txt
icfaust Mar 23, 2025
1948e7d
Update requirements-test.txt
icfaust Mar 23, 2025
62c7d9f
Update requirements-test.txt
icfaust Mar 23, 2025
ce79ace
Update requirements-test.txt
icfaust Mar 24, 2025
8765c0a
Merge branch 'main' into dev/njobs_fix
icfaust Mar 24, 2025
e2fa126
return values, and reduce test
icfaust Mar 24, 2025
1ca56cd
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust Apr 8, 2025
ab1c1eb
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust Apr 20, 2025
e9b5da5
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust May 1, 2025
f979da3
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust May 26, 2025
b56729e
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust May 31, 2025
2b2749c
Update data_conversion.cpp
icfaust Jun 13, 2025
3f0155b
Update table.cpp
icfaust Jun 13, 2025
385ad80
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust Jun 13, 2025
052bcdd
Update data_conversion.cpp
icfaust Jun 13, 2025
c638473
Update test_memory_usage.py
icfaust Jun 13, 2025
e00feb3
Update _n_jobs_support.py
icfaust Jun 17, 2025
627df75
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust Jun 18, 2025
3bf60b5
Merge branch 'uxlfoundation:main' into dev/njobs_fix
icfaust Jun 20, 2025
df38221
Update test_n_jobs_support.py
icfaust Jun 20, 2025
51ccef3
Update run_test.sh
icfaust Jun 22, 2025
17de438
Update ci.yml
icfaust Jun 23, 2025
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -433,6 +433,7 @@ jobs:
- name: Set Environment Variables
id: set-env
run: |
python -c "import os; print(len(os.sched_getaffinity(0)))"
echo "NO_DIST=1" >> "$GITHUB_ENV"
# enable coverage report generation
echo "COVERAGE_RCFILE=$(readlink -f .coveragerc)" >> "$GITHUB_ENV"
9 changes: 5 additions & 4 deletions conda-recipe/run_test.sh
@@ -60,18 +60,19 @@ function generate_pytest_args {
printf -- "${ARGS[*]}"
}

${PYTHON} -c "from daal4py import num_threads;print(f'threads={num_threads()}:0')"
${PYTHON} -c "from sklearnex import patch_sklearn; patch_sklearn()"
return_code=$(($return_code + $?))

${PYTHON} -c "from daal4py import num_threads;print(f'threads={num_threads()}:1')"
pytest --verbose -s "${sklex_root}/tests" $@ $(generate_pytest_args legacy)
return_code=$(($return_code + $?))

${PYTHON} -c "from daal4py import num_threads;print(f'threads={num_threads()}:2')"
pytest --verbose --pyargs daal4py $@ $(generate_pytest_args daal4py)
return_code=$(($return_code + $?))

${PYTHON} -c "from daal4py import num_threads;print(f'threads={num_threads()}:3')"
pytest --verbose --pyargs sklearnex $@ $(generate_pytest_args sklearnex)
return_code=$(($return_code + $?))

${PYTHON} -c "from daal4py import num_threads;print(f'threads={num_threads()}:4')"
pytest --verbose --pyargs onedal $@ $(generate_pytest_args onedal)
return_code=$(($return_code + $?))

108 changes: 47 additions & 61 deletions daal4py/sklearn/_n_jobs_support.py
@@ -19,53 +19,50 @@
import threading
from functools import wraps
from inspect import Parameter, signature
from multiprocessing import cpu_count
from numbers import Integral
from warnings import warn

import threadpoolctl
from joblib import cpu_count

from daal4py import daalinit as set_n_threads
from daal4py import num_threads as get_n_threads
from daal4py import _get__daal_link_version__, daalinit, num_threads

from ._utils import sklearn_check_version

if sklearn_check_version("1.2"):
from sklearn.utils._param_validation import validate_parameter_constraints
else:

def validate_parameter_constraints(n_jobs):
if n_jobs is not None and not isinstance(n_jobs, Integral):
raise TypeError(
f"n_jobs must be an instance of int, not {n_jobs.__class__.__name__}."
)


class oneDALLibController(threadpoolctl.LibController):
user_api = "onedal"
internal_api = "onedal"

filename_prefixes = ("libonedal_thread", "libonedal")

def get_num_threads(self):
return num_threads()

def set_num_threads(self, nthreads):
Review comment from david-cortes-intel (Contributor), Mar 24, 2025:

I understand this setting would apply globally, which could lead to race conditions if users call this in parallel, for example through some framework that would parallelize estimator calls.

Could it somehow get a mutex (or use atomic ops) either here or on the oneDAL side?

Also, it would be better to add a warning that the setting is changed at a global level, so that a user would not try to call these inside multi-threaded code.

Follow-up (Contributor):

Actually, on a further look, it does already have a mutex on the DAAL side. Still, it would be better to document that this behavior is global.

Reply from icfaust (author):

Sounds good, will do!

daalinit(nthreads)

def get_version(self):
return _get__daal_link_version__()


threadpoolctl.register(oneDALLibController)

# Note: getting controller in global scope of this module is required
# to avoid overheads by its initialization per each function call
threadpool_controller = threadpoolctl.ThreadpoolController()


def get_suggested_n_threads(n_cpus):
"""
Function to get `n_threads` limit
if `n_jobs` is set in upper parallelization context.
Usually, limit is equal to `n_logical_cpus` // `n_jobs`.
Returns None if limit is not set.
"""
n_threads_map = {
lib_ctl.internal_api: lib_ctl.get_num_threads()
for lib_ctl in threadpool_controller.lib_controllers
if lib_ctl.internal_api != "mkl"
}
# openBLAS is limited to 24, 64 or 128 threads by default
# depending on SW/HW configuration.
# thus, these numbers of threads from openBLAS are uninformative
if "openblas" in n_threads_map and n_threads_map["openblas"] in [24, 64, 128]:
del n_threads_map["openblas"]
# remove default values equal to n_cpus as uninformative
for backend in list(n_threads_map.keys()):
if n_threads_map[backend] == n_cpus:
del n_threads_map[backend]
if len(n_threads_map) > 0:
return min(n_threads_map.values())
else:
return None


def _run_with_n_jobs(method):
"""
Decorator for running of methods containing oneDAL kernels with 'n_jobs'.
@@ -79,59 +76,46 @@ def _run_with_n_jobs(method):
@wraps(method)
def n_jobs_wrapper(self, *args, **kwargs):
# threading parallel backend branch
if not isinstance(threading.current_thread(), threading._MainThread):
warn(
"'Threading' parallel backend is not supported by "
"Extension for Scikit-learn*. "
"Falling back to usage of all available threads."
)
result = method(self, *args, **kwargs)
return result
# multiprocess parallel backends branch
# preemptive validation of n_jobs parameter is required
# because '_run_with_n_jobs' decorator is applied on top of method
# where validation takes place
if sklearn_check_version("1.2") and hasattr(self, "_parameter_constraints"):
if sklearn_check_version("1.2"):
validate_parameter_constraints(
parameter_constraints={"n_jobs": self._parameter_constraints["n_jobs"]},
params={"n_jobs": self.n_jobs},
caller_name=self.__class__.__name__,
)
# search for specified n_jobs
n_jobs = self.n_jobs
n_cpus = cpu_count()
else:
validate_parameter_constraints(self.n_jobs)

# receive n_threads limitation from upper parallelism context
# using `threadpoolctl.ThreadpoolController`
n_threads = get_suggested_n_threads(n_cpus)
# get real `n_jobs` number of threads for oneDAL
# using sklearn rules and `n_threads` from upper parallelism context
if n_jobs is None or n_jobs == 0:
if n_threads is None:
# default branch with no setting for n_jobs
return method(self, *args, **kwargs)
else:
n_jobs = n_threads
elif n_jobs < 0:
if n_threads is None:
n_jobs = max(1, n_cpus + n_jobs + 1)
else:
n_jobs = max(1, n_threads + n_jobs + 1)
# branch with set n_jobs
old_n_threads = get_n_threads()
if n_jobs == old_n_threads:

if self.n_jobs:
n_jobs = (
self.n_jobs if self.n_jobs > 0 else max(1, cpu_count() + self.n_jobs + 1)
)
elif self.n_jobs == 0:
# This is a small variation on joblib's equivalent error
raise ValueError("n_jobs == 0 has no meaning")
else:
return method(self, *args, **kwargs)

try:
# n_jobs value is attempting to be set
if (old_n_threads := num_threads()) != n_jobs:
logger = logging.getLogger("sklearnex")
cl = self.__class__
logger.debug(
f"{cl.__module__}.{cl.__name__}.{method.__name__}: "
f"setting {n_jobs} threads (previous - {old_n_threads})"
)
set_n_threads(n_jobs)
with threadpool_controller.limit(limits=n_jobs, user_api="onedal"):
return method(self, *args, **kwargs)
else:
return method(self, *args, **kwargs)
finally:
set_n_threads(old_n_threads)

return n_jobs_wrapper

@@ -185,6 +169,8 @@ def class_wrapper(original_class):
):
parameter_constraints = original_class._parameter_constraints
if "n_jobs" not in parameter_constraints:
# n_jobs = 0 is not allowed, but it is handled elsewhere
# This definition matches scikit-learn
parameter_constraints["n_jobs"] = [Integral, None]

@wraps(original_init)
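For readers unfamiliar with threadpoolctl, here is a minimal usage sketch (not part of the diff; it assumes only the public threadpoolctl 3.x API plus the class and module names shown above) of what registering oneDALLibController buys: oneDAL's thread pool becomes visible to the generic controller, and the rewritten n_jobs_wrapper can scope an n_jobs value to a single method call via controller.limit, with the previous value restored on exit. As noted in the review thread above, the underlying setting remains process-global while the context manager is active.

import threadpoolctl

# Importing the module registers oneDALLibController with threadpoolctl.
import daal4py.sklearn._n_jobs_support  # noqa: F401

controller = threadpoolctl.ThreadpoolController()

# oneDAL now appears alongside the BLAS/OpenMP libraries in the inventory.
print([info for info in controller.info() if info["user_api"] == "onedal"])

# Temporarily cap oneDAL at 2 threads; the previous value is restored on exit.
with controller.limit(limits=2, user_api="onedal"):
    pass  # call oneDAL-backed estimator methods here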
4 changes: 4 additions & 0 deletions daal4py/sklearn/ensemble/AdaBoostClassifier.py
@@ -34,6 +34,10 @@

@control_n_jobs(decorated_methods=["fit", "predict"])
class AdaBoostClassifier(ClassifierMixin, BaseEstimator):

if sklearn_check_version("1.2"):
_parameter_constraints = {}

def __init__(
self,
split_criterion="gini",
4 changes: 4 additions & 0 deletions daal4py/sklearn/ensemble/GBTDAAL.py
@@ -34,6 +34,10 @@


class GBTDAALBase(BaseEstimator, d4p.mb.GBTDAALBaseModel):

if sklearn_check_version("1.2"):
_parameter_constraints = {}

def __init__(
self,
split_method="inexact",
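The empty _parameter_constraints dicts added here and in AdaBoostClassifier give the control_n_jobs class wrapper (see the class_wrapper hunk above) a place to inject the "n_jobs": [Integral, None] definition. A small illustrative sketch of how scikit-learn (>= 1.2) then validates the value; this calls sklearn's private helper directly and is not part of the diff:

from numbers import Integral

from sklearn.utils._param_validation import validate_parameter_constraints

# Constraint definition matching what control_n_jobs injects.
constraints = {"n_jobs": [Integral, None]}

# Accepted: any integer (including negatives) or None.
validate_parameter_constraints(constraints, {"n_jobs": -2}, caller_name="Example")

# Rejected with InvalidParameterError: non-integer values such as 1.5 or "2".
# validate_parameter_constraints(constraints, {"n_jobs": 1.5}, caller_name="Example")

Note that n_jobs == 0 satisfies the Integral constraint but is rejected separately inside n_jobs_wrapper with ValueError("n_jobs == 0 has no meaning"), matching the comment added to class_wrapper.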
7 changes: 4 additions & 3 deletions onedal/datatypes/numpy/data_conversion.cpp
@@ -269,9 +269,10 @@ dal::table convert_to_table(py::object inp_obj, py::object queue, bool recursed)
return res;
}

static void free_capsule(PyObject *cap) {
template <class T>
void free_capsule(PyObject *cap) {
// TODO: check safe cast
dal::base *stored_array = static_cast<dal::base *>(PyCapsule_GetPointer(cap, NULL));
dal::array<T> *stored_array = static_cast<dal::array<T> *>(PyCapsule_GetPointer(cap, NULL));
if (stored_array) {
delete stored_array;
}
@@ -304,7 +305,7 @@ static PyObject *convert_to_numpy_impl(
throw std::invalid_argument("Conversion to numpy array failed");

void *opaque_value = static_cast<void *>(new dal::array<T>(host_array));
PyObject *cap = PyCapsule_New(opaque_value, NULL, free_capsule);
PyObject *cap = PyCapsule_New(opaque_value, NULL, free_capsule<T>);
PyArray_SetBaseObject(reinterpret_cast<PyArrayObject *>(obj), cap);
return obj;
}
4 changes: 2 additions & 2 deletions onedal/datatypes/table.cpp
@@ -103,9 +103,9 @@ ONEDAL_PY_INIT_MODULE(table) {
return numpy::convert_to_table(obj, queue);
});

m.def("from_table", [](const dal::table& t) -> py::handle {
m.def("from_table", [](const dal::table& t) -> py::object {
auto* obj_ptr = numpy::convert_to_pyobject(t);
return obj_ptr;
return py::reinterpret_steal<py::object>(obj_ptr);
});
m.def("dlpack_memory_order", &dlpack::dlpack_memory_order);
py::enum_<DLDeviceType>(m, "DLDeviceType")
2 changes: 1 addition & 1 deletion sklearnex/linear_model/incremental_linear.py
@@ -144,7 +144,7 @@ class IncrementalLinearRegression(
_parameter_constraints: dict = {
"fit_intercept": ["boolean"],
"copy_X": ["boolean"],
"n_jobs": [Interval(numbers.Integral, -1, None, closed="left"), None],
"n_jobs": [numbers.Integral, None],
"batch_size": [Interval(numbers.Integral, 1, None, closed="left"), None],
}

2 changes: 1 addition & 1 deletion sklearnex/linear_model/incremental_ridge.py
@@ -113,7 +113,7 @@ class IncrementalRidge(MultiOutputMixin, RegressorMixin, oneDALEstimator, BaseEs
"fit_intercept": ["boolean"],
"alpha": [Interval(numbers.Real, 0, None, closed="left")],
"copy_X": ["boolean"],
"n_jobs": [Interval(numbers.Integral, -1, None, closed="left"), None],
Comment from icfaust (author):

Reviewers: let me know if I should remove n_jobs from this and the other estimator entirely in order to minimize maintenance cost (all other estimators get the kwarg from _n_jobs_support.py, as far as I can see).

"n_jobs": [numbers.Integral, None],
"batch_size": [Interval(numbers.Integral, 1, None, closed="left"), None],
}

2 changes: 1 addition & 1 deletion sklearnex/tests/test_memory_usage.py
@@ -122,7 +122,7 @@ def gen_functions(functions):

data_shapes = [
pytest.param((1000, 100), id="(1000, 100)"),
pytest.param((2000, 50), id="(2000, 50)"),
pytest.param((2000, 40), id="(2000, 40)"),
]

EXTRA_MEMORY_THRESHOLD = 0.15
42 changes: 33 additions & 9 deletions sklearnex/tests/test_n_jobs_support.py
@@ -16,19 +16,15 @@

import inspect
import logging
from multiprocessing import cpu_count
import os

import pytest
from joblib import cpu_count
from sklearn.datasets import make_classification
from sklearn.exceptions import NotFittedError
from threadpoolctl import threadpool_info

from sklearnex.tests.utils import (
PATCHED_MODELS,
SPECIAL_INSTANCES,
call_method,
gen_dataset,
gen_models_info,
)
from sklearnex.tests.utils import PATCHED_MODELS, SPECIAL_INSTANCES, call_method

_X, _Y = make_classification(n_samples=40, n_features=4, random_state=42)

@@ -49,7 +45,7 @@ def _check_n_jobs_entry_in_logs(records, function_name, n_jobs):
if f"{function_name}: setting {expected_n_jobs} threads" in rec:
return True
# False if n_jobs is set and not found in logs
return n_jobs is None
return n_jobs is None or expected_n_jobs == cpu_count()


@pytest.mark.parametrize("estimator", {**PATCHED_MODELS, **SPECIAL_INSTANCES}.keys())
@@ -106,3 +102,31 @@ def test_n_jobs_support(estimator, n_jobs, caplog):

messages = [msg.message for msg in caplog.records]
assert _check_n_jobs_entry_in_logs(messages, method_name, n_jobs)


@pytest.mark.skipif(
not hasattr(os, "sched_setaffinity") or len(os.sched_getaffinity(0)) < 4,
reason="python CPU affinity control unavailable or too few threads",
)
@pytest.mark.parametrize("estimator", {**PATCHED_MODELS, **SPECIAL_INSTANCES}.keys())
def test_n_jobs_affinity(estimator, caplog):
# verify that n_jobs 1) starts at default value of cpu_count
# 2) respects os.sched_setaffinity on supported machines
n_t = next(i for i in threadpool_info() if i["user_api"] == "onedal")["num_threads"]

# get affinity mask of calling process
mask = os.sched_getaffinity(0)
# by default, oneDAL should match the number of threads made available to the sklearnex pytest suite
# This is currently disabled due to thread setting occurring in test_run_to_run_stability
# assert len(mask) == n_t

try:
# use half of the available threads
newmask = set(list(mask)[: len(mask) // 2])
os.sched_setaffinity(0, newmask)
# -2 is used as this forces n_jobs to be based on cpu_count and must value match in test
test_n_jobs_support(estimator, -2, caplog)

finally:
# reset affinity mask no matter what
os.sched_setaffinity(0, mask)
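For context on the cpu_count swap in this file and in _n_jobs_support.py: joblib.cpu_count respects the process affinity mask on platforms that support os.sched_getaffinity, whereas multiprocessing.cpu_count reports all online CPUs regardless of affinity, which is the kind of mismatch the affinity bug in the PR title refers to. A small Linux-only sketch (illustrative, not part of the diff):

import multiprocessing
import os

from joblib import cpu_count

full_mask = os.sched_getaffinity(0)
try:
    # Restrict the current process to half of its allowed CPUs.
    os.sched_setaffinity(0, set(list(full_mask)[: max(1, len(full_mask) // 2)]))
    print(multiprocessing.cpu_count())  # still reports all online CPUs
    print(cpu_count())                  # reflects the reduced affinity mask
finally:
    os.sched_setaffinity(0, full_mask)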