In [None]:
import pandas as pd

# Feature specification, one entry per derived feature:
#   key   -> name of the new column
#   value -> (source column name,
#             aggregation function,
#             list of neighbour counts over which the aggregation is computed)
feature_info = {
    'new_col_name_1': (
        'original_col_name_1',
        lambda vals: vals.sum(),
        [10, 20, 100],
    ),
    'new_col_name_2': (
        'original_col_name_1',
        lambda vals: vals.mean(),
        [11, 21, 101],
    ),
    'new_col_name_3': (
        'original_col_name_2',
        lambda vals: vals.min() % 3,
        [50, 80, 100],
    ),
}

# Tabular view of the specification: transpose so every feature is a row,
# then explode the neighbour-count lists into one row per (feature, k) pair.
(
    pd.DataFrame(feature_info, index=['col_name', 'func', 'k'])
    .T
    .explode('k')
    .reset_index(names='new_col')
)

Unnamed: 0,new_col,col_name,func,k
0,new_col_name_1,original_col_name_1,<function <lambda> at 0x0000018EB9671550>,10
1,new_col_name_1,original_col_name_1,<function <lambda> at 0x0000018EB9671550>,20
2,new_col_name_1,original_col_name_1,<function <lambda> at 0x0000018EB9671550>,100
3,new_col_name_2,original_col_name_1,<function <lambda> at 0x0000018EB9684790>,11
4,new_col_name_2,original_col_name_1,<function <lambda> at 0x0000018EB9684790>,21
5,new_col_name_2,original_col_name_1,<function <lambda> at 0x0000018EB9684790>,101
6,new_col_name_3,original_col_name_2,<function <lambda> at 0x0000018EB9684700>,50
7,new_col_name_3,original_col_name_2,<function <lambda> at 0x0000018EB9684700>,80
8,new_col_name_3,original_col_name_2,<function <lambda> at 0x0000018EB9684700>,100


In [None]:
# Uncomment to install the approximate-nearest-neighbour backends into the kernel's environment:
# %pip install hnswlib
# %pip install pynndescent

In [None]:
import numpy as np
import pandas as pd

import functools as F

import hnswlib
import pynndescent

class KNNFeatureAggregator:
    """Aggregate training-set columns over each point's approximate nearest neighbours.

    Wraps one of two ANN backends (``hnswlib`` or ``pynndescent``) behind a
    common ``train`` / ``kneighbors`` / ``make_features`` interface.

    Attributes:
        supported_indexes: backend names accepted by the constructor.
        name: the selected backend name.
        tr_data: the DataFrame passed to the last successful ``train`` call
            (``None`` until then).
        idx: the backend index object; for ``pynndescent`` it is a factory
            until ``train`` replaces it with the built index.
    """

    supported_indexes = ('hnsw', 'pynndescent')

    def __init__(self, name: str, params: dict):
        """Select a backend and remember its constructor parameters.

        Args:
            name: one of ``supported_indexes``.
            params: keyword arguments for ``hnswlib.Index`` or
                ``pynndescent.NNDescent`` respectively.

        Raises:
            NotImplementedError: if ``name`` is not in ``supported_indexes``.
        """
        if name not in KNNFeatureAggregator.supported_indexes:
            raise NotImplementedError(f"index {name} doesn't support")

        self.name = name
        self.tr_data = None
        # Private copy so that `train` can rebuild the backend on every call.
        self._params = dict(params)

        if self.name == 'hnsw':
            self.idx = hnswlib.Index(**self._params)
        elif self.name == 'pynndescent':
            # NNDescent needs the data at construction time, so only a factory
            # exists until `train`. It is kept separately from `self.idx` so a
            # later `train` call can build a new index from it again.
            self._index_factory = F.partial(pynndescent.NNDescent, **self._params)
            self.idx = self._index_factory

    def train(self, tr_data: pd.DataFrame, tr_params: dict):
        """Build the index over ``tr_data`` and remember the frame for ``make_features``.

        Rows are indexed by their position (0 .. len(tr_data) - 1), not by the
        DataFrame index. Calling ``train`` again builds a fresh index.

        Args:
            tr_data: training frame; ``tr_data.values`` is handed to the backend.
            tr_params: for ``hnsw`` the keyword arguments of ``init_index``;
                for ``pynndescent`` extra keyword arguments for the
                ``NNDescent`` constructor. ``None`` is treated as ``{}``.
        """
        tr_params = tr_params or {}

        if self.name == 'hnsw':
            if self.tr_data is not None:
                # Re-training: start from a new, empty index rather than
                # calling `init_index` on the one that already holds items.
                self.idx = hnswlib.Index(**self._params)
            self.idx.init_index(**tr_params)
            self.idx.add_items(tr_data.values, np.arange(tr_data.shape[0]))
        elif self.name == 'pynndescent':
            self.idx = self._index_factory(tr_data.values, **tr_params)
            self.idx.prepare()

        # Stored only after the index was built successfully.
        self.tr_data = tr_data

    def kneighbors(self, query, k, is_train, q_params):
        """Return the positions of the ``k`` nearest training rows for every query row.

        Args:
            query: query points, passed to the backend unchanged.
            k: number of neighbours to return per query row.
            is_train: when true, ``k + 1`` neighbours are requested and the
                first one is dropped. This presumes every query point comes
                back as its own first neighbour; with duplicated points or an
                imprecise search that is not guaranteed.
            q_params: query-time options. For ``hnsw`` the optional key
                ``ef_search`` is applied through ``set_ef``; for
                ``pynndescent`` all entries are forwarded to ``query``
                (e.g. ``epsilon``). ``None`` is treated as ``{}``.

        Returns:
            The first element of the backend's query result (the neighbour
            ids) without the first ``int(is_train)`` columns.
        """
        shift = int(is_train)
        q_params = dict(q_params or {})

        if self.name == 'hnsw':
            if 'ef_search' in q_params:
                self.idx.set_ef(q_params['ef_search'])
            return self.idx.knn_query(query, k=k + shift)[0][:, shift:]
        if self.name == 'pynndescent':
            return self.idx.query(query, k=k + shift, **q_params)[0][:, shift:]

    def make_features(self, neighbor_ids, feature_info):
        """Aggregate training columns over the neighbours listed in ``neighbor_ids``.

        Args:
            neighbor_ids: 2-D array of row positions into the training frame,
                one row per query point, nearest neighbour first.
            feature_info: mapping ``new_name -> (column, func, k_list)``.
                ``func`` is either the name of an ndarray method accepting
                ``axis`` (e.g. ``'sum'``) or a callable applied to each row's
                1-D array of neighbour values.

        Returns:
            DataFrame with one column ``{new_name}_{k}nn`` per (feature, k) pair.

        Raises:
            RuntimeError: if ``train`` has not been called yet.

        Warns:
            UserWarning: if some ``k`` exceeds the number of columns in
                ``neighbor_ids``; the aggregation then uses all available
                neighbours although the column name still carries ``k``.
        """
        import warnings

        if self.tr_data is None:
            raise RuntimeError('call `train` before')

        neighbor_ids = np.asarray(neighbor_ids)
        n_available = neighbor_ids.shape[1]

        # Collected in a dict and converted once, instead of inserting columns
        # into a DataFrame one at a time.
        columns = {}
        for name, (col, apply, ann_list) in feature_info.items():
            col_vals = self.tr_data[col].values  # same for every k of this feature
            for ann in ann_list:
                col_name = f'{name}_{ann}nn'
                if ann > n_available:
                    warnings.warn(
                        f"feature '{col_name}': {ann} neighbours requested but "
                        f"neighbor_ids holds only {n_available}; "
                        f"aggregating over {n_available}",
                        stacklevel=2,
                    )
                neigh_vals = col_vals[neighbor_ids[:, :ann]]

                if isinstance(apply, str):
                    applied = getattr(neigh_vals, apply)(axis=1)
                else:
                    applied = np.apply_along_axis(apply, 1, neigh_vals)

                columns[col_name] = applied

        return pd.DataFrame(columns)

### Пример

Ваш:

In [None]:
# Backend selection and constructor arguments for hnswlib
# (see https://github.com/nmslib/hnswlib).
index_info_hnsw = dict(
    name="hnsw",
    params=dict(space='l2', dim=3),
)
# Backend selection and constructor arguments for pynndescent
# (see https://pynndescent.readthedocs.io/en/latest/api.html).
index_info_pynndescent = dict(
    name="pynndescent",
    params=dict(
        metric='euclidean',
        n_neighbors=3,
        leaf_size=20,
        n_trees=2,
        pruning_degree_multiplier=2.0,
        diversify_prob=1.0,
        verbose=False,
    ),
)

# Build-time parameters, one dict per backend.
index_tr_params_hnsw = dict(max_elements=100, ef_construction=5, M=3)
index_tr_params_pynndescent = dict()

# Query-time parameters, one dict per backend.
index_q_params_hnsw = dict(ef_search=2)
index_q_params_pynndescent = dict(epsilon=0.2)

# Toy data set: the six unit points on the coordinate axes.
train_data = pd.DataFrame(
    np.array(
        [
            [1, 0, 0],
            [0, 1, 0],
            [0, 0, 1],
            [-1, 0, 0],
            [0, -1, 0],
            [0, 0, -1],
        ],
        dtype=np.float64,
    ),
    columns=['x', 'y', 'z'],
)

# Train the hnsw-backed aggregator and query the training points themselves;
# is_train=True drops the first returned neighbour of every row.
knn_agg = KNNFeatureAggregator(
    name=index_info_hnsw["name"],
    params=index_info_hnsw["params"],
)
knn_agg.train(train_data, index_tr_params_hnsw)
neighbor_ids = knn_agg.kneighbors(
    query=train_data,
    k=4,
    is_train=True,
    q_params=index_q_params_hnsw,
)
neighbor_ids

array([[1, 2, 4, 5],
       [0, 2, 3, 5],
       [0, 1, 3, 4],
       [1, 2, 4, 5],
       [0, 2, 3, 5],
       [0, 1, 3, 4]], dtype=uint64)

Все соседи найдены без ошибок.

Точками датасета являются центры граней куба $[-1; 1]^3$.

Из-за симметрии результат недетерминирован: порядок индексов может отличаться от запуска к запуску, так как расстояния до соседей одинаковы.

In [None]:
# Aggregate columns of the training frame over the 4 neighbours found above.
# The aggregation may be given as an ndarray method name or as a callable.
knn_features = knn_agg.make_features(
    neighbor_ids,
    {
        'x_sum': ('x', 'sum', [4]),
        'y_min': ('y', lambda neigh_vals: neigh_vals.min(), [4]),
        'y_max': ('y', 'max', [4]),
    },
)
knn_features

Unnamed: 0,x_sum_4nn,y_min_4nn,y_max_4nn
0,0.0,-1.0,1.0
1,0.0,0.0,0.0
2,0.0,-1.0,1.0
3,0.0,-1.0,1.0
4,0.0,0.0,0.0
5,0.0,-1.0,1.0


Недостатки:
- нет реализации с репрезентативным семплом
- нет возможности работать с батчами (а датасеты нынче большие); это можно реализовать через генераторы
- нет возможности добавлять новые элементы в уже построенный индекс (с другой стороны, не все индексы это поддерживают)