# executor

> executor functions and classes

In [None]:
#| default_exp executor

In [None]:
#| hide
from nbdev.showdoc import *
%load_ext autoreload
%autoreload 2

In [None]:
#| export
from emb_opt.imports import *
from emb_opt.utils import batch_list, unbatch_list

  from .autonotebook import tqdm as notebook_tqdm


### Executors

`Executor` classes are helper classes that batch inputs and run a function over them, optionally in parallel.

In [None]:
#| export

class Executor():
    '''
    Base executor. Optionally batches the inputs, 
    calls `function` on each item (or batch), then 
    unbatches the outputs
    '''
    def __init__(self, 
                 function: Callable, # function to be wrapped
                 batched: bool,      # if inputs should be batched
                 batch_size: int=1   # batch size (set batch_size=0 to pass all inputs)
                ):
        self.function, self.batched, self.batch_size = function, batched, batch_size

    def batch_inputs(self, inputs: List[BaseModel]):
        # unbatched executors hand the inputs through untouched
        if not self.batched:
            return inputs
        return batch_list(inputs, self.batch_size)

    def unbatch_inputs(self, results: List[BaseModel]):
        # only batched executors pass their results through `unbatch_list`
        return unbatch_list(results) if self.batched else results

    def execute(self, inputs: List[BaseModel]):
        # one call to `function` per element of `inputs`, in order
        return [self.function(item) for item in inputs]

    def __call__(self, inputs: List[BaseModel]) -> List[BaseModel]:
        # pipeline: batch -> execute -> unbatch
        batches = self.batch_inputs(inputs)
        outputs = self.execute(batches)
        return self.unbatch_inputs(outputs)

In [None]:
class TestInput(BaseModel):
    value: float  # number checked against the 0.5 threshold below


class TestOutput(BaseModel):
    result: bool  # flag produced by the test functions


def test_function(input: TestInput) -> TestOutput:
    # unbatched variant: one input in, one output out
    above_threshold = input.value > 0.5
    return TestOutput(result=above_threshold)


def test_function_batched(inputs: list[TestInput]) -> list[TestOutput]:
    # batched variant: same threshold check for every item in the batch
    outputs = []
    for item in inputs:
        outputs.append(TestOutput(result=item.value > 0.5))
    return outputs


# fixed seed so the sampled values are reproducible
np.random.seed(42)
values = np.random.uniform(size=100).tolist()
inputs = [TestInput(value=v) for v in values]
expected_outputs = [TestOutput(result=v > 0.5) for v in values]

# standard

# unbatched: `test_function` is called once per input
executor = Executor(test_function, batched=False)
res1 = executor(inputs)
assert res1 == expected_outputs

# batched: `test_function_batched` is called with batch_size=5
executor = Executor(test_function_batched, batched=True, batch_size=5)
res2 = executor(inputs)
assert res2 == expected_outputs

In [None]:
#| export

class ProcessExecutor(Executor):
    '''
    ProcessExecutor - executes function with 
    multiprocessing using `ProcessPoolExecutor`
    '''
    def __init__(self,
                 function: Callable,           # function to be wrapped
                 batched: bool,                # if inputs should be batched
                 batch_size: int=1,            # batch size (set batch_size=0 to pass all inputs)
                 concurrency: Optional[int]=1  # number of concurrent processes (None or 1 runs serially)
                ):
        
        # shared attributes (function, batched, batch_size) are set by the base class
        super().__init__(function, batched, batch_size)
        self.concurrency = concurrency
        
    def execute(self, inputs: List[BaseModel]):
        # Run serially when no parallelism is requested. The same path is
        # taken for empty input: the pool would otherwise be created with
        # max_workers=0, which `ProcessPoolExecutor` rejects with ValueError.
        if (self.concurrency is None) or (self.concurrency==1) or (len(inputs)==0):
            return [self.function(i) for i in inputs]

        # never start more workers than there are items to process
        n_workers = min(self.concurrency, len(inputs))
        with ProcessPoolExecutor(n_workers) as p:
            # `map` yields results in input order
            results = list(p.map(self.function, inputs))
            
        return results

In [None]:
class TestInput(BaseModel):
    value: float  # number checked against the 0.5 threshold below


class TestOutput(BaseModel):
    result: bool  # flag produced by the test functions


def test_function(input: TestInput) -> TestOutput:
    # single-item function used for the unbatched runs
    is_high = input.value > 0.5
    return TestOutput(result=is_high)


def test_function_batched(inputs: list[TestInput]) -> list[TestOutput]:
    # list-in / list-out function used for the batched runs
    flags = [item.value > 0.5 for item in inputs]
    return [TestOutput(result=flag) for flag in flags]


# fixed seed so the sampled values are reproducible
np.random.seed(42)
values = np.random.uniform(size=100).tolist()
inputs = [TestInput(value=v) for v in values]
expected_outputs = [TestOutput(result=v > 0.5) for v in values]

# process

# unbatched, serial path (concurrency=1)
executor = ProcessExecutor(test_function, batched=False, concurrency=1)
res3 = executor(inputs)
assert res3 == expected_outputs

# unbatched, two worker processes
executor = ProcessExecutor(test_function, batched=False, concurrency=2)
res4 = executor(inputs)
assert res4 == expected_outputs

# batched, default concurrency
executor = ProcessExecutor(test_function_batched, batched=True, batch_size=5)
res5 = executor(inputs)
assert res5 == expected_outputs

# batched, two worker processes
executor = ProcessExecutor(test_function_batched, batched=True, batch_size=5, concurrency=2)
res6 = executor(inputs)
assert res6 == expected_outputs

In [None]:
#| export

class ThreadExecutor(Executor):
    '''
    ThreadExecutor - executes function with 
    multiple threads using `ThreadPoolExecutor`
    '''
    def __init__(self,
                 function: Callable,           # function to be wrapped
                 batched: bool,                # if inputs should be batched
                 batch_size: int=1,            # batch size (set batch_size=0 to pass all inputs)
                 concurrency: Optional[int]=1  # number of concurrent threads (None or 1 runs serially)
                ):
        
        # shared attributes (function, batched, batch_size) are set by the base class
        super().__init__(function, batched, batch_size)
        self.concurrency = concurrency
        
    def execute(self, inputs: List[BaseModel]):
        # Run serially when no parallelism is requested. The same path is
        # taken for empty input: the pool would otherwise be created with
        # max_workers=0, which `ThreadPoolExecutor` rejects with ValueError.
        if (self.concurrency is None) or (self.concurrency==1) or (len(inputs)==0):
            return [self.function(i) for i in inputs]

        # never start more threads than there are items to process
        n_workers = min(self.concurrency, len(inputs))
        with ThreadPoolExecutor(n_workers) as p:
            # `map` yields results in input order
            results = list(p.map(self.function, inputs))
            
        return results

In [None]:
class TestInput(BaseModel):
    value: float  # number checked against the 0.5 threshold below


class TestOutput(BaseModel):
    result: bool  # flag produced by the test functions


def test_function(input: TestInput) -> TestOutput:
    # per-item function for the unbatched runs
    exceeds = input.value > 0.5
    return TestOutput(result=exceeds)


def test_function_batched(inputs: list[TestInput]) -> list[TestOutput]:
    # per-batch function for the batched runs
    batch_outputs = []
    for item in inputs:
        exceeds = item.value > 0.5
        batch_outputs.append(TestOutput(result=exceeds))
    return batch_outputs


# fixed seed so the sampled values are reproducible
np.random.seed(42)
values = np.random.uniform(size=100).tolist()
inputs = [TestInput(value=v) for v in values]
expected_outputs = [TestOutput(result=v > 0.5) for v in values]

# thread

# unbatched, serial path (concurrency=1)
executor = ThreadExecutor(test_function, batched=False, concurrency=1)
res7 = executor(inputs)
assert res7 == expected_outputs

# unbatched, two worker threads
executor = ThreadExecutor(test_function, batched=False, concurrency=2)
res8 = executor(inputs)
assert res8 == expected_outputs

# batched, default concurrency
executor = ThreadExecutor(test_function_batched, batched=True, batch_size=5)
res9 = executor(inputs)
assert res9 == expected_outputs

# batched, two worker threads
executor = ThreadExecutor(test_function_batched, batched=True, batch_size=5, concurrency=2)
res10 = executor(inputs)
assert res10 == expected_outputs

In [None]:
#| export

class DatasetExecutor(Executor):
    '''
    DatasetExecutor - executes function in parallel 
    using `Dataset.map`
    '''
    def __init__(self,
                 function: Callable,              # function to be wrapped (called by `Dataset.map`)
                 batched: bool,                   # if inputs should be batched
                 batch_size: int=1,               # batch size (set batch_size=0 to pass all inputs)
                 concurrency: Optional[int]=1,    # number of concurrent processes (passed as `num_proc`)
                 map_kwargs: Optional[dict]=None  # kwargs for `Dataset.map`
                ):
        
        # shared attributes (function, batched, batch_size) are set by the base class
        super().__init__(function, batched, batch_size)
        self.concurrency = concurrency
        self.map_kwargs = map_kwargs if map_kwargs else {}
        
    def batch_inputs(self, inputs: List[BaseModel]):
        # batching is delegated to `Dataset.map`; here the models are only
        # dumped to dicts and loaded into a `Dataset`
        dataset = datasets.Dataset.from_list([i.model_dump() for i in inputs])
        return dataset
            
    def unbatch_inputs(self, dataset):
        # convert the mapped dataset back to a list of dicts
        return dataset.to_list()

    def execute(self, dataset):
        # `function` is passed directly (not wrapped in a one-argument lambda)
        # so `map_kwargs` options that add positional arguments, such as
        # `with_indices` or `with_rank`, reach it unchanged
        dataset = dataset.map(self.function, batched=self.batched, 
                             batch_size=self.batch_size, num_proc=self.concurrency, **self.map_kwargs)
        return dataset

In [None]:
class TestInput(BaseModel):
    value: float  # number checked against the 0.5 threshold below


class TestOutput(BaseModel):
    result: bool  # flag produced by the test functions


def test_function_hf(input: dict) -> dict:
    # row-wise variant: `input['value']` is a single number
    passed = input['value'] > 0.5
    return {'result': passed}


def test_function_hf_batched(input: dict) -> dict:
    # batched variant: `input['value']` is iterated as a collection of numbers
    flags = [v > 0.5 for v in input['value']]
    return {'result': flags}


# fixed seed so the sampled values are reproducible
np.random.seed(42)
values = np.random.uniform(size=100).tolist()
inputs = [TestInput(value=v) for v in values]
expected_outputs = [TestOutput(result=v > 0.5) for v in values]

# dataset

# row-wise, no multiprocessing
executor = DatasetExecutor(test_function_hf, batched=False, concurrency=None, batch_size=1)
res11 = executor(inputs)
assert [TestOutput.model_validate(row) for row in res11] == expected_outputs

# row-wise, two processes
executor = DatasetExecutor(test_function_hf, batched=False, concurrency=2, batch_size=1)
res12 = executor(inputs)
assert [TestOutput.model_validate(row) for row in res12] == expected_outputs

# batched, two processes
executor = DatasetExecutor(test_function_hf_batched, batched=True, concurrency=2, batch_size=5)
res13 = executor(inputs)
assert [TestOutput.model_validate(row) for row in res13] == expected_outputs

# batched, no multiprocessing
executor = DatasetExecutor(test_function_hf_batched, batched=True, concurrency=None, batch_size=5)
res14 = executor(inputs)
assert [TestOutput.model_validate(row) for row in res14] == expected_outputs

                                                                                