In [1]:
from joblib import Parallel, delayed

In [16]:
with Parallel(n_jobs=-1) as parallel:
    def sqrt(x):
        return x**0.5
    
    accumulator = 0.
    n_iter = 0
    while accumulator < 1000:
        results = parallel(delayed(sqrt) (accumulator + i **2) for i in range(5))
        accumulator += sum(results)
        n_iter += 1
        
    print(f'Computate to the value {accumulator} in {n_iter} iterations')

Computate to the value 1136.5969161564717 in 14 iterations


In [15]:
accumulator = 0.
n_iter = 0
while accumulator < 1000: 
    results = 0.
    for i in range(5): 
        results += sqrt(accumulator + i**2)
    accumulator += (results)
    n_iter += 1
        
print(f'Computate to the value {accumulator} in {n_iter} iterations')
        

Computate to the value 1136.5969161564717 in 14 iterations


#### Entendiendo los diferentes tipos de backend de joblib con aleatoriedad

#### Entendiendo los diferentes tipos de backend de joblib con aleatoriedad

##### Primero vamos a ver que pasa al correr de manera secuencial un experimento con aleatoriedad

In [4]:
import numpy as np
from joblib import Parallel, delayed

def printvector(vector, backend):
    print(f'\nThe different generated vectors using the {backend} backend are:\n { np.array(vector)}')

def generarcoordenadas(max):
    return np.random.randint(max, size = 5)

nvectors = 5

randomvector = [generarcoordenadas(15) for i in range(nvectors)]

print(f'\nThe different generated vectors in a sequential manner are: \n{np.array(randomvector)}')


The different generated vectors in a sequential manner are: 
[[13 10  9 14  6]
 [ 0  1 11  4 13]
 [ 9 11 12  6  8]
 [11 11  8  1 11]
 [ 5  7  6  2  3]]


##### Luego vamos a ver que pasa con backend = 'loky' y 'threading'

In [10]:
backend = 'loky'
random_vector = Parallel(n_jobs=4, backend=backend)(delayed(
    generarcoordenadas)(15) for _ in range(nvectors))
printvector(random_vector, backend)

backend = 'threading'
random_vector = Parallel(n_jobs=4, backend=backend)(delayed(generarcoordenadas)(15) for _ in range(nvectors))
printvector(random_vector, backend)


The different generated vectors using the loky backend are:
 [[10  0  8  9  8]
 [ 2  7  5  5  5]
 [ 0  6  8  3  1]
 [14  7  0 13  2]
 [14  1  3  7 11]]

The different generated vectors using the threading backend are:
 [[ 1  1  9  6  3]
 [13 10 13  9  6]
 [ 1 14  0  8  9]
 [14  2 14 12  3]
 [10  3  7  3 14]]


#### Observemos que en estos casos la paralelización preserva aleatoriedad, pasemos ahora a multiprocessing

In [17]:
import multiprocessing as mp

backend = 'multiprocessing'
nvectors = 10
def generarcoordenadas(max):
    seed = np.random.randint(1000) # add a random seed value
    return np.random.RandomState(seed).randint(max, size=5)


random_vector = Parallel(n_jobs = 5, backend = backend)( delayed(generarcoordenadas)(15) for i in range(nvectors))
printvector(random_vector, backend)



The different generated vectors using the multiprocessing backend are:
 [[13 13  2  0 12]
 [13 13  2  0 12]
 [13 13  2  0 12]
 [13 13  2  0 12]
 [13 13  2  0 12]
 [ 5 14  1  8  5]
 [ 5 14  1  8  5]
 [ 5 14  1  8  5]
 [ 5 14  1  8  5]
 [11  4  0  1 11]]


#### YA NO SE CUMPLE la aleatoriedad, observemos que obtendremos nvectors // n_jobs grupos iguales lo cual muestra que la aleatoriedad ya no se está cumpliendo esto sucede porque el estado global del generador de números aleatorios de numpy se duplicará exactamente en todos los trabajadores. Es decir todos tienen la misma random seed

### Esto podemos arreglarlo cambiando el estado de generador de números aleatorios usando RandomState y seeds que lo cambien por cada proceso

In [20]:
import numpy as np
from joblib import Parallel, delayed

def printvector(vector, backend):
    print(f'\nThe different generated vectors using the {backend} backend are:\n { np.array(vector)}')

def generarcoordenadas(max, seed):
    rng = np.random.RandomState(seed)
    return rng.randint(max, size=5)

backend = 'multiprocessing'
nvectors = 10


random_vector = Parallel(n_jobs=5, backend=backend)(delayed(generarcoordenadas)(15, None) for i in range(nvectors))
printvector(random_vector, backend)



The different generated vectors using the multiprocessing backend are:
 [[10 10  2 14  6]
 [ 6 12  8  8 10]
 [ 5  4  4 10  5]
 [ 9  2  1 11  7]
 [14  4 12 10  5]
 [ 2 14  1 11 14]
 [ 3 14 11 12 13]
 [ 3  3  8  7  4]
 [ 8 13  3 13  5]
 [14  3  6  0  8]]


## Busquemos la forma de aplicar Paralelización a apply functions de pandas o para usarlo en dataframes

In [23]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

df = pd.DataFrame({
    'col1': [1, 2, 3, 4, 5],
    'col2': [6, 7, 8, 9, 10]
})


def toString(cell): 
    return str(cell)

print(df)
df['col2'] = df['col2'].apply(lambda x: Parallel(n_jobs = 4)(delayed(toString)(x)))
print(df)

   col1  col2
0     1     6
1     2     7
2     3     8
3     4     9
4     5    10


TypeError: cannot unpack non-iterable function object