In [None]:
import time
import os

In [None]:
import multiprocessing as mp
mp.cpu_count()

2

In [None]:
! python -V

Python 3.7.12


# Single Process
按照顺序，逐步执行.

In [None]:
def long_time_task():
    """Simulate a slow job: print the current PID, sleep two seconds, print 8**20."""
    current_pid = os.getpid()
    print(f'当前进程: {current_pid}')
    time.sleep(2)
    power = 8 ** 20
    print(f"结果: {power}")

In [None]:
# Baseline: run the task twice, one call after the other, in the parent process
# and measure the total wall-clock time.
print(f'当前母进程: {os.getpid()}')
start = time.time()
for i in range(2):
    long_time_task()

# Elapsed time covers both sequential calls.
end = time.time()
print(f"用时{(end-start)}秒")

当前母进程: 58
当前进程: 58
结果: 1152921504606846976
当前进程: 58
结果: 1152921504606846976
用时4.004814386367798秒


# Multi-Processing
The multiprocessing package offers both local and remote concurrency, effectively side-stepping the `Global Interpreter Lock` by using `subprocesses` instead of `threads`.

>In Python, the `Global Interpreter Lock (GIL)` is a `lock` that allows **only a single thread to control the Python interpreter**.<br> 
- In the case of `multithreading`, which is primarily used for `IO-bound` jobs, the `GIL` doesn’t have much impact because a **thread releases the lock while it is waiting for `I/O`**, letting other threads run in the meantime. <br>
- `Multiprocessing`, on the other hand, **allocates a Python Interpreter and GIL to every process**.

reference:
-  [How to Use the Multiprocessing Package in Python](https://towardsdatascience.com/how-to-use-the-multiprocessing-package-in-python3-a1c808415ec2) - Flyte/Lyft

# Process Class
The Process class in multiprocessing **allocates all the tasks in the memory in one go**. 

Every task created using the `Process` class has to have a **separate memory allocated**.


In [None]:
def long_time_task(i):
    """Simulate a slow job for task number ``i``.

    Prints the PID of the process running it together with ``i``, sleeps two
    seconds, then prints 8**20.
    """
    worker_pid = os.getpid()
    print(f'子进程: {worker_pid} - 任务{i}')
    time.sleep(2)
    power = 8 ** 20
    print(f"结果: {power}")

In [None]:
from multiprocessing import Process

# Run the same task in two child processes and time the whole thing.
print('当前母进程: {}'.format(os.getpid()))
start = time.time()
# Creating a Process does not run it; the target only executes after start().
p1 = Process(target=long_time_task, args=(1,))
p2 = Process(target=long_time_task, args=(2,))

print('等待所有子进程完成。')
# Start both children before joining either one so that they overlap.
p1.start()
p2.start()
# join() blocks the parent until the corresponding child has exited.
p1.join()
p2.join()
end = time.time()

print("总共用时{}秒".format((end - start)))

当前母进程: 66
等待所有子进程完成。
子进程: 111 - 任务1
子进程: 112 - 任务2
结果: 1152921504606846976
结果: 1152921504606846976
总共用时2.040834426879883秒


In [None]:
import time
from multiprocessing import Process


def cube(x):
    """Print the cube of ``x`` between start/end markers, with a one-second pause.

    Nothing is returned; the value is only written to stdout.
    """
    print(f"start process {x}")
    cubed = x * x * x
    print(cubed)
    time.sleep(1)
    print(f"end process {x}")


# Launch one child process per input value, then wait for all of them.
if __name__ == "__main__":
    processes = []
    for i in range(10):
        # initiate a process
        p = Process(target=cube, args=(i,))
        processes.append(p)
        p.start()  # commence the process

    # Join in a second loop, after every process has been started, so the
    # children run concurrently instead of one at a time.
    for p in processes:
        p.join()


start process 0
start process 1
0
start process 2
1
8
start process 3
start process 4
64
27
start process 5
start process 6
125
216
start process 7
343
start process 8
512
start process 9
729
end process 0
end process 1
end process 2
end process 4
end process 3
end process 5
end process 6
end process 7
end process 8
end process 9


## Pipe
If two processes need to communicate, Pipe’s the best choice. A pipe can have two end-points where each has `send()` and `recv()` methods. Data in a pipe could get corrupted if two processes (threads) read from or write to the same end-point simultaneously.

## Queue
To store the output of multiple processes in a shared communication channel, a `queue` can be used. 

For instance, assume that the task is to find the cubes of the first ten natural numbers followed by adding 1 to each number.

In [None]:
# Two-stage pipeline over a shared Queue: cube 0..9, then add 1 to each cube.
# The earlier version drained the queue with `while not q.empty()` while the
# `add` workers were putting new items onto the same queue, so the loop could
# keep consuming its own output and never finish.
from multiprocessing import Process, Queue


def cube(x, q):
    """Put the cube of ``x`` on queue ``q``."""
    q.put(x * x * x)


def add(x, q):
    """Put ``x + 1`` on queue ``q``."""
    q.put(x + 1)


if __name__ == "__main__":
    n_items = 10
    q = Queue()
    processes = []
    for i in range(n_items):  # add 10 cubes to queue
        p = Process(target=cube, args=(i, q,))
        processes.append(p)
        p.start()

    # Every child puts exactly one item, so take exactly ``n_items`` of them.
    # Counting is reliable where Queue.empty() is not, and draining *before*
    # join() follows the multiprocessing guideline for processes that have put
    # data on a queue.
    initial_values = [q.get() for _ in range(n_items)]
    for p in processes:
        p.join()

    processes = []
    print("INITIAL VALUES: ")
    # Iterate over the snapshot, not the live queue, so the results produced by
    # the `add` workers are never fed back in as new input.
    for val in initial_values:
        print(val)
        p = Process(target=add, args=(val, q,))
        processes.append(p)
        p.start()

    final_values = [q.get() for _ in processes]
    for p in processes:
        p.join()

    print("FINAL VALUES: ")
    for val in final_values:
        print(val)

# Pool Class

The `Pool` class in `multiprocessing` can handle an enormous number of processes. It allows you to run multiple jobs per process (due to its ability to queue the jobs). 

The memory is allocated only to the executing processes, unlike the `Process` class, which allocates memory to all the processes. The Pool class takes the number of worker processes to be present in the pool and spawns the processes.

A prime example of this is the `Pool` object which offers a convenient means of parallelizing the execution of a function across multiple input values, distributing the input data across processes (**data parallelism**).

## apply
The `apply()` method blocks the primary process until the submitted call has finished and returns its result. It accepts multiple arguments, **maintains the order of the result**, and **isn’t concurrent**: each call runs to completion before the next one starts.

In [None]:
import time
from multiprocessing import Pool


def cube(x):
    """Return ``x`` cubed, logging start/end markers around a one-second pause."""
    print(f"start process {x}")
    cubed = x * x * x
    time.sleep(1)
    print(f"end process {x}")
    return cubed


# Demonstrates Pool.apply(): every call blocks until its result is ready, so
# the ten tasks run one after another even though the pool has four workers.
if __name__ == "__main__":
    ts = time.time()
    pool = Pool(processes=4)
    print([pool.apply(cube, args=(x,)) for x in range(10)])
    # close() stops new submissions; join() waits for the workers to exit.
    pool.close()
    pool.join()
    print("Time in parallel:", time.time() - ts)

start process 0
end process 0
start process 1
end process 1
start process 2
end process 2
start process 3
end process 3
start process 4
end process 4
start process 5
end process 5
start process 6
end process 6
start process 7
end process 7
start process 8
end process 8
start process 9
end process 9
[0, 1, 8, 27, 64, 125, 216, 343, 512, 729]
Time in parallel: 10.165384769439697


## [map](https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool.map)
The `map()` method supports concurrency, but the mapped function takes only a single argument, and the call blocks the main program until all the tasks are complete.
- It also maintains the order of the result (return)
- although the computation order could differ!


In [None]:
import time
from multiprocessing import Pool


def cube(x):
    """Compute ``x * x * x``, printing a start marker before and an end marker
    after a one-second simulated delay, and return the product."""
    print(f"start process {x}")
    value = x * x * x
    time.sleep(1)
    print(f"end process {x}")
    return value


# Demonstrates Pool.map(): the call blocks until every input has been handled
# and returns the results as a list in input order.
if __name__ == "__main__":
    pool = Pool(processes=4)
    print(pool.map(cube, range(5)))
    # close() stops new submissions; join() waits for the workers to exit.
    pool.close()
    pool.join()


start process 1
start process 0
start process 3
start process 2
end process 3
end process 1
start process 4
end process 0
end process 2
end process 4
[0, 1, 8, 27, 64]


In [None]:
import time
from multiprocessing import Pool


def cube(x):
    """Print the cube of ``x`` after a one-second pause; returns None.

    The value is emitted as a side effect instead of being returned.
    """
    print(f"start process {x}")
    cubed = x * x * x
    time.sleep(1)
    print(f"end process {x}")
    # Anything recorded at this point (a print, a saved record, ...) is emitted
    # whenever this worker gets here, so it won't be in the right sequence.
    print(cubed)


# Same Pool.map() call as before, but this cell's cube() has no return
# statement, so the printed list contains only None values; the cubes are
# printed by the workers themselves.
if __name__ == "__main__":
    pool = Pool(processes=4)
    print(pool.map(cube, range(5)))
    pool.close()
    pool.join()


In [None]:
import time
from multiprocessing import Pool


def cube(x):
    """Append the cube of ``x`` to ./cube.txt after a one-second pause.

    Returns None; the result exists only as a line in the file.
    """
    print(f"start process {x}")
    cubed = x * x * x
    time.sleep(1)
    print(f"end process {x}")

    # map() only orders its *return values*; side effects such as this append
    # happen in whatever order the workers reach this point.
    with open("./cube.txt", "a") as out_file:
        out_file.write(f"{cubed}\n")

# Run cube() over 0..4 with Pool.map(), then read back what the workers wrote.
if __name__ == "__main__":
    pool = Pool(processes=4)
    pool.map(cube, range(5))
    pool.close()
    pool.join()
    print("writing finished")
    # The file is opened in append mode by the workers and never cleared here,
    # so lines from earlier runs are still present when it is read back.
    with open("./cube.txt", "r") as f:
        content = f.read().splitlines()
    print(content)


start process 0
start process 1
start process 2
start process 3
end process 0
start process 4
end process 1
end process 2
end process 3
end process 4
writing finished
['0', '1', '27', '8', '64']


## apply_async
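A callback function passed to `apply_async()` receives each task's return value as soon as that task completes.

This method supports `concurrency`. Results come back in submission order only if you call `.get()` on the returned `AsyncResult` objects in that order; callbacks fire in completion order.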

In [None]:
from multiprocessing import Pool, cpu_count

In [None]:
def long_time_task(i):
    """Pretend to work for two seconds on task ``i``.

    Prints the executing process's PID with the task number, sleeps, and then
    prints the constant 8**20.
    """
    pid = os.getpid()
    print(f'子进程: {pid} - 任务{i}')
    time.sleep(2)
    outcome = 8 ** 20
    print(f"结果: {outcome}")

In [None]:
# Submit five slow tasks to a pool with one worker per CPU and time the batch.
if __name__=='__main__':
    print(f"CPU内核数:{cpu_count()}")
    print(f'当前母进程: {os.getpid()}')
    
    start = time.time()
    p = Pool(cpu_count())
    # apply_async() returns immediately; the AsyncResult objects are discarded
    # here because long_time_task only prints and returns nothing.
    for i in range(5):
        p.apply_async(long_time_task, args=(i,))
    
    print('等待所有子进程完成。')
    # close() stops new submissions; join() blocks until the queued tasks are
    # done and the workers have exited.
    p.close()
    p.join()
    
    end = time.time()
    print("总共用时{}秒".format((end - start)))

CPU内核数:2
当前母进程: 68
子进程: 13567 - 任务0
子进程: 13568 - 任务1
等待所有子进程完成。
结果: 1152921504606846976
子进程: 13567 - 任务2
结果: 1152921504606846976
子进程: 13568 - 任务3
结果: 1152921504606846976
子进程: 13567 - 任务4
结果: 1152921504606846976
结果: 1152921504606846976
总共用时6.141690015792847秒


In [None]:
import time
from multiprocessing import Pool


def collect_result(val):
    """Callback used with ``apply_async``: hand back ``val`` unchanged."""
    return val


def cube(x):
    """Sleep one second between start/end log lines, then return ``x`` cubed."""
    print(f"start process {x}")
    time.sleep(1)
    print(f"end process {x}")
    cubed = x * x * x
    return cubed


# Demonstrates apply_async(): submit all tasks, then collect their results.
if __name__ == "__main__":
    pool = Pool(processes=4)
    # Submit every task before asking for any result. Calling .get() directly
    # on each apply_async() return value blocks until that one task is done,
    # which makes the tasks run strictly one after another.
    async_results = []
    for x in range(5):
        async_results.append(
            pool.apply_async(cube, args=(x,), callback=collect_result)
        )
    # Reading the AsyncResult objects in submission order keeps the printed
    # values in input order even though the tasks themselves overlap.
    for async_result in async_results:
        print(async_result.get())
    pool.close()
    pool.join()


start process 0
end process 0
start process 1
0
end process 1
start process 2
1
end process 2
start process 3
8
end process 3
start process 4
27
end process 4
64


## map_async


In [None]:
import time
from multiprocessing import Pool


def collect_result(val):
    """Callback used with ``map_async``: hand back ``val`` unchanged."""
    return val


def cube(x):
    """Log a start marker, wait one second, log an end marker, return ``x`` cubed."""
    print(f"start process {x}")
    time.sleep(1)
    print(f"end process {x}")
    product = x * x * x
    return product


def cube_print(x):
    """Print ``x`` cubed; nothing is returned."""
    cubed = x * x * x
    print(cubed)


# Demonstrates map_async(): the call itself returns at once; only .get() blocks.
if __name__ == "__main__":
    pool = Pool(processes=4)
    # .get() is called right away here, so this line blocks until all ten cubes
    # are ready and then prints them as one ordered list.
    print(pool.map_async(cube, range(10), callback=collect_result).get())
    # No .get()/.wait() on this one: the main process continues immediately
    # while the workers print the cubes in the background.
    pool.map_async(cube_print, range(10))
    print("HERE!")
    print("HERE AGAIN!")
    pool.close()
    pool.join()

# Something looks off with this Python version: when run locally, "HERE!" is
# always printed before the final printed list.
# “HERE” and “HERE AGAIN” are written to the console when map_async() runs, 
# showcasing its non-blocking nature. However, you can use wait() to block the asynchronous calls.

start process 0
start process 1
start process 2
start process 3
end process 0
start process 4
end process 1
end process 2
start process 5
start process 6
end process 3
start process 7
end process 4
start process 8
end process 5
end process 6
end process 7
start process 9
end process 8
end process 9
1
27
64
125
216
8
0
729
343
512
[0, 1, 8, 27, 64, 125, 216, 343, 512, 729]
HERE!
HERE AGAIN!


## imap
1. `pool.imap(func, input)`: `input` 可以是`iterable`, `iterator`. <br>
下面那个`pd.DataFrame`作为`input`的例子特别好.
2. `imap`的output是个`Iterator`, 所以通常是用`for loop`, 来不停iterate


In [None]:
import time
from multiprocessing import Pool

def cube(x):
    """Return ``x`` cubed, logging start/end markers around a one-second pause.

    A stray bare ``tmp_file`` expression used to precede this definition; the
    name is not defined anywhere in this cell, so it raised NameError before
    the function could be created. This cell does not use a temp file.
    """
    print(f"start process {x}")
    result = x * x * x
    time.sleep(1)
    print(f"end process {x}")
    return result


# Demonstrates Pool.imap(): results are consumed from an iterator, in input
# order, as they become available.
if __name__ == "__main__":
    pool = Pool(processes=4)
    for each_item in pool.imap(cube, range(5)):
        print(each_item)
    pool.close()
    # pool.join()  # join doesn't seem to make a difference here: the loop
    # above has already consumed every result.
    print("this should be last line")


start process 2
start process 0
start process 3
start process 1
end process 1
end process 3
start process 4
end process 0
end process 2
0
1
8
27
end process 4
64
this should be last line


In [None]:
import os
import time
from multiprocessing import Pool

tmp_file = "cube.txt"

# Start from a clean file; a missing file is not an error.
try:
    os.remove(tmp_file)
except OSError:
    pass

def cube(x):
    """Cube ``x``, append the result to ``tmp_file``, and return it.

    Start/end markers are printed together with the PID of the executing
    process, and a one-second sleep simulates work.
    """
    pid = os.getpid()
    print(f"start process {x}: {pid}")
    cubed = x * x * x
    time.sleep(1)

    # Appends land in the file in the order workers get here, so the saved
    # order is not guaranteed to match the input order.
    with open(tmp_file, "a") as out_file:
        out_file.write(f"{cubed}\n")
    print(f"end process {x}: {pid}")
    return cubed

# Compare imap()'s ordered return values with the unordered file side effect.
if __name__ == "__main__":
    res = []
    # Leaving the with-block terminates the pool (Pool.__exit__ calls
    # terminate()); by then the loop has consumed every result.
    with Pool(processes=4) as pool:
        for each_item in pool.imap(cube, range(5)):  # range(5) is iterable
            res.append(each_item)
            # Printed by the parent: the loop body runs in the main process.
            print(f"##### current process #####: {os.getpid()}")
    
    with open(tmp_file, "r") as f:
        content = f.readlines()
    print(f"show content: {content}")
    print(f"returned results: {res}")
    print(f"current process: {os.getpid()}")

start process 2: 1677
start process 0: 1675
start process 3: 1678
start process 1: 1676
end process 2: 1677
start process 4: 1677
end process 1: 1676
end process 0: 1675
end process 3: 1678
##### current process #####: 67
##### current process #####: 67
##### current process #####: 67
##### current process #####: 67
end process 4: 1677
##### current process #####: 67
show content: ['8\n', '0\n', '27\n', '1\n', '64\n']
returned results: [0, 1, 8, 27, 64]
current process: 67


In [None]:
import os 
import time
from multiprocessing import Pool

tmp_file = "cube.txt"

# Remove output left over from a previous run; ignore a missing file.
try:
    os.remove(tmp_file)
except OSError:
    pass

def cube(x):
    """Cube ``x`` and append the result to ``tmp_file``; returns None.

    The value is deliberately not returned, so the file is the only place the
    result ends up.
    """
    pid = os.getpid()
    print(f"start process {x}: {pid}")
    cubed = x * x * x
    time.sleep(1)
    print(f"end process {x}: {pid}")

    # save order won't be guaranteed
    with open(tmp_file, "a") as out_file:
        out_file.write(f"{cubed}\n")

# imap() over a list whose worker returns nothing: the loop exists only to
# drive the iterator, and the results are read back from the temp file.
if __name__ == "__main__":
    data = list(range(12))  # use a list as the input to imap
    with Pool(processes=4) as pool:
        for each_item in pool.imap(cube, data):  # the for-loop drives the iteration
            each_item  # no-op expression; cube() in this cell returns None
            print(f"##### current process #####: {os.getpid()}")
    # pool.join()
    print("this should be 2nd last line")

    with open(tmp_file, "r") as f:
        content = f.readlines()
    print(f"show content: {content}")
    print(f"current process: {os.getpid()}")

start process 1: 1707
start process 2: 1708
start process 0: 1706
start process 3: 1709
end process 1: 1707
start process 4: 1707
end process 0: 1706
start process 5: 1706
end process 2: 1708
end process 3: 1709
start process 6: 1708
start process 7: 1709
##### current process #####: 67
##### current process #####: 67
##### current process #####: 67
##### current process #####: 67
end process 4: 1707
start process 8: 1707
end process 5: 1706
end process 6: 1708
start process 9: 1706
start process 10: 1708
end process 7: 1709
start process 11: 1709
##### current process #####: 67
##### current process #####: 67
##### current process #####: 67
##### current process #####: 67
end process 8: 1707
end process 11: 1709
end process 10: 1708
end process 9: 1706
##### current process #####: 67
##### current process #####: 67
##### current process #####: 67
##### current process #####: 67
this should be 2nd last line
show content: ['1\n', '0\n', '8\n', '27\n', '64\n', '125\n', '216\n', '343\n', 

## imap_unordered

### `imap_unordered` with `List`

In [None]:
import os
import time
from typing import List, Tuple
from multiprocessing import Pool

tmp_file = "cube.txt"

# Clear any previous output file; it is fine if there is none.
try:
    os.remove(tmp_file)
except OSError:
    pass

def cube(x: List[int]):
    """Multiply the first two elements of ``x``, log the product to ``tmp_file``,
    and return it.

    Despite the name this is a product of two values, not a cube.
    """
    pid = os.getpid()
    print(f"start process {pid}: {x}")
    product = x[0] * x[1]
    time.sleep(1)

    # Each worker appends on its own schedule, so line order is arbitrary.
    with open(tmp_file, "a") as out_file:
        out_file.write(f"{product}\n")
    print(f"end process {pid}: {x}")

    return product
    

# imap_unordered() yields results as soon as any worker finishes, so `result`
# is in completion order rather than input order.
if __name__ == "__main__":
    data = [[i, 2] for i in range(12)]
    # [[0, 2], [1, 2], [2, 2], [3, 2], [4, 2], [5, 2], [6, 2], ...]
    result = []
    
    with Pool(processes=4) as pool:
        for each_item in pool.imap_unordered(cube, data):
            result.append(each_item)
            print(f"##### current process #####: {os.getpid()}")
    
    # The with-block has already terminated the pool on exit; this join() only
    # waits for the worker processes to finish shutting down.
    pool.join()  # wait for the worker processes to terminate

    with open(tmp_file, "r") as f:
        content = f.readlines()
    print(f"show content: {content}")
    print(f"results: {result}")
    print(f"final process: {os.getpid()}")

start process 279: [2, 2]
start process 277: [1, 2]
start process 278: [3, 2]
start process 276: [0, 2]
end process 279: [2, 2]
start process 279: [4, 2]
end process 276: [0, 2]
start process 276: [5, 2]
end process 278: [3, 2]
end process 277: [1, 2]
start process 278: [6, 2]
start process 277: [7, 2]
##### current process #####: 66
##### current process #####: 66
##### current process #####: 66
##### current process #####: 66
end process 279: [4, 2]
start process 279: [8, 2]
end process 276: [5, 2]
start process 276: [9, 2]
end process 278: [6, 2]
start process 278: [10, 2]
end process 277: [7, 2]
start process 277: [11, 2]
##### current process #####: 66
##### current process #####: 66
##### current process #####: 66
##### current process #####: 66
end process 279: [8, 2]
end process 276: [9, 2]
end process 278: [10, 2]
end process 277: [11, 2]
##### current process #####: 66
##### current process #####: 66
##### current process #####: 66
##### current process #####: 66
show content

In [None]:
import os
import time
from typing import List, Tuple
from multiprocessing import Pool

# Output file written by the main block of this cell.
tmp_file = "cube.txt"

# Delete output left over from an earlier run; a missing file is not an error.
try:
    os.remove(tmp_file)
except OSError:
    pass

def cube(x: List[int]):
    """Return the product of the first two elements of ``x``.

    Prints start/end markers around a one-second pause. Unlike the variants
    that write to a file, this one leaves persistence to the caller.
    """
    print(f"start process {x}")
    product = x[0] * x[1]
    time.sleep(1)

    print(f"end process {x}")

    return product
    

# Variant where the parent process writes the file: results are written in
# the order imap_unordered() yields them, so the file and `result` agree.
if __name__ == "__main__":
    data = [[0, 2], [1, 2], [2, 2], [3, 2], [4, 2], [5, 2], [6, 2]]
    result = []
    
    # The file is opened once, in the parent, around the whole pool lifetime.
    with open(tmp_file, "a") as f:
        with Pool(processes=4) as pool:
            for each_item in pool.imap_unordered(cube, data):        
                f.write(f"{each_item}\n")
                result.append(each_item)
    # The pool was already terminated when its with-block exited; join() waits
    # for the workers to finish shutting down.
    pool.join()
    print("this should be 2nd last line")

    with open(tmp_file, "r") as f:
        content = f.readlines()
    print(f"show content: {content}")
    print(f"results: {result}")

start process [1, 2]
start process [2, 2]
start process [0, 2]
start process [3, 2]
end process [2, 2]
end process [1, 2]
end process [0, 2]
start process [4, 2]
start process [5, 2]
start process [6, 2]
end process [3, 2]
end process [5, 2]
end process [4, 2]
end process [6, 2]
this should be 2nd last line
show content: ['4\n', '0\n', '2\n', '6\n', '10\n', '8\n', '12\n']
results: [4, 0, 2, 6, 10, 8, 12]


### `imap_unordered` with enumerate

In [None]:
list(enumerate(range(5,10)))

[(0, 5), (1, 6), (2, 7), (3, 8), (4, 9)]

In [None]:
import os
import time
from typing import List, Tuple
from multiprocessing import Pool

tmp_file = "cube.txt"

# Remove output left over from an earlier run; a missing file is not an error.
try:
    os.remove(tmp_file)
except OSError:
    pass

def cube(x: Tuple[int, int]):
    """Log and return ``x`` unchanged.

    In this cell the pool is fed ``enumerate(data)``, so each ``x`` is an
    ``(index, value)`` tuple, not a list of ints as the old annotation said.
    Despite the name nothing is cubed: the tuple's text form is appended to
    ``tmp_file`` after a one-second pause and the same tuple is returned.

    Args:
        x: ``(index, value)`` pair produced by ``enumerate``.

    Returns:
        The same object that was passed in.
    """
    print(f"start process {os.getpid()}: {x}")
    result = x
    time.sleep(1)

    # Appended by whichever worker gets here first, so line order is arbitrary.
    with open(tmp_file, "a") as f:
        f.write(f"{result}\n")
    print(f"end process {os.getpid()}: {x}")

    return result
    

# imap_unordered() accepts any iterable; here it consumes enumerate(data), so
# every task receives an (index, value) tuple.
if __name__ == "__main__":
    data = list(range(12))
    # [0, 1, 2, 3, ..., 11]
    result = []
    
    with Pool(processes=4) as pool:
        for each_item in pool.imap_unordered(cube, enumerate(data)):
            result.append(each_item)
            print(f"##### current process #####: {os.getpid()}")
    
    # The pool was already terminated when the with-block exited.
    pool.join()  # wait for the worker processes to terminate

    with open(tmp_file, "r") as f:
        content = f.readlines()
    print(f"show content: {content}")
    print(f"results: {result}")
    print(f"final process: {os.getpid()}")

start process 217: (0, 0)
start process 219: (2, 2)
start process 218: (1, 1)
start process 220: (3, 3)
end process 217: (0, 0)
end process 219: (2, 2)
end process 218: (1, 1)
start process 217: (4, 4)
end process 220: (3, 3)
start process 220: (6, 6)
start process 218: (7, 7)
start process 219: (5, 5)
##### current process #####: 66
##### current process #####: 66
##### current process #####: 66
##### current process #####: 66
end process 220: (6, 6)
start process 220: (8, 8)
end process 217: (4, 4)
end process 218: (7, 7)
end process 219: (5, 5)
start process 217: (10, 10)
start process 218: (9, 9)
start process 219: (11, 11)
##### current process #####: 66
##### current process #####: 66
##### current process #####: 66
##### current process #####: 66
end process 220: (8, 8)
end process 217: (10, 10)
end process 218: (9, 9)
end process 219: (11, 11)
##### current process #####: 66
##### current process #####: 66
##### current process #####: 66
##### current process #####: 66
show con

### `imap_unordered` with `pd.DataFrame`
This is not recommended. Code is tricky


In [None]:
import pandas as pd
# Small two-column frame: column "a" is constant 2, column "b" counts 0..4.
data = {"a": [2]*5, "b": list(range(5))}
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,a,b
0,2,0
1,2,1
2,2,2
3,2,3
4,2,4


In [None]:
it = df.iterrows()
it

<generator object DataFrame.iterrows at 0x7f00a4b6c450>

In [None]:
tmp = next(it)[1]
tmp

a    2
b    0
Name: 0, dtype: int64

In [None]:
print(isinstance(tmp, pd.Series))
type(tmp)

True


pandas.core.series.Series

In [None]:
import os
import time
from typing import List, Tuple
from multiprocessing import Pool

import pandas as pd

tmp_file = "cube.txt"

# Remove output left over from an earlier run; a missing file is not an error.
try:
    os.remove(tmp_file)
except OSError:
    pass


def cube(x: Tuple[int, pd.Series]):
    """Multiply columns ``a`` and ``b`` of one DataFrame row and log the product.

    ``DataFrame.iterrows()`` yields ``(index, row)`` tuples, so ``x`` is that
    pair and not a bare Series as the old annotation claimed. The row is kept
    in its own variable instead of rebinding ``x`` to a different type.

    Args:
        x: ``(index, row)`` pair; ``row`` must expose ``.a`` and ``.b``.

    Returns:
        None. The product is only appended to ``tmp_file``.
    """
    print(f"start process {x}")
    row = x[1]
    result = row.a * row.b
    time.sleep(1)
    print(f"end process {row}")

    # save order won't be guaranteed
    with open(tmp_file, "a") as f:
        f.write(f"{result}\n")

# Feed DataFrame rows to the pool: iterrows() is a generator of (index, row)
# tuples, and imap_unordered() consumes it like any other iterable.
if __name__ == "__main__":
    data = {"a": [2]*5, "b": list(range(5))}
    df = pd.DataFrame.from_dict(data)
    
    with Pool(processes=4) as pool:

      for each_item in pool.imap_unordered(cube, df.iterrows()):  # using iterrows()
          each_item  # no-op expression; cube() in this cell returns None
    # pool.join()
    print("this should be 2nd last line")

    with open(tmp_file, "r") as f:
        content = f.readlines()
    print(f"show content: {content}")


start process (0, a    2
b    0
Name: 0, dtype: int64)
start process (2, a    2
b    2
Name: 2, dtype: int64)start process (1, a    2
b    1
Name: 1, dtype: int64)start process (3, a    2
b    3
Name: 3, dtype: int64)


end process a    2
b    0
Name: 0, dtype: int64
start process (4, a    2
b    4
Name: 4, dtype: int64)
end process a    2
b    1
Name: 1, dtype: int64
end process a    2
b    2
Name: 2, dtype: int64end process a    2
b    3
Name: 3, dtype: int64

end process a    2
b    4
Name: 4, dtype: int64
this should be 2nd last line
show content: ['0\n', '2\n', '4\n', '6\n', '8\n']


In [None]:
import pandas as pd
# Rebuild the same two-column frame: "a" is constant 2, "b" counts 0..4.
data = {"a": [2]*5, "b": list(range(5))}
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,a,b
0,2,0
1,2,1
2,2,2
3,2,3
4,2,4


In [None]:
# DataFrame.items() iterates column-wise: `a` is the column label and `b` is
# that column as a Series. Both names keep their last values after the loop.
for a, b in df.items():
    print(a)
    print(b)

a
0    2
1    2
2    2
3    2
4    2
Name: a, dtype: int64
b
0    0
1    1
2    2
3    3
4    4
Name: b, dtype: int64


In [None]:
b

0    0
1    1
2    2
3    3
4    4
Name: b, dtype: int64

In [None]:
a

'b'

## Multi-process data sharing and communication

In [None]:
from multiprocessing import Process, Queue
import os, time, random


# Code executed by the writer process:
def write(q):
    """Put 'A', 'B' and 'C' on ``q``, pausing a random sub-second time after each."""
    print('Process to write: {}'.format(os.getpid()))
    letters = ['A', 'B', 'C']
    for letter in letters:
        print('Put %s to queue...' % letter)
        q.put(letter)
        pause = random.random()
        time.sleep(pause)

# Code executed by the reader process:
def read(q):
    """Print every value that arrives on ``q``, forever.

    The loop never exits by itself; the caller has to terminate the process.
    """
    print('Process to read:{}'.format(os.getpid()))
    while True:
        # Blocking get: waits until the next item is available.
        print('Get %s from queue.' % q.get(True))

In [None]:
if __name__=='__main__':
    # The parent creates the Queue and passes it to each child process:
    q = Queue()
    pw = Process(target=write, args=(q,))
    pr = Process(target=read, args=(q,))
    # Start the writer child process pw:
    pw.start()
    # Start the reader child process pr:
    pr.start()
    # Wait for pw to finish:
    pw.join()
    # pr runs an endless loop, so it cannot be joined; terminate it forcibly:
    pr.terminate()

Process to write: 623
Process to read:624
Put A to queue...
Get A from queue.
Put B to queue...
Get B from queue.
Put C to queue...
Get C from queue.


Example

In [None]:
from multiprocessing import cpu_count, Pool, Process, Queue

# Module-level state shared by producer(), worker() and receiver():
# `l` is the input data, q1 carries inputs to worker(), q2 carries its results.
l = list(range(20))
q1 = Queue()
q2 = Queue()

def producer(q1, l):
  """Put every element of ``l`` on ``q1`` in order, then print a done marker."""
  enqueue = q1.put
  for item in l:
    enqueue(item)
  print("producer done")

def long_time_task(i):
  """Print the PID of the executing process and return ``i`` to the tenth power."""
  print(f"pid: {os.getpid()}")
  tenth_power = pow(i, 10)
  return tenth_power

def worker():
  """Drain ``q1``, run ``long_time_task`` on each item in a pool, feed ``q2``.

  Reads the module-level queues ``q1`` (inputs) and ``q2`` (outputs). Every
  item currently on ``q1`` is submitted to a pool with one worker per CPU; the
  results are then printed and put on ``q2`` in submission order.

  Changes from the earlier version:
    * all tasks are submitted before any result is awaited, so the pool can
      overlap them (``.get()`` right after ``apply_async()`` ran them serially);
    * each result is fetched once instead of calling ``.get()`` twice;
    * the local no longer shadows the builtin ``input``;
    * the outer ``for i in range(5)`` loop is gone: after the first pass the
      queue was already empty, so the remaining passes did nothing;
    * the pool is closed and joined even if a task raises.

  Raises:
    Whatever ``long_time_task`` raises, re-raised by ``AsyncResult.get()``.
  """
  p = Pool(cpu_count())
  print(f"#cpus: {cpu_count()}")
  try:
    # NOTE(review): qsize() is documented as approximate and is not
    # implemented on every platform (e.g. macOS); kept to preserve the
    # original drain condition.
    pending = []
    while q1.qsize() > 0:
      item = q1.get()
      pending.append(p.apply_async(long_time_task, args=(item,)))

    for async_result in pending:
      value = async_result.get()
      print(f"res: {value}")
      q2.put(value)

    print('等待所有子进程完成。')
  finally:
    p.close()
    p.join()
  print("worker done")

def receiver(q2):
  """Collect everything currently on ``q2`` into a list, in queue order.

  Stops as soon as ``q2.qsize()`` reports no remaining items.
  """
  collected = []
  while True:
    if q2.qsize() <= 0:
      return collected
    collected.append(q2.get())

In [None]:
producer(q1, l)

producer done


In [None]:
worker()

pid: 1126
pid: 1127
pid: 1126
pid: 1127
pid: 1126
pid: 1127
pid: 1126
pid: 1127
pid: 1126
pid: 1127
pid: 1126
pid: 1127
pid: 1126
pid: 1127
pid: 1126
pid: 1127
pid: 1126
pid: 1127
pid: 1126
pid: 1127
#cpus: 2
res: 0
res: 1
res: 1024
res: 59049
res: 1048576
res: 9765625
res: 60466176
res: 282475249
res: 1073741824
res: 3486784401
res: 10000000000
res: 25937424601
res: 61917364224
res: 137858491849
res: 289254654976
res: 576650390625
res: 1099511627776
res: 2015993900449
res: 3570467226624
res: 6131066257801
等待所有子进程完成。
worker done


In [None]:
worker()

#cpus: 2
等待所有子进程完成。
worker done


In [None]:
res = receiver(q2)

In [None]:
res

[0,
 1,
 1024,
 59049,
 1048576,
 9765625,
 60466176,
 282475249,
 1073741824,
 3486784401,
 10000000000,
 25937424601,
 61917364224,
 137858491849,
 289254654976,
 576650390625,
 1099511627776,
 2015993900449,
 3570467226624,
 6131066257801]

## Multi-processing Quick Implementation
- [How to utilize all cores with python multiprocessing](https://stackoverflow.com/a/19098791)
- [imap_unordered](https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool.imap_unordered)


In [None]:
import multiprocessing
from multiprocessing import cpu_count

def get_reconciliation_state_attributes(single_input):
  """Placeholder worker function: return ``single_input`` unchanged."""
  return single_input

# Minimal pattern: fan a list of inputs out over one worker per CPU and
# collect the results as they complete.
events = []
grouping_messages = [1, 2, 3]

with multiprocessing.Pool(cpu_count()) as pool:
  # imap_unordered() yields in completion order, so `events` may come back in
  # a different order than `grouping_messages`.
  for event in pool.imap_unordered(get_reconciliation_state_attributes, grouping_messages):
    events.append(event)

print(events)

[1, 2, 3]


# Examples

## IntToBitArrayConverter

Single Process way:

In [None]:
from typing import Dict

import numpy as np
import random


class IntToBitarrayConverter():
    """Convert ints to an array of their binary digits (0s and 1s).

    The int -> bitstring step is not computed here; it is looked up in a cache
    that the caller supplies through ``set_bitstring_cache``.
    """

    def set_bitstring_cache(self, bitstring_cache: Dict):
        """Store the precomputed int -> bitstring mapping on this instance.

        The mapping is built outside the class, e.g. ``{12: format(12, "b")}``.
        Because it lives on the instance, it travels with any bound method of
        this object that is sent to a worker process.
        """
        self.bitstring_cache = bitstring_cache

    def convert(self, integer: int) -> np.ndarray:
        """Look up ``integer`` in the cache and return its bits as an array.

        Raises:
            KeyError: if ``integer`` is not in the cache.
            AttributeError: if ``set_bitstring_cache`` was never called.
        """
        bitstring = self.bitstring_cache[integer]

        return self._bitstring_to_ndarray(bitstring)

    @staticmethod
    def _bitstring_to_ndarray(bitstring) -> np.ndarray:
        """Turn a string of '0'/'1' characters into a uint8 array of 0/1.

        Example: 12 -> '1100' -> array([1, 1, 0, 0], dtype=uint8).

        ``np.frombuffer`` replaces the deprecated binary mode of
        ``np.fromstring``. It needs a bytes-like object, so ``str`` input is
        encoded first; bytes input is accepted as-is.
        """
        raw = bitstring.encode("ascii") if isinstance(bitstring, str) else bitstring
        # ASCII '0' is 48, so subtracting 48 maps '0'/'1' to 0/1. The
        # subtraction also yields a fresh, writable array.
        arr = (np.frombuffer(raw, dtype='u1') - 48)
        return arr

In [None]:
# binary format
bitstring = format(12, "b")

# np.fromstring's binary mode is deprecated (it emits a DeprecationWarning);
# np.frombuffer reads the same bytes. The result holds the ASCII codes of the
# characters ('1' -> 49, '0' -> 48) and is a read-only view of the bytes.
arr = np.frombuffer(bitstring.encode("ascii"), dtype="u1")

print(bitstring)
arr

1100


  after removing the cwd from sys.path.


array([49, 49, 48, 48], dtype=uint8)

In [None]:
%%time
# CACHE_SIZE is the number of cache entries; "40 MB" is the author's estimate
# of the resulting dict's size.
CACHE_SIZE = 1024 * 1024  # 40 MB
ITER_SIZE = 1_000_000  # 1 million

int_to_bitarr_converter = IntToBitarrayConverter()

# Precompute the bitstring of every int in [0, CACHE_SIZE),
# eg. {12: "1100"}
int_to_bitarr_converter.set_bitstring_cache(
    {key: format(key, 'b') for key in range(CACHE_SIZE)}
)

# Single-process baseline: convert ITER_SIZE random ints with the builtin map().
ndarray_bitarr_ls = list(
    map(int_to_bitarr_converter.convert, 
        (
            random.randint(0, CACHE_SIZE - 1) for _ in range(ITER_SIZE)
        )
    )
)



CPU times: user 6.34 s, sys: 108 ms, total: 6.44 s
Wall time: 6.49 s


### Inefficient Multiprocess way


In [None]:
%%time
from multiprocessing import Pool

# CACHE_SIZE is the number of cache entries; "40 MB" is the author's estimate
# of the resulting dict's size.
CACHE_SIZE = 1024 * 1024  # 40 MB
ITER_SIZE = 1000000  # 1 million

int_to_bitarr_converter = IntToBitarrayConverter()
int_to_bitarr_converter.set_bitstring_cache(
    {key: format(key, 'b') for key in range(CACHE_SIZE)}
)  # the dict is stored on the instance; every subprocess ends up with a copy, and it is serialized/deserialized many times along the way

with Pool() as pool:
    # This is slow: the bound method (and the cache attached to its instance)
    # is serialized and deserialized over and over.
    ndarray_bitarr_ls = pool.map(
        int_to_bitarr_converter.convert, 
        (
            random.randint(0, CACHE_SIZE - 1) for _ in range(ITER_SIZE)
        )
    )



CPU times: user 9.97 s, sys: 610 ms, total: 10.6 s
Wall time: 16.4 s


In [None]:
len(ndarray_bitarr_ls)

1000000

Under the hood, our call to `pool.map(...)` does the following (based on the CPython [source code](https://github.com/python/cpython/blob/main/Lib/multiprocessing/pool.py)):

1. Initializes three queues:

    1. The `taskqueue` which holds tuple of tasks: `(result_job, func, (x,), {})`.

        - We only care about `(x,)` above. This holds our function `convert()`, and a **chunk** of elements from our iterable.

    2. The `inqueue`, which holds **serialized (pickled)** `tasks`.
    3. The `outqueue`, which will hold **serialized (pickled)** return values of each `task`.
2. Creates a pool of “worker” `Processes`, which are responsible for:
    1. Removing tasks from the `inqueue`, which are deserialized, and executing the task.
    2. Executing each `task`, and sending the results to the `outqueue`, where it is **serialized** and stored.
3. Creates three threads which manage the above three queues:
    1. The `_task_handler` which populates the `inqueue` with **pickled** `task` objects, from the `taskqueue`
    2. The `_worker_handler` which “reuses” workers by re-creating them once their work is done.
    3. The `_result_handler` which “removes” elements off of the `outqueue`, which are **deserialized**, and returned to your parent process call to `Pool.map()`.

Re-read the above again and note everywhere you read `serialize`, `deserialize` or `pickle`. Objects must be `serialized` to a byte string (`bytes`) before being shuttled to each process, and then that process must `deserialize` those bytes to re-create the object. This needs to happen on the return journey of the data also. That’s **2 calls to pickle.dumps()** and **2 calls to pickle.loads()** per task!

When you have an **instance method** with a large object bound to it, passing this method to `Pool.map(...)` results in a **huge** performance loss due to repeated serializing/deserializing of the large object between processes.

### Efficient Global Solution
**Using Class**

We inherit from IntToBitarrayConverter, and make three changes:

1. `bitstring_cache` becomes a **class attribute**
2. `convert()` becomes an `@classmethod`
3. `set_bitstring_cache()` becomes an `@classmethod`

>**Note**: What we’ve done here is essentially bundle 2 global functions, with 1 global variable, and co-locate them within the same class. This is quietly approaching the **singleton** anti-pattern, but I prefer the encapsulation here over **globals**, despite it being a dangerous facade…

In [None]:
from typing import Dict

import numpy as np
import random


class IntToBitarrayConverter():
    """Convert ints to an array of their binary digits (0s and 1s).

    The int -> bitstring step is looked up in a caller-supplied cache instead
    of being computed on each call.
    """

    def set_bitstring_cache(self, bitstring_cache: Dict):
        """Store the precomputed int -> bitstring mapping on this instance.

        The mapping is built outside the class, e.g. ``{12: format(12, "b")}``.
        Because it lives on the instance, it travels with any bound method of
        this object that is sent to a worker process.
        """
        self.bitstring_cache = bitstring_cache

    def convert(self, integer: int) -> np.ndarray:
        """Look up ``integer`` in the cache and return its bits as an array.

        Raises:
            KeyError: if ``integer`` is not in the cache.
            AttributeError: if ``set_bitstring_cache`` was never called.
        """
        bitstring = self.bitstring_cache[integer]

        return self._bitstring_to_ndarray(bitstring)

    @staticmethod
    def _bitstring_to_ndarray(bitstring) -> np.ndarray:
        """Turn a string of '0'/'1' characters into a uint8 array of 0/1.

        Example: 12 -> '1100' -> array([1, 1, 0, 0], dtype=uint8).

        ``np.frombuffer`` replaces the deprecated binary mode of
        ``np.fromstring``. It needs a bytes-like object, so ``str`` input is
        encoded first; bytes input is accepted as-is.
        """
        raw = bitstring.encode("ascii") if isinstance(bitstring, str) else bitstring
        # ASCII '0' is 48, so subtracting 48 maps '0'/'1' to 0/1. The
        # subtraction also yields a fresh, writable array.
        arr = (np.frombuffer(raw, dtype='u1') - 48)
        return arr

class ClassMethodBitarrayConverter(IntToBitarrayConverter):
    """Variant of the converter that keeps its cache on the class itself.

    Both ``set_bitstring_cache`` and ``convert`` are classmethods, so neither
    needs an instance: the cache is read from and written to the class.
    """

    # int -> bitstring mapping; stays None until set_bitstring_cache is called.
    bitstring_cache = None

    @classmethod
    def set_bitstring_cache(cls, bitstring_cache: Dict):
        """Attach the precomputed int -> bitstring mapping to the class."""
        cls.bitstring_cache = bitstring_cache

    @classmethod
    def convert(cls, integer: int, init_return=None) -> np.ndarray:
        """Look up ``integer`` in the class-level cache and return its bits.

        Args:
            integer: Key to look up in ``bitstring_cache``.
            init_return: Accepted but not used by this method.

        Returns:
            The array produced by ``_bitstring_to_ndarray`` for the cached
            bitstring.
        """
        cached_bits = cls.bitstring_cache[integer]
        bit_array = cls._bitstring_to_ndarray(cached_bits)
        return bit_array

In [None]:
%%time
from multiprocessing import Pool

# Benchmark: map the classmethod converter over random ints with a process pool.
# Number of entries in the int -> bitstring cache (1,048,576).
# NOTE(review): the original note sized this at "40 MB"; not verified here.
CACHE_SIZE = 1024 * 1024
ITER_SIZE = 1_000_000  # number of random integers to convert (1 million)

# int_to_bitarr_converter = ClassMethodBitarrayConverter()
# No instance is created: the dict is stored as a class attribute.
ClassMethodBitarrayConverter.set_bitstring_cache(
    {key: format(key, 'b') for key in range(CACHE_SIZE)}
)

# Pool() with no arguments lets multiprocessing choose the number of workers.
with Pool() as pool:
    # The mapped callable is a classmethod, so the cache dict is not bound to
    # an instance that has to accompany each task.
    ndarray_bitarr_ls = pool.map(
        ClassMethodBitarrayConverter.convert, 
        (
            # Random keys drawn from the cached range [0, CACHE_SIZE - 1].
            random.randint(0, CACHE_SIZE - 1) for _ in range(ITER_SIZE)
        )
    )



CPU times: user 4.55 s, sys: 286 ms, total: 4.84 s
Wall time: 12.2 s


## Queue and Context Manager

In [None]:
from datetime import datetime
import logging
from multiprocessing import Manager, Queue, Process
from typing import Any, Dict, Generator, Union
import uuid

import numpy as np


# Module-level logger used by the event helpers and the worker processes.
log = logging.getLogger(__name__)

# Nested type aliases describing an event, listed innermost first. Each level
# is a str-keyed dict wrapping the level above it.
# NOTE(review): the level names are taken from the alias names only; confirm
# against the real event schema.
AttributeValue = Dict[str, Any]
AttributeName = Dict[str, AttributeValue]
Group = Dict[str, AttributeName]
Namespace = Dict[str, Group]
# Top-level event: each value is either a plain string field or a Namespace.
Event = Dict[str, Union[str, Namespace]]

# Marker put on the queue to tell a worker to stop consuming.
SENTINEL = "END"


def validate(event: dict) -> dict:
    """Validate an event against the schema defined in schema.json.

    NOTE: no schema check is performed in this version; every event is
    accepted and handed back unchanged.

    Args:
        event: Unvalidated event.

    Returns:
        The same event object, so the call matches its ``-> dict``
        annotation.

    Raises:
        Nothing in this version. NOTE(review): the original docstring names
        JsonSchemaException for invalid events; confirm once real validation
        is added.
    """
    return event


class ValidationError(Exception):
    """Validation failure that reports the details of its chained cause.

    When raised with ``raise ValidationError(...) from err``, both ``message``
    and ``str()`` describe ``err`` rather than this wrapper.
    """

    @property
    def message(self):
        """Message of the chained cause, or ``""`` when there is no cause.

        Causes that do not expose a ``message`` attribute (e.g. built-in
        exceptions) fall back to ``str(cause)`` instead of raising
        ``AttributeError`` from inside the property.
        """
        cause = self.__cause__
        if cause is None:
            return ""
        cause_message = getattr(cause, "message", None)
        if cause_message is None:
            return str(cause)
        return cause_message

    def __str__(self):
        """Return the cause's text when chained, else the default text."""
        cause = self.__cause__
        if cause is None:
            return super().__str__()
        return str(cause)


def generate_payload(attributes: Event) -> Event:
    """Wrap ``attributes`` in the standard event envelope.

    The envelope supplies ``_dt`` (current local time, ISO format), ``_uuid``
    (a fresh random UUID) and fixed ``_version``, ``_source``, ``_type`` and
    ``_status`` fields. Keys present in ``attributes`` take precedence over
    the envelope defaults.

    Args:
        attributes: Event fields to merge on top of the envelope.

    Returns:
        A new dict containing the envelope fields overlaid with
        ``attributes``.
    """
    envelope = {
        "_dt": datetime.now().isoformat(),
        "_uuid": str(uuid.uuid4()),
        "_version": "2.0.0",
        "_source": "EVENT_SOURCE",
        "_type": "reconciliation.pair",
        "_status": "v",
    }
    # Caller-supplied fields are unpacked last so they override the defaults.
    return {**envelope, **attributes}

def events_generator(attributes_batch) -> Generator:
    """Yield an event payload for every usable entry of a batch.

    Falsy entries (``None``, empty dicts) are skipped. Each remaining entry
    is wrapped by ``generate_payload``, printed, and passed to ``validate``;
    payloads that fail validation are logged and dropped.

    Args:
        attributes_batch: Iterable of attribute dicts; may contain falsy
            placeholders.

    Yields:
        The payload of each entry that passed validation.
    """
    for attributes in attributes_batch:
        if not attributes:
            # Nothing to send for an empty/None placeholder.
            continue
        payload = generate_payload(attributes)
        print(f"payload: {payload}")
        try:
            validate(payload)
        except ValidationError:
            # Pass the payload as a logging argument so interpolation is
            # deferred to the logging framework instead of done eagerly.
            log.error("invalid events: %s", payload)
            continue
        yield payload


class AsyncFireman:
    """Process event batches put on a queue using background worker processes.

    The queue is bounded (``maxsize=num_workers``) to rate limit the producer
    and cap its memory usage.

    Use this as a context manager to make sure that resources are cleaned up
    properly: entering starts the workers, exiting sends one stop marker per
    worker and waits for the queue to drain and the workers to finish.

    NOTE(review): the original description says events are sent to s3; the
    worker shown here only builds and validates the events, so the upload step
    is assumed to live elsewhere.
    """

    def __init__(self, num_workers):
        """Create the shared queue; no process is started until ``__enter__``.

        Args:
            num_workers: Number of worker processes, also used as the queue's
                maximum size.
        """
        self.num_workers = num_workers
        self._workers = []
        # A manager queue is a proxy to a queue.Queue, which provides the
        # task_done()/join() calls used below.
        self.queue = Manager().Queue(maxsize=num_workers)

    def __enter__(self):
        """Create and start ``num_workers`` daemon worker processes."""
        for _ in range(self.num_workers):
            self._workers.append(Process(target=self._worker_func))

        log.info("Starting %s fireman workers", self.num_workers)

        for worker in self._workers:
            # Daemon processes are terminated when the parent process exits.
            worker.daemon = True
            worker.start()

        return self

    def _worker_func(self):
        """Consume attribute batches from the queue until SENTINEL arrives.

        Each batch is turned into events by ``events_generator`` and the
        generator is drained. A failure while handling one batch is logged
        and the worker moves on to the next one.
        """
        for attributes_batch in iter(self.queue.get, SENTINEL):  # type: ignore
            try:
                events = events_generator(attributes_batch)
                print(f"events: {events}")
                # Drain the generator so every event is built and validated.
                list(events)
            except Exception:
                # Keep the worker alive: if it died here it would never
                # consume its SENTINEL and __exit__ would block on
                # queue.join().
                log.exception("Fireman worker failed to process a batch")
            finally:
                # Always balance the get() so queue.join() can return.
                self.queue.task_done()

        log.info("Fireman worker terminated")
        # Account for the SENTINEL item that ended the loop above.
        self.queue.task_done()

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop all workers and wait for them to finish.

        Returns None, so an exception raised inside the ``with`` body is not
        suppressed.
        """
        # One stop marker per worker; each worker consumes exactly one.
        for _ in range(self.num_workers):
            self.queue.put(SENTINEL)

        # Wait until every queued item, markers included, is marked done.
        self.queue.join()

        for worker in self._workers:
            worker.join()
        log.info("All fireman workers terminated")

    # def send_matching_events(
    #     self, matchers, ids_all_partition_pairs: List[Tuple[np.ndarray, np.ndarray]]
    # ):
    #     """Tells the matchers which queue to put event batches on and asks matchers in turn
    #     to start sending events. Then the idle fireman workers will fetch items from the
    #     queue and send to s3.
    #     """
    #     for matcher in matchers:
    #         matcher.event_queue = self.queue

    #     with mp.Pool(processes=settings.NUM_PROCESSING_WORKERS) as pool:
    #         for matcher in matchers:
    #             pool.map(
    #                 matcher.send_matching_events, ids_all_partition_pairs,
    #             )


In [None]:
# Sample batch: a one-element tuple holding a single event's attributes.
attrs = (
    {
        "_dt": "2020-01-01T13:14:15.123456Z",
        "_uuid": "00000000-0000-4000-8000-000000000000",
        "_version": "1.0.0",
        "_source": "figueres",
        "_type": "test_type",
        "id": "retailer_product:1-retailer_product:2",
        "entity_type": "retailer_product",
        "id_a": "1",
        "id_b": "2",
        "live": {
            "core": {
                "duplicate": {
                    "model": {
                        "score": 0.98,
                        "source": "image_baseline_v2",
                        "dt": "2021-11-13T20:20:39+00:00",
                    }
                }
            }
        },
    },
)

# Two workers consume the batches; leaving the block stops and joins them.
with AsyncFireman(num_workers=2) as af:
    # First batch: the one-event tuple as-is.
    af.queue.put(attrs)
    # Second batch: the same event twice plus a None entry.
    af.queue.put((attrs[0], attrs[0], None))


events: <generator object events_generator at 0x7f764f14de50>
payload: {'_dt': '2020-01-01T13:14:15.123456Z', '_uuid': '00000000-0000-4000-8000-000000000000', '_version': '1.0.0', '_source': 'figueres', '_type': 'test_type', '_status': 'v', 'id': 'retailer_product:1-retailer_product:2', 'entity_type': 'retailer_product', 'id_a': '1', 'id_b': '2', 'live': {'core': {'duplicate': {'model': {'score': 0.98, 'source': 'image_baseline_v2', 'dt': '2021-11-13T20:20:39+00:00'}}}}}
events: <generator object events_generator at 0x7f764f14de50>
payload: {'_dt': '2020-01-01T13:14:15.123456Z', '_uuid': '00000000-0000-4000-8000-000000000000', '_version': '1.0.0', '_source': 'figueres', '_type': 'test_type', '_status': 'v', 'id': 'retailer_product:1-retailer_product:2', 'entity_type': 'retailer_product', 'id_a': '1', 'id_b': '2', 'live': {'core': {'duplicate': {'model': {'score': 0.98, 'source': 'image_baseline_v2', 'dt': '2021-11-13T20:20:39+00:00'}}}}}
payload: {'_dt': '2020-01-01T13:14:15.123456Z', 