# Parallelism

In [1]:
import os
from io import StringIO

import numpy as np
import pandas as pd

c:\programas\anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
c:\programas\anaconda3\lib\site-packages\numpy\.libs\libopenblas.TXA6YQSD3GCQQC22GEQ54J2UDCXDXHWN.gfortran-win_amd64.dll
  stacklevel=1)


# Silly example

In [2]:
import time

def my_sleep(x):
    """Pause for ``x`` seconds and hand ``x`` back to the caller.

    A message is printed before the pause and another one after it, so the
    order in which calls start and finish is visible in the cell output.
    Relies on the module-level ``time`` import.

    Parameters
    ----------
    x : number of seconds handed to ``time.sleep``.

    Returns
    -------
    The same ``x`` that was passed in.
    """
    seconds = x
    print(f'Sleeping for {seconds} seconds.')
    time.sleep(seconds)
    print(f'Returning {seconds}')
    return seconds

In [4]:
my_sleep(5)

Sleeping for 5 seconds.
Returning 5


5

In [3]:
my_list = [1,2,3,4,5,6]

In [6]:
sum(my_list)

21

In [4]:
from tqdm.auto import tqdm

## Serial code

In [8]:
for item in tqdm(my_list):
    my_sleep(item)

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

Sleeping for 1 seconds.
Returning 1
Sleeping for 2 seconds.
Returning 2
Sleeping for 3 seconds.
Returning 3
Sleeping for 4 seconds.
Returning 4
Sleeping for 5 seconds.
Returning 5
Sleeping for 6 seconds.
Returning 6



In [9]:
map(my_sleep, my_list)

<map at 0x1d7174c7e10>

In [10]:
list(map(my_sleep, my_list))

Sleeping for 1 seconds.
Returning 1
Sleeping for 2 seconds.
Returning 2
Sleeping for 3 seconds.
Returning 3
Sleeping for 4 seconds.
Returning 4
Sleeping for 5 seconds.
Returning 5
Sleeping for 6 seconds.
Returning 6


[1, 2, 3, 4, 5, 6]

## Parallel code

In [5]:
from multiprocessing import Pool, cpu_count

cpu_count()

6

## You have to create a pool of `n` processes.

In [14]:
pool = Pool(processes=6)

### We'll use a magic function (such as `%%time`) here to measure the speed of this code in parallel.

However, if you run this code, watch what happens:

In [None]:
pool.map(my_sleep, my_list)

## This happens because `multiprocessing` does not always work in Jupyter Notebooks.

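_Some versions of Linux or macOS may handle it well (yay Unix)_, but it certainly doesn't work on Windows. There, each worker process is started fresh and has to import the function it runs, so a function defined only inside the notebook cannot be found.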

### What should we do then? Two solutions.

1. We have to write our functions inside a `.py` file.

2. Install `multiprocess` (note it is different from Python's `multiprocessing` module)

In [8]:
from sleeper import my_sleep_from_file

In [8]:
pool = Pool(processes=6)

In [10]:
my_list

[1, 2, 3, 4, 5, 6]

In [9]:
pool.map(my_sleep_from_file, my_list)

[1, 2, 3, 4, 5, 6]

In [11]:
pool.terminate()

In [9]:
pool = Pool(processes=2)

In [10]:
result = pool.map(my_sleep_from_file, my_list)
pool.terminate()

In [11]:
result

[1, 2, 3, 4, 5, 6]

`!pip install multiprocess`

## Using multiprocess


In [12]:
# using multiprocess instead of multiprocessing
from multiprocess import Pool

In [13]:
pool = Pool(processes=6)

In [14]:
result = pool.map(my_sleep, [1,2,3,4,5,6])
pool.terminate()

NameError: name 'time' is not defined

In [15]:
def my_sleep(x):
    """Wait ``x`` seconds, reporting before and after, then return ``x``.

    Self-contained variant for use with ``multiprocess``: everything the
    function needs is imported inside its own body.

    Parameters
    ----------
    x : number of seconds to sleep.

    Returns
    -------
    The same ``x`` that was passed in.
    """
    # Local import on purpose: the version that relied on a notebook-level
    # ``time`` failed inside the pool with "NameError: name 'time' is not defined".
    from time import sleep

    seconds = x
    print(f'Sleeping for {seconds} seconds.')
    sleep(seconds)
    print(f'Returning {seconds}')
    return seconds

In [20]:
pool = Pool(processes=6)

In [21]:
result = pool.map(my_sleep, [1,2,3,4,5,6,7,8])
pool.terminate()

# Running Asynchronous code

## What is asynchrony?

- `result.ready()`
- `result.wait()`
- `result.get()`

In [22]:
pool = Pool(processes=6)

In [23]:
result = pool.map_async(my_sleep, [60, 60, 60, 60, 60, 60])

In [42]:
result.successful()

True

In [39]:
result.ready()

True

In [38]:
result.wait()

In [40]:
result.get()

[60, 60, 60, 60, 60, 60]

In [43]:
result = pool.map_async(my_sleep, [60, 60, 60, 60, 60, 60])

In [47]:
result.ready()

False

In [48]:
# While the asynchronous map is running, the notebook is free to do other work.
print(
    'Do something that doesn"t depend on result',
    '...',
    'Now the time came when the result is needed.',
    sep='\n',
)

# Block until the map has finished, then collect the values and shut the pool down.
result.wait()
result_list = result.get()
pool.terminate()

print(f'Now go on and use the results obtained - {result_list}')

Do something that doesn"t depend on result
...
Now the time came when the result is needed.
Now go on and use the results obtained - [60, 60, 60, 60, 60, 60]


# CPU intensive computations

In [49]:
def square(x):
    """Return ``x`` raised to the power of two."""
    return pow(x, 2)

In [52]:
n = 1000000

In [55]:
random_numbers = np.random.random(size=n)

In [56]:
result = [square(item) for item in random_numbers]

In [57]:
pool = Pool(processes=6)

In [58]:
result = pool.map(square, random_numbers)
pool.terminate()

In [None]:
# GIL - global interpreter lock

## Usually, for CPU intensive computations, Pool.map won't speed up your code.

Why? Each task here is tiny, so the pool will spend more time managing processes, replicating data and sending it to the other processes than actually computing.



# When is multiprocess useful then? 


## I/O bound computations

In [59]:
import pandas as pd

In [60]:
import requests

In [64]:
# Fetch one results page (offset 100) and stack every HTML table found in it.
# The timeout keeps the cell from hanging forever if the site never answers.
response = requests.get('https://www.hltv.org/results?offset=100', timeout=30)
# Stop on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
# read_html expects a path, URL or file-like object; wrap the HTML text accordingly.
df = pd.concat(pd.read_html(StringIO(response.text)))
df

Unnamed: 0,0,1,2,3,4
0,Sprout,2 - 0,AGF,Nine to Five 4,bo3
0,Gambit Youngsters,2 - 1,Hard Legion,LOOT.BET Season 7,bo3
0,Espada,1 - 2,Secret,Nine to Five 4,bo3
0,AVEZ,2 - 0,SG.pro,Nine to Five 4,bo3
0,Bantz,6 - 16,VERTEX,ESEA MDL Season 35 Australia,nuke
...,...,...,...,...,...
0,Sixth Gear,6 - 16,TeamOne,ESEA MDL Season 35 North America,inf
0,Isurus,16 - 6,Keyd,CLUTCH Season 3,ovp
0,Rugratz,8 - 16,Mythic,ESEA MDL Season 35 North America,ovp
0,RBG,11 - 16,timbermen,ESEA MDL Season 35 North America,ovp


In [68]:
# NOTE(review): colnames is not used by any visible cell - confirm before removing.
colnames = ['team_a','score_a','score_b','team_b','event','stars']
my_range = range(550)

# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs('tmp', exist_ok=True)

# Serial baseline: fetch one results page per iteration and store it as a CSV file.
for i in tqdm(my_range):
    # The timeout avoids blocking the whole loop on a single unanswered request.
    response = requests.get(f'https://www.hltv.org/results?offset={i * 100}', timeout=30)
    # Stop on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    # read_html expects a path, URL or file-like object; wrap the HTML text accordingly.
    df = pd.concat(pd.read_html(StringIO(response.text)))
    df.to_csv(f'tmp/results_{i}.csv', index=False, sep=',')
    
    

SyntaxError: 'return' outside function (<ipython-input-68-69d6385309da>, line 10)

In [69]:
def download(i, timeout=30):
    """Download results page ``i``, save it as CSV and return it as a DataFrame.

    The page is requested with ``offset = i * 100``; every HTML table found in
    the response is concatenated into one frame, which is written to
    ``tmp/results_{i}.csv``.

    Parameters
    ----------
    i : page index; multiplied by 100 to build the ``offset`` query value.
    timeout : seconds to wait for the server before giving up (default 30).

    Returns
    -------
    The concatenated DataFrame that was written to disk.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    """
    # Imports stay inside the function so it is self-contained when it is
    # executed by a pool worker.
    import os
    from io import StringIO

    import pandas as pd
    import requests

    # Without a timeout a single unanswered request would block its worker forever.
    response = requests.get(f'https://www.hltv.org/results?offset={i * 100}', timeout=timeout)
    # Stop on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()

    # read_html expects a path, URL or file-like object; wrap the HTML text accordingly.
    df = pd.concat(pd.read_html(StringIO(response.text)))

    # to_csv does not create missing folders; exist_ok makes this safe across workers.
    os.makedirs('tmp', exist_ok=True)
    df.to_csv(f'tmp/results_{i}.csv', index=False, sep=',')

    return df

In [70]:
pool = Pool(processes=6)
results = pool.map_async(download, my_range)

In [99]:
results.ready()

True

In [98]:
import os

len(os.listdir('tmp'))

550

In [101]:
pd.concat(results.get())

Unnamed: 0,0,1,2,3,4
0,G2,0 - 2,ENCE,ESL Pro League Season 12 Europe,bo3
0,BIG,1 - 2,Heroic,ESL Pro League Season 12 Europe,bo3
0,Natus Vincere,2 - 0,AGO,ESL Pro League Season 12 Europe,bo3
0,Vitality,1 - 2,Astralis,ESL Pro League Season 12 Europe,bo3
0,100 Thieves,2 - 0,Chaos,ESL Pro League Season 12 North America,bo3
...,...,...,...,...,...
0,swissRAGE,16 - 3,myRevenge,ESL Major Series Winter Season 2012,trn
0,Absolute Legends,16 - 9,Hawks,DreamHack Winter 2012,d2
0,VeryGames,16 - 3,3DMAX,DreamHack Winter 2012,nuke
0,VeryGames,16 - 12,Hawks,DreamHack Winter 2012,d2


In [102]:
pool.terminate()