# Drill - Threading



### Exercise 1
In the `data/` folder, you have 10 files that contain Shakespears sonnets. You have to gather all these files into one file `data_all.txt` using threads. Be careful, the sonnets must appear in order.

In [1]:
import os

def get_files_name():
    # get the list of text files in the directory
    path_of_the_directory = 'data'
    filename_list = []
    ext = ('.txt')
    for file in os.listdir(path_of_the_directory):
        if file.endswith(ext) & (file != 'data_all.txt')&(file !='data_part_10.txt'):
            filename_list.append(file) 
        else:
            continue

    filename_list.append('data_part_10.txt')
    return filename_list


In [2]:

def write_files(dir):
    # write the content of the files into data_all.txt file
   
    print(f"Task {dir}: starting. ")

    with open('data/data_all.txt', 'a') as af:
        with open(dir,'r') as rf:
            af.write(rf.read())
    print(f"Task {dir}: finished. ")
        

In [3]:
# 1. with out concurrency
from time import perf_counter

with open("data/data_all.txt",'w') as wf:# first delete the file incase there is some text already
        pass

start_time = perf_counter()
filename_list = get_files_name()
for filename in filename_list:
    dir = 'data/' + filename
    write_files(dir)
print(f"\nTime spent inside the loop: {perf_counter() - start_time} seconds.")   

Task data/data_part_1.txt: starting. 
Task data/data_part_1.txt: finished. 
Task data/data_part_2.txt: starting. 
Task data/data_part_2.txt: finished. 
Task data/data_part_3.txt: starting. 
Task data/data_part_3.txt: finished. 
Task data/data_part_4.txt: starting. 
Task data/data_part_4.txt: finished. 
Task data/data_part_5.txt: starting. 
Task data/data_part_5.txt: finished. 
Task data/data_part_6.txt: starting. 
Task data/data_part_6.txt: finished. 
Task data/data_part_7.txt: starting. 
Task data/data_part_7.txt: finished. 
Task data/data_part_8.txt: starting. 
Task data/data_part_8.txt: finished. 
Task data/data_part_9.txt: starting. 
Task data/data_part_9.txt: finished. 
Task data/data_part_10.txt: starting. 
Task data/data_part_10.txt: finished. 

Time spent inside the loop: 0.007544499938376248 seconds.


In [4]:
# 2. using threading
from threading import Thread
threads = list()
start_time = perf_counter()

with open("data/data_all.txt",'w') as wf:# first delete the file incase there is some text already
        pass

for filename in filename_list:
    dir = 'data/' + filename
    thread = Thread(target = write_files, args =(dir,)) # new thread will run "write_files" with argument "dir"
    threads.append(thread)# to keep track of all the treads
for thread in threads:
    thread.start()
for thread in threads:
    thread.join() # Make sure all the threads are done before continuing

    
print(f"\nTime spent inside the loop: {perf_counter() - start_time} seconds.")

Task data/data_part_1.txt: starting. 
Task data/data_part_2.txt: starting. 
Task data/data_part_3.txt: starting. 
Task data/data_part_4.txt: starting. 
Task data/data_part_5.txt: starting. 
Task data/data_part_6.txt: starting. 
Task data/data_part_1.txt: finished. 
Task data/data_part_3.txt: finished. 
Task data/data_part_2.txt: finished. 
Task data/data_part_7.txt: starting. 
Task data/data_part_4.txt: finished. 
Task data/data_part_5.txt: finished. 
Task data/data_part_6.txt: finished. 
Task data/data_part_8.txt: starting. 
Task data/data_part_9.txt: starting. 
Task data/data_part_7.txt: finished. 
Task data/data_part_10.txt: starting. 
Task data/data_part_8.txt: finished. 
Task data/data_part_10.txt: finished. 
Task data/data_part_9.txt: finished. 

Time spent inside the loop: 0.008241200004704297 seconds.


In [5]:
# 3 locked concurrency

from threading import RLock
rlock = RLock() # Needs to be outside the function. Created once, used by every thread.

def write_file_locked(dir):
    with rlock:
        print(f"Task {dir}: starting. ")
    with open('data/data_all.txt', 'a') as af:
        with open(dir,'r') as rf:
            af.write(rf.read())
    with rlock:
        print(f"Task {dir}: finishing. ")

In [6]:

start_time = perf_counter()

with open("data/data_all.txt",'w') as wf:# first delete the file incase there is some text already
        pass

threads = list()
for filename in filename_list:
    dir = "data/" + filename
    thread = Thread(target=write_file_locked, args=(dir,))
    threads.append(thread)

for thread in threads:
    thread.start()

for thread in threads:
    thread.join()
    
print(f"\nTime spent inside the loop: {perf_counter() - start_time} seconds.")  

Task data/data_part_1.txt: starting. 
Task data/data_part_2.txt: starting. 
Task data/data_part_3.txt: starting. 
Task data/data_part_4.txt: starting. 
Task data/data_part_5.txt: starting. 
Task data/data_part_6.txt: starting. 
Task data/data_part_1.txt: finishing. 
Task data/data_part_7.txt: starting. 
Task data/data_part_8.txt: starting. 
Task data/data_part_2.txt: finishing. 
Task data/data_part_3.txt: finishing. 
Task data/data_part_4.txt: finishing. 
Task data/data_part_9.txt: starting. 
Task data/data_part_5.txt: finishing. 
Task data/data_part_10.txt: starting. 
Task data/data_part_7.txt: finishing. 
Task data/data_part_6.txt: finishing. 
Task data/data_part_8.txt: finishing. 
Task data/data_part_10.txt: finishing. 
Task data/data_part_9.txt: finishing. 

Time spent inside the loop: 0.03057870001066476 seconds.


In [7]:

from multiprocessing import cpu_count

print(f"Number of CPU cores on my machine: {cpu_count()}")

Number of CPU cores on my machine: 8


In [8]:
#4 using multiprocess
from multiprocessing import Process

start_time = perf_counter()
processes = list()

with open("data/data_all.txt",'w') as wf:# first delete the file incase there is some text already
        pass

for filename in filename_list:
    dir = "/data" + filename
    print(dir)
    process = Process(target= write_files, args=(dir,))
    processes.append(process)

for process in processes:
    process.start()

for process in processes:
    process.join()

print(f"\nTime spent inside the loop: {perf_counter() - start_time} seconds.")


/datadata_part_1.txt
/datadata_part_2.txt
/datadata_part_3.txt
/datadata_part_4.txt
/datadata_part_5.txt
/datadata_part_6.txt
/datadata_part_7.txt
/datadata_part_8.txt
/datadata_part_9.txt
/datadata_part_10.txt

Time spent inside the loop: 0.19446020002942532 seconds.


In [9]:
#map workers and pools
#They are high-level tools to help you inject concurrency and parallelism into your code less painfully.

In [10]:
arguments = []
for fname in filename_list:
    arguments.append('data/'+ fname)
print(arguments)

start_time = perf_counter()
gen = map(write_files, arguments) # map is a generator (think yield)
tuple(gen) # do the actual work

print(f"\nTime spent inside the loop: {perf_counter() - start_time} seconds.")



['data/data_part_1.txt', 'data/data_part_2.txt', 'data/data_part_3.txt', 'data/data_part_4.txt', 'data/data_part_5.txt', 'data/data_part_6.txt', 'data/data_part_7.txt', 'data/data_part_8.txt', 'data/data_part_9.txt', 'data/data_part_10.txt']
Task data/data_part_1.txt: starting. 
Task data/data_part_1.txt: finished. 
Task data/data_part_2.txt: starting. 
Task data/data_part_2.txt: finished. 
Task data/data_part_3.txt: starting. 
Task data/data_part_3.txt: finished. 
Task data/data_part_4.txt: starting. 
Task data/data_part_4.txt: finished. 
Task data/data_part_5.txt: starting. 
Task data/data_part_5.txt: finished. 
Task data/data_part_6.txt: starting. 
Task data/data_part_6.txt: finished. 
Task data/data_part_7.txt: starting. 
Task data/data_part_7.txt: finished. 
Task data/data_part_8.txt: starting. 
Task data/data_part_8.txt: finished. 
Task data/data_part_9.txt: starting. 
Task data/data_part_9.txt: finished. 
Task data/data_part_10.txt: starting. 
Task data/data_part_10.txt: finishe

In [11]:
from concurrent.futures import ThreadPoolExecutor 

start_time = perf_counter()

with ThreadPoolExecutor() as pool: # Without arguments, ThreadPoolExecutor() will create as many workers as CPU cores + 4
    tuple(pool.map(write_files, arguments))

print(f"\nTime spent inside the loop: {perf_counter() - start_time} seconds.")

Task data/data_part_1.txt: starting. 
Task data/data_part_2.txt: starting. 
Task data/data_part_3.txt: starting. 
Task data/data_part_4.txt: starting. 
Task data/data_part_2.txt: finished. 
Task data/data_part_5.txt: starting. 
Task data/data_part_1.txt: finished. 
Task data/data_part_6.txt: starting. 
Task data/data_part_7.txt: starting. 
Task data/data_part_8.txt: starting. 
Task data/data_part_3.txt: finished. 
Task data/data_part_9.txt: starting. 
Task data/data_part_10.txt: starting. 
Task data/data_part_4.txt: finished. 
Task data/data_part_5.txt: finished. 
Task data/data_part_7.txt: finished. 
Task data/data_part_8.txt: finished. 
Task data/data_part_9.txt: finished. 
Task data/data_part_6.txt: finished. 
Task data/data_part_10.txt: finished. 

Time spent inside the loop: 0.009367499966174364 seconds.


In [13]:
from multiprocessing import Pool

start_time = perf_counter()
nb_workers = 8

with Pool(processes = nb_workers) as pool: # We fix the number of workers ourselve
     tuple(pool.map(write_files, arguments))
print(f"[{nb_workers} workers] Time spent inside the loop: {perf_counter() - start_time} seconds.")

### Exercise 2
Scrap all the web pages in the `urls` list and display the links. 1 thread per link. 

In [None]:
urls = [
    "http://www.python.org",
    "http://www.python.org/about/",
    "http://www.onlamp.com/pub/a/python/2003/04/17/metaclasses.html",
    "http://www.python.org/doc/",
    "http://www.python.org/download/",
    "http://www.python.org/getit/",
    "http://www.python.org/community/",
    "https://wiki.python.org/moin/",
]