# Web Scraper with Heavy Post-Processing
🧠 Project Goals
1. Asynchronously fetch HTML content from a list of URLs.
1. Save raw HTML temporarily (or keep in memory).
1. Use multiprocessing to parse HTML (e.g., extract title, links, text).
1. Log progress and performance.

`tqdm` is a powerful, lightweight, and easy-to-use progress bar tool.
✅ Інсталяція бібліотек

In [None]:
%pip install tqdm ipywidgets

### простий приклад використання

In [None]:
from tqdm.notebook import tqdm
# for scripts
#from tqdm import tqdm
import time

#items = range(10)
items = ["apple", "banana", "orange", "kiwi"]

# Author's note: when the loop body calls print(), the status bar gets
# re-created instead of being updated in place.
for i in tqdm(items, desc="Статус бар у циклі"):
    time.sleep(0.5)  # stand-in for real per-item work
    print(f"Опрацьовуємо: {i}")

# Author's note: it works better inside a list comprehension, where nothing
# is printed while the bar is advancing.
squared = [x ** 2 for x in tqdm(range(1000))]

### використання багатопоточності

In [None]:
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
import math


def task(x, delay=0.5):
    """Simulate a slow computation and return ``sqrt(x**3) * exp(x)``.

    Args:
        x: Non-negative number to process.
        delay: Seconds to sleep before computing, standing in for slow
            work. Defaults to 0.5.

    Returns:
        The float ``math.sqrt(x ** 3) * math.exp(x)``.

    Raises:
        ValueError: If ``x ** 3`` is negative (``math.sqrt`` domain error).
    """
    time.sleep(delay)
    # `**` is exponentiation; `^` is bitwise XOR in Python and does not
    # compute a cube.
    return math.sqrt(x ** 3) * math.exp(x)

# Run the 20 tasks on four worker threads. executor.map returns a lazy
# iterator, so total=20 is passed to tqdm explicitly; list() drains the
# iterator and advances the bar as each result is yielded.
with ThreadPoolExecutor(max_workers=4) as executor:
    pending = executor.map(task, range(20))
    results = list(tqdm(pending, total=20))


### приклад відвідування сайту який має затримку 1с

In [9]:
import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
import time

# --- Fetch a URL and report its HTTP status ---
def fetch_url(url):
    """Issue a GET request for *url* and report the outcome.

    Args:
        url: Address to request.

    Returns:
        ``(url, status_code)`` when the request completes, otherwise
        ``(url, message)`` where ``message`` is the text of the exception
        that was raised. Exceptions are reported, not propagated.
    """
    try:
        # A 5-second timeout is passed so a stalled request cannot hang forever.
        status = requests.get(url, timeout=5).status_code
    except Exception as exc:
        # Best-effort: hand the failure text back in place of a status code.
        return (url, str(exc))
    return (url, status)

# --- Wrapper for tqdm with a thread pool ---
def parallel_scrape(urls, workers=4):
    """Fetch every entry of *urls* on a thread pool behind a progress bar.

    Args:
        urls: Sized collection of URLs; its length is used as the
            progress-bar total.
        workers: Maximum number of worker threads. Defaults to 4.

    Returns:
        A list with one ``fetch_url`` result per URL, in input order.
    """
    with ThreadPoolExecutor(max_workers=workers) as executor:
        # map() yields results lazily and in input order; tqdm counts them
        # as list() consumes the iterator.
        outcomes = executor.map(fetch_url, urls)
        progress = tqdm(outcomes, total=len(urls), desc="Scraping")
        results = list(progress)
    return results

# --- Usage example ---
if __name__ == "__main__":
    # The httpbin /delay/1 endpoint simulates a one-second delay, standing in
    # for a slow site; replace these with real URLs as needed.
    base_url = "https://httpbin.org/delay/1"
    urls = [base_url] * 20  # the same endpoint, requested 20 times

    # Time the whole batch, fetched with five worker threads.
    start = time.time()
    results = parallel_scrape(urls, workers=5)
    end = time.time()

    print(f"\n✅ Done in {end - start:.2f} seconds.\n")

    # One line per request: the URL and its status code (or error text).
    for url, status in results:
        print(f"{url} -> {status}")


Scraping:   0%|          | 0/20 [00:00<?, ?it/s]


✅ Done in 7.63 seconds.

https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
https://httpbin.org/delay/1 -> 200
