# Real-World Example: Multithreading for I/O-bound Tasks

### Scenario: Web Scraping

Web scraping typically involves making many network requests to fetch web pages. These tasks are
I/O-bound: most of their time is spent waiting for servers to respond. Multithreading can
significantly improve performance by allowing multiple web pages to be fetched concurrently.


In [14]:
# !pip install requests
# !pip install bs4

In [15]:
import threading
import requests
from bs4 import BeautifulSoup

# Pages on the webscraper.io test sites, used as the fetch targets below.
URL1 = 'https://webscraper.io/test-sites/e-commerce/allinone'
URL2 = 'https://webscraper.io/test-sites/e-commerce/static'
URL3 = 'https://webscraper.io/test-sites/e-commerce/ajax'
URL4 = 'https://webscraper.io/test-sites/tables'


In [39]:
def getData(url, timeout=10):
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server to connect/respond before
            giving up. Without a timeout, a single stalled connection would
            block the calling thread (and any ``join()`` on it) indefinitely.

    Returns:
        The ``BeautifulSoup`` tree built from the response content. Callers
        that only want the side effect of fetching may ignore it.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    res = requests.get(url, timeout=timeout)
    return BeautifulSoup(res.content, 'html.parser')

In [50]:
# Work list: the four target URLs repeated 30 times (120 entries in total).
urls = 30 * [URL1, URL2, URL3, URL4]

def without_using_threads():
    """Fetch every entry of ``urls`` sequentially, printing start/end markers."""
    print('started...')
    for page_url in urls:
        getData(page_url)
    print('ended...')


In [48]:

# using threads


def using_threads():
    """Fetch every entry of ``urls`` concurrently, using one thread per URL.

    Each thread runs ``getData`` on a single URL. The function returns only
    after all threads have been joined, then prints how many URLs were
    processed.
    """
    workers = [
        threading.Thread(target=getData, args=(page_url,))
        for page_url in urls
    ]

    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    print( f'{len(urls)} urls fetched...'  )



In [54]:
import time

# Baseline (disabled): time the sequential version for comparison.
# t1 = time.perf_counter()
# without_using_threads()
# t2 = time.perf_counter()
# print(f'time spent without threads {t2 - t1}s \n')

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted while the measurement is running.
t1 = time.perf_counter()
using_threads()
t2 = time.perf_counter()

print(f'time spent with threads  {t2 - t1}s \n')

120 urls fetched...
time spent with threads  2.0987133979797363s 



In [55]:
# A timeout keeps this cell from hanging indefinitely if the server stalls.
res = requests.get(
    'https://www.geeksforgeeks.org/dijkstras-shortest-path-algorithm-greedy-algo-7/',
    timeout=10,
)

In [None]:
# Parse the downloaded page, then list every attribute name the soup exposes.
soup = BeautifulSoup(res.content, 'html.parser')

arr = dir(soup)
for attr_name in arr:
    print(attr_name)

ASCII_SPACES
DEFAULT_BUILDER_FEATURES
EMPTY_ELEMENT_EVENT
END_ELEMENT_EVENT
MAIN_CONTENT_STRING_TYPES
ROOT_TAG_NAME
START_ELEMENT_EVENT
STRING_ELEMENT_EVENT
_TreeTraversalEvent
__annotations__
__bool__
__call__
__class__
__contains__
__copy__
__deepcopy__
__delattr__
__delitem__
__dict__
__dir__
__doc__
__eq__
__firstlineno__
__format__
__ge__
__getattr__
__getattribute__
__getitem__
__getstate__
__gt__
__hash__
__init__
__init_subclass__
__iter__
__le__
__len__
__lt__
__module__
__ne__
__new__
__reduce__
__reduce_ex__
__repr__
__setattr__
__setitem__
__setstate__
__sizeof__
__static_attributes__
__str__
__subclasshook__
__unicode__
__weakref__
_all_strings
_decode_markup
_event_stream
_feed
_find_all
_find_one
_format_tag
_indent_string
_insert
_is_xml
_lastRecursiveChild
_last_descendant
_linkage_fixer
_markup_is_url
_markup_resembles_filename
_most_recent_element
_namespaces
_popToTag
_self_and
_should_pretty_print
append
attribute_value_list_class
attrs
builder
can_be_empty_element

In [87]:
# NOTE: despite the variable name and the "anchor"/"Href" labels printed
# below, this cell selects <img> tags and reads their `src` attribute.
anchors = soup.find_all('img')

# Preview the first image source. Guarded because indexing [0] would raise
# IndexError on a page that contains no <img> tags.
if anchors:
    print(anchors[0].get('src'))

# Print nicely
print(f"\n🔗 Found {len(anchors)} anchor tags:\n")
for i, tag in enumerate(anchors, start=1):
    link_text = tag.get_text(strip=True) or "(no text)"
    href = tag.get('src', '(no href)')
    print(f"{i:>2}. 🧷 Text: {link_text}\n    🌐 Href: {href}\n")

https://media.geeksforgeeks.org/gfg-gg-logo.svg

🔗 Found 13 anchor tags:

 1. 🧷 Text: (no text)
    🌐 Href: https://media.geeksforgeeks.org/gfg-gg-logo.svg

 2. 🧷 Text: (no text)
    🌐 Href: https://media.geeksforgeeks.org/auth-dashboard-uploads/Group-arrow.svg

 3. 🧷 Text: (no text)
    🌐 Href: https://media.geeksforgeeks.org/wp-content/uploads/20250409165423707485/1.webp

 4. 🧷 Text: (no text)
    🌐 Href: https://media.geeksforgeeks.org/auth/profile/sb7ciorr5k5t22woqkes

 5. 🧷 Text: (no text)
    🌐 Href: https://media.geeksforgeeks.org/wp-content/uploads/20250201111442430181/GATE-Rank-Booster-with-Expert-Curated-Questions.webp

 6. 🧷 Text: (no text)
    🌐 Href: https://media.geeksforgeeks.org/wp-content/uploads/20241230150834053749/GATE-CS_IT-2026.webp

 7. 🧷 Text: (no text)
    🌐 Href: https://media.geeksforgeeks.org/img-practice/prod/courses/451/Web/Content/cp_1723008864.webp

 8. 🧷 Text: (no text)
    🌐 Href: https://media.geeksforgeeks.org/auth-dashboard-uploads/gfgFooterLogo.png