web_scraping.py
import threading

import requests
from bs4 import BeautifulSoup

# Global variable (for demonstration of resolution order)
GLOBAL_VERSION = "Scraper v1.0"
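
# Name resolution inside the nested helper below follows Python's LEGB rule:
# Local scope, then Enclosing scope, then Global scope, then Built-ins.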


# Function to scrape a single URL
def scrape_page(url, verbose=False):
"""
Scrapes the page title from a given URL.
:param url: URL to scrape.
:param verbose: If True, prints extra debug info.
:return: Title of the page or "No Title Found".
"""
# Enclosing scope variable
local_info = "Scraping local info..."
def show_info():
# Access 'local_info' from enclosing scope
print("Enclosing info:", local_info)
# Access 'GLOBAL_VERSION' from global scope
print("Global version:", GLOBAL_VERSION)
# Show how we can reference different scopes
show_info()
# Fetch page
try:
response = requests.get(url)
except requests.RequestException:
return "Error Fetching Title"
    # Lambda function to extract title text
    extract_title = lambda soup_obj: soup_obj.title.string if soup_obj.title else "No Title Found"

    soup = BeautifulSoup(response.text, "html.parser")
    page_title = extract_title(soup)

    if verbose:
        print(f"Fetched {url} -> {page_title}")

    return page_title


# Wrapper function to update the results dictionary
def update_results(url, results, verbose):
    results[url] = scrape_page(url, verbose=verbose)


def run_scraper():
    """
    Creates threads to scrape multiple URLs in parallel.
    """
    # Our three sample URLs (feel free to replace them with any other public pages)
    urls = [
        "https://www.python.org/downloads/release/python-31016",
        "https://techcrunch.com/2025/02/15/openai-teases-a-simplified-gpt-5-model/",
        "https://daily.dev/blog/python-and-javascript-choosing-your-first-language"
    ]

    # Allow user to input an extra URL
    extra_url = input("Enter an extra URL (or leave blank to skip): ")
    if extra_url.strip():
        urls.append(extra_url.strip())

    results = {}
    threads = []
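
    # Each thread writes to a different key of the shared 'results' dict, so no
    # explicit lock is used here; if the threads updated shared state in a more
    # involved way, guarding the writes with a threading.Lock would be safer.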
    # Create a thread for each URL
    for link in urls:
        t = threading.Thread(target=update_results, args=(link, results, True))
        threads.append(t)

    # Start all threads
    for t in threads:
        t.start()

    # Wait for all threads to finish
    for t in threads:
        t.join()

    # Print the final results
    success_count = sum(1 for title in results.values() if title != "Error Fetching Title")
    error_count = len(results) - success_count
    for link, title in results.items():
        print(f"Title for {link}: {title}")
    print(f"Scraped {success_count} titles successfully, encountered {error_count} errors.")


# Actual script entry point
if __name__ == "__main__":
    # Running the scraper
    run_scraper()
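

# A higher-level alternative (sketch only, not used above): the manual thread
# bookkeeping in run_scraper() could be replaced with concurrent.futures, e.g.
#
#     from concurrent.futures import ThreadPoolExecutor
#     with ThreadPoolExecutor(max_workers=4) as pool:
#         titles = dict(zip(urls, pool.map(scrape_page, urls)))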