<h1 style="font-weight:600; font-size:36px; color:#1e88e5;">
    Task 1: Data Collection and Web Scraping
</h1>

<h1 style="font-weight:600; font-size:16px; color:#1e88e5;">
    Static Web Scraping
</h1>

In [19]:
import csv

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate

In [20]:
# URL to scrape (example website); this is the address the static-scraping
# request below is sent to.
URL = "https://realpython.github.io/fake-jobs/"

In [21]:
# Send request.
# requests applies no timeout by default, so a stalled connection would hang
# the kernel indefinitely; give up after 10 seconds instead.
response = requests.get(URL, timeout=10)
# Raise requests.HTTPError on a 4xx/5xx status rather than parsing an error page.
response.raise_for_status()

In [22]:
# Build a BeautifulSoup tree from the response body using Python's
# built-in "html.parser" backend.
soup = BeautifulSoup(markup=response.text, features="html.parser")

In [23]:
# Locate the container element that holds all job results.
results = soup.find(id='ResultsContainer')
if results is None:
    # find() returns None when nothing matches. Fail here with a clear message
    # instead of an AttributeError on None in a later cell.
    raise ValueError("Could not find an element with id 'ResultsContainer' on the page")

In [24]:
# Collect every <div class="card-content"> inside the results container;
# each one is a single job card.
job_cards = results.find_all(name='div', attrs={'class': 'card-content'})

In [25]:
# Build one record (title / company / location) per job card.
def _card_text(card, tag, css_class):
    """Return the stripped text of the first `tag` with class `css_class` in `card`."""
    return card.find(tag, class_=css_class).text.strip()


jobs = [
    {
        'title': _card_text(card, 'h2', 'title'),
        'company': _card_text(card, 'h3', 'company'),
        'location': _card_text(card, 'p', 'location'),
    }
    for card in job_cards
]
# Preview the first five records.
jobs[:5]

[{'title': 'Senior Python Developer',
  'company': 'Payne, Roberts and Davis',
  'location': 'Stewartbury, AA'},
 {'title': 'Energy engineer',
  'company': 'Vasquez-Davidson',
  'location': 'Christopherville, AA'},
 {'title': 'Legal executive',
  'company': 'Jackson, Chambers and Levy',
  'location': 'Port Ericaburgh, AA'},
 {'title': 'Fitness centre manager',
  'company': 'Savage-Bradley',
  'location': 'East Seanview, AP'},
 {'title': 'Product manager',
  'company': 'Ramirez Inc',
  'location': 'North Jamieview, AP'}]

In [26]:
# Filter Python jobs (title contains "python", case-insensitive)

# Keep only the records whose lower-cased title contains 'python'.
python_jobs = list(
    filter(lambda listing: 'python' in listing['title'].lower(), jobs)
)
# Show how many records matched.
len(python_jobs)

10

In [27]:
# Save the filtered Python jobs to CSV and preview them as a DataFrame.
# (csv and pandas are imported in the notebook's import cell; no re-import here.)

# Column names, declared once so the CSV header and the DataFrame agree.
JOB_FIELDS = ['title', 'company', 'location']

# newline='' hands line-ending control to the csv module.
with open('jobs.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=JOB_FIELDS)
    writer.writeheader()
    writer.writerows(python_jobs)

# Passing columns= keeps the expected columns even when no job matched the filter.
df = pd.DataFrame(python_jobs, columns=JOB_FIELDS)
df

Unnamed: 0,title,company,location
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA"
1,Software Engineer (Python),Garcia PLC,"Ericberg, AE"
2,Python Programmer (Entry-Level),"Moss, Duncan and Allen","Port Sara, AE"
3,Python Programmer (Entry-Level),Cooper and Sons,"West Victor, AE"
4,Software Developer (Python),Adams-Brewer,"Brockburgh, AE"
5,Python Developer,Rivera and Sons,"East Michaelfort, AA"
6,"Back-End Web Developer (Python, Django)",Stewart-Alexander,"South Kimberly, AA"
7,"Back-End Web Developer (Python, Django)","Jackson, Ali and Mckee","New Elizabethside, AA"
8,Python Programmer (Entry-Level),Mathews Inc,"Robertborough, AP"
9,Software Developer (Python),Moreno-Rodriguez,"Martinezburgh, AE"


<h1 style="font-weight:600; font-size:16px; color:#1e88e5;">
    Dynamic Web Scraping
</h1>

In [78]:
# dynamic_scraper.py
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv

In [79]:
# Configure the Chrome options Selenium will launch the browser with.
chrome_options = Options()
for flag in (
    "--headless",     # Run in background (no window)
    "--disable-gpu",
):
    chrome_options.add_argument(flag)

In [80]:
# Initialize driver: start a Chrome WebDriver session using the options
# collected in chrome_options (the "--headless" and "--disable-gpu" flags).
driver = webdriver.Chrome(options=chrome_options)

In [81]:
# Open the dynamic website: point the Selenium-controlled browser at the
# quotes page so its content can be read from the live DOM.
url = "https://quotes.toscrape.com/js/"
driver.get(url)

In [82]:
# Block until at least one element with class "quote" is present in the DOM,
# giving up after 10 seconds.
quote_locator = (By.CLASS_NAME, "quote")
wait = WebDriverWait(driver, timeout=10)
quotes_elements = wait.until(EC.presence_of_all_elements_located(quote_locator))

In [83]:
# Turn each quote element into a plain {"quote", "author"} record.
quotes = [
    {
        "quote": element.find_element(By.CLASS_NAME, "text").text,
        "author": element.find_element(By.CLASS_NAME, "author").text,
    }
    for element in quotes_elements
]

In [88]:

# ----- Quit Selenium -----
# The scraped quotes are already plain strings, so the browser is no longer
# needed. Quit it first so a failure while writing or reading the CSV cannot
# leave the Chrome process running.
driver.quit()

# ----- Save to CSV -----
csv_file = "quotes.csv"
with open(csv_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["quote", "author"])
    writer.writeheader()
    writer.writerows(quotes)

print(f"Scraped {len(quotes)} quotes!")

# ----- Read CSV and print nicely -----
# Reading the file back shows what was actually written to disk.
df = pd.read_csv(csv_file)

# Use tabulate to format the DataFrame as a boxed text table without the index.
table = tabulate(df, headers='keys', tablefmt='fancy_grid', showindex=False)
print("\nThis is for simple lookup DataFrame Table:\n")
print(table)

Scraped 10 quotes!

This is for simple lookup DataFrame Table:

╒═════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╤═══════════════════╕
│ quote                                                                                                                               │ author            │
╞═════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╪═══════════════════╡
│ “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”                 │ Albert Einstein   │
├─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼───────────────────┤
│ “It is our choices, Harry, that show what we truly are, far more than our abilities.”                                               │ J.K. Rowling      │
