### There are two libiaries we can use to send HTTP GET Request to retrieve HTML content: urllib and requests

In [20]:
# Requests libiary
# import packages
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Copy headers from the website, under Network - Headers - Request Headers
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
          'accept-language': 'en-CA,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'}

# Copy url from the website
ul = "https://toronto.craigslist.org/search/cta#search=1~gallery~0~0"

# Use the GET method to send requests
response = requests.get(url, headers = headers)

# We can check the return in text format by using response.text. Note the content is not parsed yet.
# Use BeautifulSoup method to parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')
soup

In [24]:
# Urllib library
# import packages
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [25]:
# Extract website data
urltoOpen = urlopen("https://toronto.craigslist.org/search/cta#search=1~gallery~0~0")
soup = BeautifulSoup(urltoOpen, 'html.parser')
soup

<!DOCTYPE html>

<html>
<head>
<meta charset="utf-8"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="craigslist" property="og:site_name"/>
<meta content="preview" name="twitter:card"/>
<meta content="toronto cars &amp; trucks for sale - craigslist" property="og:title"/>
<meta content="toronto cars &amp; trucks for sale - craigslist" name="description"/>
<meta content="toronto cars &amp; trucks for sale - craigslist" property="og:description"/>
<meta content="https://toronto.craigslist.org/search/cta" property="og:url"/>
<title>toronto cars &amp; trucks for sale - craigslist</title>
<link href="https://toronto.craigslist.org/search/cta" rel="canonical"/>
<link href="https://toronto.craigslist.org/search/cta" hreflang="x-default" rel="alternate"/>
<link href="/favicon.ico" id="favicon" rel="icon">
<script id="ld_searchpage_data" type="application/ld+json">
    {"@context":"https://schema.org","br

In [34]:
# Get all the listing titles from current page and put them in a dataframe, use try except method to avoid breaking the loop when a tag is missing. 
titles = []
prices = []
locations = []
links = []

for div in soup.select('li'):
    try:
        title = div.select_one('div.title').text.strip()
    except:
        title = 'None'
    try:
        link = div.select_one('a[href]')
    except:
        link = ''
    try:
        price = div.select_one('div.price').text.strip()
    except:
        price = ''
    try:
        location = div.select_one('div.location').text.strip()
    except:
        location = ''
        
    titles.append(title) 
    links.append(link)
    prices.append(price)
    locations.append(location)
    
df = pd.DataFrame({'titles': titles, 'links':links, 'prices':prices, 'locations': locations})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   titles     344 non-null    object
 1   links      343 non-null    object
 2   prices     344 non-null    object
 3   locations  344 non-null    object
dtypes: object(4)
memory usage: 10.9+ KB


In [36]:
# Get all the listing titles from current page and put them in a dataframe. Get rid of missing data.
titles = []
prices = []
locations = []
links = []

for div in soup.select('li'):
    if div.select_one('div.title') or div.select_one('a[href]') or div.select_one('div.price') or div.select_one('div.location'):
        
        title_tag = div.select_one('div.title')
        title = title_tag.text.strip().lower() if title_tag else None
        
        link_tag = div.select_one('a[href]')
        link = link_tag['href'] if link_tag else None
    
        price_tag = div.select_one('div.price')
        price = price_tag.text.strip() if price_tag else None
        
        location_tag = div.select_one('div.location')
        location = location_tag.text.strip().lower() if location_tag else None
            
        titles.append(title) 
        links.append(link)
        prices.append(price)
        locations.append(location)
    
df = pd.DataFrame({'titles': titles, 'links':links, 'prices':prices, 'locations': locations})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343 entries, 0 to 342
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   titles     343 non-null    object
 1   links      343 non-null    object
 2   prices     343 non-null    object
 3   locations  343 non-null    object
dtypes: object(4)
memory usage: 10.8+ KB


In [32]:
df.head()

Unnamed: 0,titles,links,prices,locations
0,2012 hyundai elantra gls,https://toronto.craigslist.org/tor/ctd/d/north...,"$6,950",north york
1,2013 dodge journey se,https://toronto.craigslist.org/tor/ctd/d/north...,"$6,399",north york
2,2016 dodge journey se seven passenger,https://toronto.craigslist.org/tor/ctd/d/north...,"$7,599",north york
3,2014 chevrolet cruze lt,https://toronto.craigslist.org/tor/ctd/d/north...,"$6,399",north york
4,2011 mazda mazda 2,https://toronto.craigslist.org/tor/ctd/d/north...,"$5,399",north york


In [11]:
# Two ways of using soup to get the links from the website
links = soup.select('a[href]')
for link in links:
    print(link['href'])

links = soup.find_all('a', href = Ture)
for link in links:
    print(link['href'])

#
/
https://toronto.craigslist.org/tor/ctd/d/north-york-2012-hyundai-elantra-gls/7800695369.html
https://toronto.craigslist.org/tor/ctd/d/north-york-2013-dodge-journey-se/7803161450.html
https://toronto.craigslist.org/tor/ctd/d/north-york-2016-dodge-journey-se-seven/7804661038.html
https://toronto.craigslist.org/tor/ctd/d/north-york-2014-chevrolet-cruze-lt/7806024035.html
https://toronto.craigslist.org/tor/ctd/d/north-york-2011-mazda-mazda/7806960352.html
https://toronto.craigslist.org/tor/ctd/d/scarborough-2019-chevrolet-corvette/7808930176.html
https://toronto.craigslist.org/drh/ctd/d/oshawa-2013-grand-caravan-crew-plus/7807882324.html
https://toronto.craigslist.org/tor/cto/d/etobicoke-2021-toyota-venza-xle-suv/7802989317.html
https://toronto.craigslist.org/tor/ctd/d/scarborough-2021-mercedes-benz-gla-45/7807334532.html
https://toronto.craigslist.org/tor/ctd/d/scarborough-2022-bmw-330i-xdrive-sport/7807649837.html
https://toronto.craigslist.org/tor/ctd/d/scarborough-2022-dodge-challe

## Note:

1. The website is dynamic (content loaded with JavaScript), soup.select() alone will not work because BeautifulSoup can only parse the static HTML returned by the server. Dynamic content is rendered after the page is loaded, often via AJAX requests or lazy loading.
2. To handle dynamic website, we have three methods:

### Option	When to Use	Tools Required	+ Difficulty
- Option 1: Selenium + BeautifulSoup | Dynamic content, no API	| Selenium, BeautifulSoup, Pandas	⭐⭐⭐ (Medium)
- Option 2: API request via requests |	API available (check XHR requests) | Requests, Pandas	⭐⭐ (Easy)
- Option 3: Hybrid (Selenium + API) | Dynamically-generated API URL	| Selenium, Requests, Pandas	⭐⭐⭐⭐ (Complex)


### Key Commands for CSS Selectors

Selector	Usage	Example
- ID Selector	-- Select by ID	--> #search-results-page-1
- Class Selector --	Select by class	--> .meta-line
- Child Selector --	Direct child selector -->	#search-results-page-1 > ol
- Descendant Selector --	All descendants -->	#search-results-page-1 ol div
- nth-child() --	Select specific child -->	ol > div:nth-child(1)
- Attribute Selector --	Select by attribute -->	div[class="meta-line"]