In [22]:
import requests
import json
from bs4 import BeautifulSoup

In [23]:
# Search page to scrape (path: b-cell-phone/alberta; query: for-sale-by=ownr, view=list).
cell_url = 'https://www.kijiji.ca/b-cell-phone/alberta/c760l9003?for-sale-by=ownr&view=list'

# mimics browser request - otherwise blocked
header = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/139.0.0.0 Safari/537.36"
}

# requests has no default timeout; without one a stalled connection hangs the cell forever.
srch_response = requests.get(cell_url, headers=header, timeout=30)
# Stop here on a 4xx/5xx reply instead of parsing an error/block page as if it were results.
srch_response.raise_for_status()

In [None]:
# INSPECT: results of search page
# The parser is named explicitly so the parse tree does not depend on which optional
# parsers happen to be installed (and bs4 does not emit GuessedAtParserWarning).
srch_soup = BeautifulSoup(srch_response.text, "html.parser")
print(srch_soup.prettify())

# Result of Inspection (Search Page)
- some of the html content is dynamic and requires scripts to run (i.e. dates)
- html element tags are random keys
    - data-testid can be used instead
- all the listings are in either top ads or search results
- to get search results 
    - data-testid="srp-search-list"
- contains information about total number of results
    - found in the first h2 element marked with data-testid="srp-results"

In [None]:
# get the first (only) unordered list containing all the ads
srch_listing_div = srch_soup.find("ul", attrs={"data-testid": "srp-search-list"})
if srch_listing_div is None:
    # Page layout changed or the request was blocked: say so instead of a bare IndexError.
    raise LookupError('no <ul data-testid="srp-search-list"> found on the search page')

# parse the list
srch_all_listings = srch_listing_div.find_all('li')
if not srch_all_listings:
    raise LookupError("the search result list contains no <li> entries")

# INSPECT: content of a listing
print(srch_all_listings[0].prettify())

# Result of inspection (Listing on Search Page)
1. Id
    - in section data-testid="listing-card" -> attribute data-listingid
2. Price
    - data-testid="listing-price-container" inside p block in div
3. Title
    - data-testid="listing-title"
4. Location
    - data-testid="listing-location"
5. Description
    - data-testid="listing-description"
    - partial description
6. Link to posting
    - data-testid="listing-link"

***Time is missing - this page cannot be used as a stand-alone data source***

In [28]:
# retrieved url of a posting - get request
listing_url = "https://www.kijiji.ca/v-cell-phone/calgary/iphone-14/1722633209"
# Timeout: requests waits indefinitely by default, which would hang the cell.
listing_response = requests.get(listing_url, headers=header, timeout=30)
# Fail fast on a 4xx/5xx reply (e.g. a removed listing) rather than parsing an error page.
listing_response.raise_for_status()

In [None]:
# INSPECT: a post from the listing
# Explicit parser: keeps the parse reproducible across environments and avoids
# bs4's GuessedAtParserWarning.
listing_soup = BeautifulSoup(listing_response.text, "html.parser")
print(listing_soup.prettify())

# Result of Inspection (Single Listing)
- time is still produced using a script
- the page's `<head>` contains a script tag with all the important data in JSON format
- located at \<script type="application/ld+json">
    - @type: listing type (cell phones and general are 'Product')
    - name: title of the ad
    - description: entire description (not truncated)
    - offers: contains information about the sale (nested dictionary)
        - price
        - priceCurrency
        - validFrom
        - validThrough
        - availableAtOrFrom
            - latitude
            - longitude
            - name: city name
            - address
                - streetAddress
                - addressLocality
                - addressCountry

In [31]:
# The structured listing data is the first <script type="application/ld+json"> element.
listing_ld_tag = listing_soup.find("script", attrs={'type': 'application/ld+json'})
if listing_ld_tag is None:
    # Missing when the page is an error/block page or the markup changed.
    raise LookupError('no <script type="application/ld+json"> found on the listing page')

# .string returns the tag's single text child directly, without going through
# get_text(), whose handling of <script> content has varied between bs4 releases.
listing_txt = listing_ld_tag.string
listing_json = json.loads(listing_txt)

In [33]:
# Nested dictionaries shared by several of the fields below.
listing_offer = listing_json['offers']
listing_place = listing_offer['availableAtOrFrom']
listing_address = listing_place['address']

# f-strings format any JSON value; "+" concatenation raises TypeError as soon as a
# field (e.g. the price) is decoded as a number instead of a string.
print(f"Type\n-----------\n{listing_json['@type']}")
print(f"\nTitle\n-----------\n{listing_json['name']}")
print(f"\nDescription\n-----------\n{listing_json['description']}")
print(f"\nPrice\n-----------\n{listing_offer['price']}")
print(f"\nCurrency\n--------------\n{listing_offer['priceCurrency']}")
print(f"\nPosting Date\n-----------\n{listing_offer['validFrom']}")
print(f"\nAddress\n-----------\n{listing_address['streetAddress']}")
print(f"\nCity\n-----------\n{listing_address['addressLocality']}")
print(f"\nCountry\n-----------\n{listing_address['addressCountry']}")
print(f"\nLat-Long\n-----------\n{listing_place['latitude']}, {listing_place['longitude']}")

Type
-----------
Product

Title
-----------
iPhone 14

Description
-----------
iPhone 14 128gb nothing wrong with it willing to negotiate

Price
-----------
600

Currency
--------------
CAD

Posting Date
-----------
2025-08-11

Address
-----------
Edgebrook Rise NW, Calgary, T3A 5J9

City
-----------
Calgary

Country
-----------
Canada

Lat-Long
-----------
51.13, -114.12


# Motorcycles

In [34]:
# Browser-like User-Agent sent with the request below.
HEADER = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/139.0.0.0 Safari/537.36"
}

bike_page = requests.get(
    "https://www.kijiji.ca/v-sport-bikes/calgary/motorcycle-boots/1692095045",
    headers=HEADER,
    timeout=30,  # requests never times out by default; avoid hanging the cell
)
# Fail fast on a 4xx/5xx reply instead of parsing an error page further down.
bike_page.raise_for_status()

In [None]:
# Explicit parser: reproducible parse across environments, no GuessedAtParserWarning.
bike_soup = BeautifulSoup(bike_page.text, "html.parser")

# The structured listing data is the first <script type="application/ld+json"> element.
bike_ld_tag = bike_soup.find("script", attrs={"type": "application/ld+json"})
if bike_ld_tag is None:
    # Missing when the page is an error/block page or the markup changed.
    raise LookupError('no <script type="application/ld+json"> found on the listing page')

# .string returns the tag's single text child directly, without going through get_text().
bike_json = json.loads(bike_ld_tag.string)

In [None]:
def show_bike_field(*keys):
    """Print one ``bike_json`` field as ``bike_json["a"]["b"]=<repr of value>``.

    The text matches what the f-string debug form ``f"{bike_json["a"]["b"]=}"``
    prints, but it is built without nesting the same quote type inside an
    f-string (which only parses on Python 3.12+).

    Args:
        *keys: Successive dictionary keys leading to the value.

    A field that is absent is reported as ``<missing>`` so that one absent
    key does not abort the rest of the report.
    """
    label = "bike_json" + "".join(f'["{key}"]' for key in keys)
    value = bike_json
    try:
        for key in keys:
            value = value[key]
    except (KeyError, TypeError):
        # KeyError: key absent; TypeError: an intermediate value is not a mapping.
        print(f"{label}=<missing>")
        return
    print(f"{label}={value!r}")


# --- Generic Product info
show_bike_field("name")
show_bike_field("description")
show_bike_field("offers", "price")
show_bike_field("offers", "priceCurrency")
show_bike_field("offers", "validFrom")
show_bike_field("offers", "validThrough")
show_bike_field("offers", "availableAtOrFrom", "latitude")
show_bike_field("offers", "availableAtOrFrom", "longitude")
show_bike_field("offers", "availableAtOrFrom", "address", "streetAddress")
show_bike_field("offers", "availableAtOrFrom", "address", "addressLocality")
show_bike_field("offers", "availableAtOrFrom", "address", "addressCountry")
print("------")
# --- Bike Specific
show_bike_field("@type")
show_bike_field("brand", "name")
show_bike_field("mileageFromOdometer", "value")
show_bike_field("mileageFromOdometer", "unitCode")
show_bike_field("model")
show_bike_field("vehicleModelDate")

bike_json["name"]='Motorcycle Boots'
bike_json["description"]='Sidi Leather Motorcycle Boots Leather Short Velcro closure.\r\nWaterproof \r\nSize 44 (or mens size 10) \r\n$190'
bike_json["offers"]["price"]='190'
bike_json["offers"]["priceCurrency"]='CAD'
bike_json["offers"]["validFrom"]='2024-04-29'
bike_json["offers"]["validThrough"]='2024-05-29'
bike_json["offers"]["availableAtOrFrom"]["latitude"]=51.13
bike_json["offers"]["availableAtOrFrom"]["longitude"]=-114.24
bike_json["offers"]["availableAtOrFrom"]["address"]["streetAddress"]='Calgary, AB T3L 2A5'
bike_json["offers"]["availableAtOrFrom"]["address"]["addressLocality"]='Calgary'
bike_json["offers"]["availableAtOrFrom"]["address"]["addressCountry"]='Canada'
bike_json["offers"]["availableAtOrFrom"]["address"]["addressCountry"]='Canada'
------
bike_json["@type"]='Motorcycle'
bike_json["brand"]["name"]='Suzuki'
bike_json["mileageFromOdometer"]["value"]='1'
bike_json["mileageFromOdometer"]["unitCode"]='KMT'
bike_json["model"]='GSX-R'


# Plan
For now, I am not concerned with updates to old listings.
- NOTE: to do this just inspect etag on existing listing

## Initial Scraper
- this will go through all the listings in a search url, gather all the data, and save it to a JSON file