In [58]:
import requests
import json
from bs4 import BeautifulSoup

In [59]:
srch_url = 'https://www.kijiji.ca/b-cell-phone/alberta/c760l9003?for-sale-by=ownr&view=list'

# Browser-like User-Agent: the request is otherwise blocked.
header = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/139.0.0.0 Safari/537.36"
}

# requests applies no timeout by default, so a stalled connection would
# otherwise hang the kernel indefinitely.
srch_response = requests.get(srch_url, headers=header, timeout=30)

# Stop here on a 4xx/5xx reply instead of parsing an error page in later cells.
srch_response.raise_for_status()

In [None]:
# INSPECT: results of search page
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and warns), so the resulting tree could
# differ from one machine to the next.
srch_soup = BeautifulSoup(srch_response.text, "html.parser")
print(srch_soup.prettify())

# Result of Inspection (Search Page)
- some of the html content is dynamic and requires scripts to run (i.e. dates)
- html element tags are random keys
    - data-testid can be used instead
- all the listings are in either top ads or search results
- to get search results 
    - data-testid="srp-search-list"
- contains information about total number of results
    - found in the first h2 element marked with data-testid="srp-results"

In [None]:
# get the first (only) unordered list containing all the ads
# find() returns None when nothing matches, which lets us report a clear
# error instead of the bare IndexError that find_all(...)[0] would raise.
srch_listing_div = srch_soup.find("ul", attrs={"data-testid": "srp-search-list"})
if srch_listing_div is None:
    raise ValueError(
        'no <ul data-testid="srp-search-list"> element found in the search page'
    )

# parse the list
srch_all_listings = srch_listing_div.find_all('li')
if not srch_all_listings:
    raise ValueError("the search result list contains no <li> listings")

# INSPECT: content of a listing
print(srch_all_listings[0].prettify())

# Result of inspection (Listing on Search Page)
1. Id
    - in section data-testid="listing-card" -> attribute data-listingid
2. Price
    - data-testid="listing-price-container" inside p block in div
3. Title
    - data-testid="listing-title"
4. Location
    - data-testid="listing-location"
5. Description
    - data-testid="listing-description"
    - partial description
6. Link to posting
    - data-testid="listing-link"

***Time is missing - this page cannot be used as a stand-alone data source***

In [62]:
# retrieved url of a posting - get request
listing_url = "https://www.kijiji.ca/v-cell-phone/calgary/galaxy-s22-ultra-128gb-unlocked/1722534139"

# requests applies no timeout by default, so a stalled connection would
# otherwise hang the kernel indefinitely.
listing_response = requests.get(listing_url, headers=header, timeout=30)

# Stop here on a 4xx/5xx reply (e.g. the ad was removed) instead of parsing
# an error page in later cells.
listing_response.raise_for_status()

In [None]:
# INSPECT: a post from the listing
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and warns), so the resulting tree could
# differ from one machine to the next.
listing_soup = BeautifulSoup(listing_response.text, "html.parser")
print(listing_soup.prettify())

# Result of Inspection (Single Listing)
- time is still produced using a script
- the `<head>` contains a `<script>` tag with all the important data in JSON format
- located at \<script type="application/ld+json">
    - name: title of the ad
    - description: entire description (not truncated)
    - offers: contains information about the sale (nested dictionary)
        - price
        - priceCurrency
        - validFrom
        - validThrough
        - availableAtOrFrom
            - latitude
            - longitude
            - name: city name
        - address
            - streetAddress
            - addressLocality
            - addressCountry

In [64]:
# Locate the first JSON-LD <script> in the document. Searching from the soup
# root avoids an AttributeError when the page has no <head>, and find()
# returns None (rather than raising IndexError) when the tag is absent.
listing_script = listing_soup.find("script", attrs={'type': 'application/ld+json'})
if listing_script is None:
    raise ValueError(
        'no <script type="application/ld+json"> element found in the listing page'
    )

listing_txt = listing_script.text
listing_json = json.loads(listing_txt)

In [65]:
# Values are inserted with str.format rather than "+" concatenation: a JSON
# number (a price, for instance) would make "+" raise TypeError, whereas
# format() prints any value. Output for string values is unchanged.
print("Title\n-----------\n{}".format(listing_json['name']))
print("\nDescription\n-----------\n{}".format(listing_json['description']))

# The sale details are nested under 'offers'.
listing_offer = listing_json['offers']
print("\nPrice\n-----------\n{}".format(listing_offer['price']))
print("\nCurrency\n--------------\n{}".format(listing_offer['priceCurrency']))
print("\nPosting Date\n-----------\n{}".format(listing_offer['validFrom']))

# The location (coordinates and postal address) is nested one level deeper.
listing_place = listing_offer['availableAtOrFrom']
listing_address = listing_place['address']
print("\nAddress\n-----------\n{}".format(listing_address['streetAddress']))
print("\nCity\n-----------\n{}".format(listing_address['addressLocality']))
print("\nCountry\n-----------\n{}".format(listing_address['addressCountry']))
print("\nLat-Long\n-----------\n{}, {}".format(
    listing_place['latitude'],
    listing_place['longitude']
))

Title
-----------
Galaxy S22 Ultra, 128GB, Unlocked.

Description
-----------
Samsung Galaxy S22 Ultra, Burgundy, 128GB, Unlocked.
Screen protector and case included.

We Sell many iPhones and Androids at the reasonable price in the store 
(PLEASE, Check 'View poster's other ads' on my phone number)
All phones for sale have a brand new Tempered Glass Screen Protector

A. Can you check the inside if there was water damage or not? 
B. What about the IMEI if it is blacklisted or not? 
C. Some people sell a phone which still has a contract, and they will not pay the bill for a while 
     --- the phone must be blacklisted! 
D. Someone reports the phone to the insurance company as a stolen or lost to take the compensation after selling. 
     --- the phone was working when you buy this, but that will be black listed after that!
Don't worry about all of these things! 
We have already checked all of those, and guarantee it! 
We give you our store warranty also!
E. All phones we sell are not r

# Plan
For now, I am not concerned with updates to old listings.
- NOTE: to do this just inspect etag on existing listing

## Initial Scraper
- this will go through all the listings in a search url and gather all the data

## Update Scraper
- this will run once a day and scrape until the listings posted on that day are depleted