In [24]:
import requests
import json
from bs4 import BeautifulSoup

In [11]:
# Mimic a browser request - the site blocks the request otherwise.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/139.0.0.0 Safari/537.36"
}

# requests applies no timeout by default; without one a stalled connection
# would hang this cell indefinitely.
srch_response = requests.get(
    'https://www.kijiji.ca/b-cell-phone/alberta/c760l9003?for-sale-by=ownr&view=list',
    headers=header,
    timeout=30,
)
# Fail here on a 4xx/5xx answer (e.g. when the request is blocked) instead of
# silently parsing an error page in the cells below.
srch_response.raise_for_status()


In [None]:
# Inspect the results of the search page.
# The parser is named explicitly: when it is omitted, BeautifulSoup picks
# whichever parser happens to be installed and emits a GuessedAtParserWarning,
# so the parsed tree could differ between environments.
srch_soup = BeautifulSoup(srch_response.text, "html.parser")
print(srch_soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width" name="viewport"/>
  <meta content="https://www.kijiji.ca/next-assets/images/fb_desktop.jpg" property="og:image:secure_url"/>
  <title>
   Buy New and Used Cell Phones &amp; Smartphones in Alberta  | Free Local Classifieds - Kijiji
  </title>
  <meta content="index,follow" name="robots"/>
  <meta content="Need a new phone? Find the best deals on new and used phones in Alberta- iPhone, Android, Samsung, Apple, LG, Google, HTC and more on Kijiji, Canada's #1 Local Marketplaces." name="description"/>
  <meta content="Buy New and Used Cell Phones &amp; Smartphones in Alberta  | Free Local Classifieds - Kijiji" property="og:title"/>
  <meta content="Need a new phone? Find the best deals on new and used phones in Alberta- iPhone, Android, Samsung, Apple, LG, Google, HTC and more on Kijiji, Canada's #1 Local Marketplaces." property="og:description"/>
  <meta content="https://www.kijiji.ca/b-

# Result of Inspection (Search Page)
- some of the HTML content is dynamic and requires scripts to run (e.g. dates)
- the class names of the HTML elements are randomly generated keys
    - the data-testid attribute can be used to locate elements instead
- all the listings are in either top ads or search results
- to get search results 
    - data-testid="srp-search-list"
- the page also contains the total number of results
    - found in the first h2 element marked with data-testid="srp-results"

In [13]:
# Get the unordered list containing all the ads.
# find() returns the first match or None, so a missing container can be
# reported explicitly instead of surfacing as a bare IndexError.
srch_listing_div = srch_soup.find("ul", attrs={"data-testid": "srp-search-list"})
if srch_listing_div is None:
    raise RuntimeError(
        'no <ul data-testid="srp-search-list"> found in the search page; '
        "the page layout may have changed or the request may have been blocked"
    )

# Every <li> below the list (find_all searches all descendants).
srch_all_listings = srch_listing_div.find_all('li')

In [14]:
# INSPECT: pretty-printed HTML of the first listing on the search page
first_listing_html = srch_all_listings[0].prettify()
print(first_listing_html)

<li data-testid="listing-card-list-item-0" style="list-style-type:none">
 <section class="sc-336af988-3 sc-336af988-4 jbIDcC iKCkQ" data-listingid="1721972369" data-testid="listing-card">
  <div class="sc-336af988-8 bCfWMb">
   <div class="sc-e1abb725-0 kLNsQi sc-336af988-6 iUBLnl" data-testid="listing-card-image-container">
    <img alt="I'm selling a used unlocked Samsung Galaxy Z Fold6 with 512GB of storage, complete with its original..." data-testid="listing-card-image" fetchpriority="low" height="400" loading="lazy" src="https://media.kijiji.ca/api/v1/ca-prod-fsbo-ads/images/45/45d533fb-11c1-45fd-b3b8-7fd2234296b2?rule=kijijica-400-webp"/>
   </div>
   <div class="sc-eb45309b-0 kafBMp sc-336af988-11 bRtdjE">
    <div class="sc-336af988-10 eEFvtv">
     <div class="sc-54de28bc-0 gSiPgX" data-testid="listing-price-container">
      <p class="sc-991ea11d-0 hahwcu sc-54de28bc-2 gPSghC" data-testid="listing-price">
       $1,200.00
      </p>
     </div>
     <h3 class="sc-991ea11d-0 e

# Result of inspection (Listing on Search Page)
1. Id
    - in section data-testid="listing-card" -> attribute data-listingid
2. Price
    - p element with data-testid="listing-price", inside the div with data-testid="listing-price-container"
3. Title
    - data-testid="listing-title"
4. Location
    - data-testid="listing-location"
5. Description
    - data-testid="listing-description"
    - partial description
6. Link to posting
    - data-testid="listing-link"

***Time is missing - this page cannot be used as a stand-alone data source***

In [15]:
# Go into a single listing and inspect it.
# The timeout keeps a stalled connection from hanging the cell (requests has
# no default timeout).
listing_response = requests.get(
    "https://www.kijiji.ca/v-cell-phone/calgary/galaxy-s22-ultra-128gb-unlocked/1722534139",
    headers=header,
    timeout=30,
)
# Stop on a 4xx/5xx answer (e.g. the listing was removed or the request was
# blocked) rather than parsing an error page.
listing_response.raise_for_status()

# Explicit parser: avoids GuessedAtParserWarning and environment-dependent parsing.
listing_soup = BeautifulSoup(listing_response.text, "html.parser")
print(listing_soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width" name="viewport"/>
  <meta content="https://www.kijiji.ca/next-assets/images/fb_desktop.jpg" property="og:image:secure_url"/>
  <script type="application/ld+json">
   {"@context":"https://schema.org","@type":"Product","name":"Galaxy S22 Ultra, 128GB, Unlocked.","description":"Samsung Galaxy S22 Ultra, Burgundy, 128GB, Unlocked.\r\nScreen protector and case included.\r\n\r\nWe Sell many iPhones and Androids at the reasonable price in the store \r\n(PLEASE, Check 'View poster's other ads' on my phone number)\r\nAll phones for sale have a brand new Tempered Glass Screen Protector\r\n\r\nA. Can you check the inside if there was water damage or not? \r\nB. What about the IMEI if it is blacklisted or not? \r\nC. Some people sell a phone which still has a contract, and they will not pay the bill for a while \r\n     --- the phone must be blacklisted! \r\nD. Someone reports the phone to the i

# Result of Inspection (Single Listing)
- time is still produced using a script
- the head of the page contains a script tag with all the important data in JSON format
- located at \<script type="application/ld+json">
    - name: title of the ad
    - description: entire description (not truncated)
    - offers: contains information about the sale (nested dictionary)
        - price
        - priceCurrency
        - validFrom
        - validThrough
        - availableAtOrFrom
            - latitude
            - longitude
            - name: city name
            - address
                - streetAddress
                - addressLocality
                - addressCountry

In [None]:
# The listing data is embedded as JSON-LD in a <script type="application/ld+json"> tag.
# find() returns the first such tag in document order (or None), which avoids
# building the full match list just to take its first element.
ld_json_script = listing_soup.find("script", attrs={'type': 'application/ld+json'})
if ld_json_script is None:
    raise RuntimeError(
        'no <script type="application/ld+json"> found in the listing page; '
        "the page layout may have changed or the request may have been blocked"
    )
listing_json = json.loads(ld_json_script.text)

# Name the nested dictionaries once instead of repeating the full key path.
listing_offer = listing_json['offers']
listing_place = listing_offer['availableAtOrFrom']
listing_address = listing_place['address']

print(f"Title:\n---\n {listing_json['name']}")
print(f"Description:\n---\n {listing_json['description']}")
print(f"Price:\n---\n {listing_offer['price']}")
print(f"Currency:\n---\n {listing_offer['priceCurrency']}")
print(f"Posting Date:\n---\n {listing_offer['validFrom']}")
print(f"Lat-Long:\n---\n {listing_place['latitude']}, {listing_place['longitude']}")
print(f"Address:\n---\n {listing_address['streetAddress']}")
print(f"City:\n---\n {listing_address['addressLocality']}")
print(f"Country:\n---\n {listing_address['addressCountry']}")

Title:
---
 Galaxy S22 Ultra, 128GB, Unlocked.
Description:
---
 Samsung Galaxy S22 Ultra, Burgundy, 128GB, Unlocked.
Screen protector and case included.

We Sell many iPhones and Androids at the reasonable price in the store 
(PLEASE, Check 'View poster's other ads' on my phone number)
All phones for sale have a brand new Tempered Glass Screen Protector

A. Can you check the inside if there was water damage or not? 
B. What about the IMEI if it is blacklisted or not? 
C. Some people sell a phone which still has a contract, and they will not pay the bill for a while 
     --- the phone must be blacklisted! 
D. Someone reports the phone to the insurance company as a stolen or lost to take the compensation after selling. 
     --- the phone was working when you buy this, but that will be black listed after that!
Don't worry about all of these things! 
We have already checked all of those, and guarantee it! 
We give you our store warranty also!
E. All phones we sell are not refurbished wh

# Plan
To keep the scraper minimally intrusive, it will traverse the search pages and then inspect only the header
from each page. This should save a lot of bandwidth.

For now, I am not concerned with updates to old listings.
- NOTE: to detect such updates later, just inspect the ETag of the existing listing

## Initial Scraper
- this will go through all the listings in a search url and gather all the data

## Update Scraper
- this will run once a day and scrape until the listings posted on that day are depleted