## Let's crawl

In [2]:
%pip install undetected-chromedriver selenium

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
# Crawl alonhadat.com.vn listing pages with undetected-chromedriver (page range is set by start_page/end_page below)

import json
import os
import random
import time
from urllib.parse import urljoin

import undetected_chromedriver as uc
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

def random_sleep(min_seconds, max_seconds):
    """Pause the current thread for a random duration.

    The duration is drawn uniformly from the range bounded by
    ``min_seconds`` and ``max_seconds`` (both in seconds).
    """
    pause = random.uniform(min_seconds, max_seconds)
    time.sleep(pause)

def _optional_text(item, selector, label=""):
    """Return the cleaned text of the first ``selector`` match under ``item``.

    ``label`` (e.g. "Giá:") is removed before stripping so that no whitespace
    is left behind where the label used to be. Returns "" when the element
    is not present.
    """
    try:
        raw = item.find_element(By.CSS_SELECTOR, selector).text
    except NoSuchElementException:
        return ""
    if label:
        raw = raw.replace(label, "")
    return raw.strip()


def _optional_attr(item, selector, attribute):
    """Return ``attribute`` of the first ``selector`` match under ``item``, or "" if the element is absent."""
    try:
        return item.find_element(By.CSS_SELECTOR, selector).get_attribute(attribute)
    except NoSuchElementException:
        return ""


def crawl_alonhadat_selenium(page_num, driver):
    """Scrape one listing page of alonhadat.com.vn and return its items.

    Args:
        page_num: Page number substituted into the listing URL.
        driver: A Selenium-compatible WebDriver used to load and query the page.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``date``, ``area``,
        ``price``, ``floors``, ``bedrooms`` and ``address``. Title and date
        are required: an item missing either one is reported with a printed
        message and skipped. The remaining fields fall back to "" when their
        element is absent. The list is empty when no ``div.content-item``
        element is found on the page.
    """
    url = f"https://alonhadat.com.vn/nha-dat/can-ban/nha-dat/1/ha-noi/trang--{page_num}.html"
    driver.get(url)
    random_sleep(2, 4)  # Wait for page to load

    results = []
    for item in driver.find_elements(By.CSS_SELECTOR, "div.content-item"):
        try:
            title_tag = item.find_element(By.CSS_SELECTOR, "div.ct_title a")
            # Selenium resolves "href" to an absolute URL, so blindly prefixing
            # the domain would duplicate it. urljoin leaves absolute URLs
            # untouched and still resolves relative ones against the site root.
            href = title_tag.get_attribute("href") or ""
            link = urljoin("https://alonhadat.com.vn", href) if href else ""

            results.append({
                "title": title_tag.text.strip(),
                "url": link,
                "date": item.find_element(By.CSS_SELECTOR, "div.ct_date").text.strip(),
                "area": _optional_text(item, "div.ct_dt", "Diện tích:"),
                "price": _optional_text(item, "div.ct_price", "Giá:"),
                "floors": _optional_attr(item, "span.floors", "title"),
                "bedrooms": _optional_attr(item, "span.bedroom", "title"),
                "address": _optional_text(item, "div.ct_dis"),
            })
        except Exception as e:
            # Best effort: one malformed listing must not abort the whole page.
            print(f"Error parsing item: {e}")
    return results

# --- Crawl configuration ---
output_folder = "../Datasets/alonhadat.com/json_new"
os.makedirs(output_folder, exist_ok=True)
start_page = 1319
end_page = 1500
max_attempts = 5  # Upper bound on tries for a page that keeps coming back empty
failed_pages = []  # Pages given up on after max_attempts empty results

# --- Setup undetected-chromedriver ---
chrome_options = uc.ChromeOptions()
# chrome_options.add_argument("--headless")  # Uncomment to run without a visible browser window
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1280,720")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
# NOTE(review): version_main presumably has to match the locally installed
# Chrome major version — confirm when Chrome is upgraded.
driver = uc.Chrome(options=chrome_options, version_main=136)

# --- Crawl pages and save as JSONL (one file per page, one JSON object per line) ---
try:
    for page_num in range(start_page, end_page + 1):
        print(f"Crawling page {page_num}...")
        for attempt in range(1, max_attempts + 1):
            data = crawl_alonhadat_selenium(page_num, driver)
            if data:
                output_file = os.path.join(output_folder, f"page_{page_num}.jsonl")
                with open(output_file, "w", encoding="utf-8") as file:
                    for d in data:
                        # ensure_ascii=False keeps Vietnamese text readable in the output file.
                        file.write(json.dumps(d, ensure_ascii=False) + "\n")
                print(f"Saved {len(data)} items to {output_file}")
                break
            print(f"No data found for page {page_num}")
            if attempt < max_attempts:
                random_sleep(3, 3)
        else:
            # Every attempt came back empty: record the page and move on
            # instead of retrying forever.
            print(f"Giving up on page {page_num} after {max_attempts} attempts")
            failed_pages.append(page_num)

        driver.execute_script("window.scrollTo(0,0);")  # Scroll to the top of the page
        if page_num % 10 == 0:
            print("Taking a break...")
            random_sleep(5, 10)  # Take a break every 10 pages
        else:
            random_sleep(1, 3)  # Be polite to the server

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # Scroll to the bottom
finally:
    # Always release the browser, even when the session dies mid-crawl.
    driver.quit()

if failed_pages:
    print(f"Pages skipped after {max_attempts} empty attempts: {failed_pages}")

Crawling page 1319...
No data found for page 1319
No data found for page 1319
Saved 20 items to ../Datasets/alonhadat.com/json_new\page_1319.jsonl
Crawling page 1320...
Saved 20 items to ../Datasets/alonhadat.com/json_new\page_1320.jsonl
Taking a break...
Crawling page 1321...
Saved 20 items to ../Datasets/alonhadat.com/json_new\page_1321.jsonl
Crawling page 1322...
Saved 20 items to ../Datasets/alonhadat.com/json_new\page_1322.jsonl
Crawling page 1323...
Saved 20 items to ../Datasets/alonhadat.com/json_new\page_1323.jsonl
Crawling page 1324...
Saved 20 items to ../Datasets/alonhadat.com/json_new\page_1324.jsonl
Crawling page 1325...
Saved 20 items to ../Datasets/alonhadat.com/json_new\page_1325.jsonl
Crawling page 1326...
Saved 20 items to ../Datasets/alonhadat.com/json_new\page_1326.jsonl
Crawling page 1327...
Saved 20 items to ../Datasets/alonhadat.com/json_new\page_1327.jsonl
Crawling page 1328...
Saved 20 items to ../Datasets/alonhadat.com/json_new\page_1328.jsonl
Crawling page 13

InvalidSessionIdException: Message: invalid session id: session deleted as the browser has closed the connection
from disconnected: not connected to DevTools
  (Session info: chrome=136.0.7103.114)
Stacktrace:
	GetHandleVerifier [0x00BDFC83+61635]
	GetHandleVerifier [0x00BDFCC4+61700]
	(No symbol) [0x00A005D3]
	(No symbol) [0x009EFE20]
	(No symbol) [0x00A0DD1F]
	(No symbol) [0x00A73E8C]
	(No symbol) [0x00A8DF19]
	(No symbol) [0x00A6D096]
	(No symbol) [0x00A3C840]
	(No symbol) [0x00A3D6A4]
	GetHandleVerifier [0x00E645A3+2701795]
	GetHandleVerifier [0x00E5FD26+2683238]
	GetHandleVerifier [0x00E7AA6E+2793134]
	GetHandleVerifier [0x00BF6945+155013]
	GetHandleVerifier [0x00BFD02D+181357]
	GetHandleVerifier [0x00BE74D8+92440]
	GetHandleVerifier [0x00BE7680+92864]
	GetHandleVerifier [0x00BD2070+5296]
	BaseThreadInitThunk [0x766B7BA9+25]
	RtlInitializeExceptionChain [0x775FC0CB+107]
	RtlClearBits [0x775FC04F+191]
