In [1]:
import pandas as pd
import numpy as np
import requests
import re
import json
from bs4 import BeautifulSoup
import time

In [2]:
# Province/city URL slug -> display name. The slug fills the first placeholder
# of `base_url`; the display name is only printed as a progress header.
locations = {
    "dong-nai": "Đồng Nai", "binh-duong": "Bình Dương",
    "tp-ho-chi-minh": "TP Hồ Chí Minh",
}

# Listing-category URL slug -> label stored in the "type_real_estate" column.
type_real_estates = {
    "mua-ban-can-ho-chung-cu": "Căn hộ/Chung cư", "mua-ban-nha-dat": "Nhà ở",
    "mua-ban-dat": "Đất",
}

# Ids of the API "parameters" entries that are copied into the dataset; every
# id here is also a column name. The grouping comments are inferred from the
# names only -- NOTE(review): confirm against the API.
attribute_map = [
    # location
    "ward", "area", "region", "address",
    # general listing details
    "property_status", "price_m2", "direction", "balconydirection",
    "property_legal_document", "size",
    # apartment-looking fields
    "floornumber", "apartment_type", "furnishing_sell", "apartment_feature",
    "rooms", "toilets",
    # house-looking fields
    "floors", "house_type", "living_size", "width", "length",
    # land-looking fields
    "land_type", "property_road_condition", "land_feature",
    "property_back_condition",
]

# Listing page URL template: (location slug, category slug, page number).
base_url = "https://nha.chotot.com/{}/{}?page={}"
# Per-ad JSON endpoint template: (ad id).
api = "https://gateway.chotot.com/v1/public/ad-listing/{}"


In [3]:
def parse_data_api(type_name, id, data, json_text, attributes=None):
    """Append one listing from an ad-listing API response to ``data``.

    Exactly one value is appended to every column of ``data`` (or none at
    all), so the column lists always stay the same length.

    Args:
        type_name: Label stored in the ``"type_real_estate"`` column.
        id: Listing id stored in the ``"id"`` column.
        data: Dict of column name -> list; mutated in place. It must contain
            ``"id"``, ``"type_real_estate"``, ``"price"``, ``"list_time"``
            and one key per entry of ``attributes``.
        json_text: Raw JSON body returned by the API. An ``"ad"`` object and
            a ``"parameters"`` list are read from it; both are optional.
        attributes: Parameter ids to extract. Defaults to the module-level
            ``attribute_map``. Must not overlap the four fixed columns.

    Returns:
        ``data`` when a row was appended, or ``None`` when the ad was skipped
        because its ``type_name`` is "Cần mua" (Vietnamese for "want to buy").

    Raises:
        json.JSONDecodeError: If ``json_text`` is not valid JSON.
    """
    if attributes is None:
        attributes = attribute_map

    json_data = json.loads(json_text)
    # `or` also covers an explicit JSON null for either section.
    ad = json_data.get("ad") or {}
    parameters = json_data.get("parameters") or []

    if ad.get("type_name") == "Cần mua":
        return None

    # Collect everything first and only then touch `data`, so an unexpected
    # payload can never leave the columns with different lengths.
    values = {}
    for para in parameters:
        if not isinstance(para, dict):
            continue
        para_id = para.get("id")
        # Keep the first occurrence: appending a repeated id twice would
        # misalign the column against the others.
        if para_id in attributes and "value" in para and para_id not in values:
            values[para_id] = para["value"]

    data["type_real_estate"].append(type_name)
    data["id"].append(id)
    # Price and posting time live in "ad"; each one independently falls back
    # to NaN (a missing price must not discard a present list_time).
    data["price"].append(ad.get("price", np.nan))
    data["list_time"].append(ad.get("list_time", np.nan))

    # Attributes absent from the response are recorded as NaN.
    for attribute in attributes:
        data[attribute].append(values.get(attribute, np.nan))

    return data


In [4]:
# Empty dataset: one independent, initially empty list per output column.
# The key order below is the column order of any DataFrame built from it.
data = {
    column: []
    for column in (
        "id", "list_time", "type_real_estate",
        "ward", "area", "region", "address",
        "property_status", "price_m2", "direction", "balconydirection",
        "property_legal_document", "size",
        "floornumber", "apartment_type", "furnishing_sell", "apartment_feature",
        "rooms", "toilets",
        "floors", "house_type", "living_size", "width", "length",
        "land_type", "property_road_condition", "land_feature",
        "property_back_condition",
        "price",
    )
}


In [5]:
from http.client import RemoteDisconnected
from urllib3.exceptions import ProtocolError, NewConnectionError, MaxRetryError
from urllib.error import HTTPError
from socket import gaierror

In [6]:
# Crawl every (location, category) listing, fetch each ad through the JSON
# API, and write the rows collected so far to a CSV after each category.

RETRY_DELAY_SECONDS = 90      # pause before retrying a failed request
MAX_ATTEMPTS = 5              # attempts per URL before giving up on it
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout a stalled socket hangs forever

# requests raises subclasses of requests.exceptions.RequestException, which is
# NOT a subclass of the builtin ConnectionError, so it has to be listed
# explicitly. The remaining types are kept from the original handler.
NETWORK_ERRORS = (requests.exceptions.RequestException, ConnectionError,
                  ProtocolError, RemoteDisconnected, HTTPError, gaierror,
                  NewConnectionError, MaxRetryError)


def _new_dataset():
    """Return an empty column-name -> list dict for one location."""
    # Derived from attribute_map so the columns cannot drift away from the
    # attributes parse_data_api fills in.
    columns = ["id", "list_time", "type_real_estate", *attribute_map, "price"]
    return {column: [] for column in columns}


def _fetch(url):
    """GET ``url``; return the 200 response, or None if it cannot be fetched.

    Network errors, HTTP 429 and 5xx responses are retried up to MAX_ATTEMPTS
    times with RETRY_DELAY_SECONDS between attempts. Any other non-200 status
    (e.g. 404) returns None immediately, since a retry is unlikely to help.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        except NETWORK_ERRORS as e:
            print(e)
        else:
            if response.status_code == 200:
                return response
            if response.status_code != 429 and response.status_code < 500:
                return None
        if attempt < MAX_ATTEMPTS:
            time.sleep(RETRY_DELAY_SECONDS)
    return None


total = 0
for location, location_name in locations.items():
    data = _new_dataset()

    print(f'{location_name}:')
    for type_real_estate, type_name in type_real_estates.items():
        count = 0
        page = 1
        while True:
            url = base_url.format(location, type_real_estate, page)
            result = _fetch(url)
            if result is None:
                # Never fall through with an undefined or stale `items`.
                print(f'\n\tCould not fetch {url}; stopping this category')
                break

            soup = BeautifulSoup(result.text, "html.parser")
            # NOTE(review): this class name looks build-generated and may
            # change when the site is redeployed -- confirm it still matches.
            items = soup.find_all(
                "a", class_="AdItem_adItem__2O28x", href=True)

            # An empty page means this category has been fully crawled.
            if len(items) == 0:
                break

            for item in items:
                id_extract = re.findall(r"(\d+).htm", item["href"])

                # Skip only this link; the rest of the page is still valid.
                if len(id_extract) == 0:
                    continue

                result_item = _fetch(api.format(id_extract[0]))
                if result_item is None:
                    continue

                # parse_data_api returns None for ads it skips, so `count`
                # reflects the rows actually added.
                parsed = parse_data_api(
                    type_name, id_extract[0], data, result_item.text)
                if parsed is not None:
                    count += 1
                print("\r\tCrawled page {}: {} items".format(
                    page, count), end="", flush=True)

            page += 1

        total += count
        print(f'\r\t-{type_name} ({page-1} pages): {count} items', flush=True)

        # NOTE(review): `data` is reset per location, not per category, so
        # each CSV also contains the rows of the categories crawled before it
        # for the same location. Kept as-is; confirm this is intended.
        df = pd.DataFrame(data)
        df.to_csv(f'{location} {type_real_estate}.csv', index=False)

print(f'Total = {total} items')

Đồng Nai:
	Crawled page 2: 40 items

In [None]:
# df = pd.DataFrame(data)
# df.to_csv('raw data/raw data.csv', index=False)


In [None]:
# Print the distinct values of every column of `df`, one column at a time,
# as a quick sanity check of what was crawled.
for feature in df.columns:
    header = "Feature {}:".format(feature)
    print(header)
    distinct_values = df[feature].unique()
    print(distinct_values)
