In [1]:
import pandas as pd
import numpy as np
import requests
import re
import json
from bs4 import BeautifulSoup

In [3]:
# ---------------------------------------------------------------------------
# Crawl configuration
# ---------------------------------------------------------------------------

# Number of listing pages to visit for each (location, listing type) pair.
TOTAL_PAGE = 10

# Listing-page URL template: location slug, listing-type slug, page number.
base_url = "https://nha.chotot.com/{}/{}?page={}"
# Per-listing API URL template: listing id.
api = "https://gateway.chotot.com/v1/public/ad-listing/{}"

# URL slug -> display name of each location to crawl.
locations = {
    "tp-ho-chi-minh": "TP Hồ Chí Minh",
    "binh-duong": "Bình Dương",
    "dong-nai": "Đồng Nai",
}

# URL slug -> display name of each listing type to crawl.
type_real_estates = {
    "mua-ban-can-ho-chung-cu": "Căn hộ/Chung cư",
    "mua-ban-nha-dat": "Nhà ở",
    "mua-ban-dat": "Đất",
}

# Attribute names kept for every listing. Despite the name this is a list;
# element order is preserved exactly.
attribute_map = [
    "id", "type_real_estate", "unitnumber",
    "ward", "area", "region", "address",
    "property_status", "price_m2",
    "direction", "balconydirection",
    "property_legal_document", "size", "block", "price",
    "floornumber", "apartment_type", "furnishing_sell", "apartment_feature",
    "rooms", "toilets", "floors",
    "house_type", "living_size", "width", "length",
    "land_type", "property_road_condition", "land_feature",
    "property_back_condition",
]


In [8]:
def parse_data_api(type_name, id, data, json_text):
    """Append one listing, parsed from an ad-listing API response, to ``data``.

    Exactly one value is appended to every column of ``data`` for each
    recorded listing, so all columns keep the same length and the
    accumulator can always be turned into a DataFrame. Columns the response
    does not provide are filled with ``np.nan``.

    Parameters
    ----------
    type_name : str
        Display name of the listing type; stored in ``type_real_estate``.
    id : str
        Listing id; stored in ``id``.
    data : dict[str, list]
        Column-name -> list accumulator, mutated in place. Its keys decide
        which API parameters are kept.
    json_text : str
        Raw body of the API response.

    Returns
    -------
    dict or None
        ``data`` when a row was appended. ``None`` when the listing was
        skipped: the body is not valid JSON, it has no ``"ad"`` object, or
        the ad's ``type_name`` is "Cần mua" (a "wanted to buy" post).
    """
    try:
        json_data = json.loads(json_text)
    except ValueError:
        # Non-JSON body (e.g. an HTML error page): skip this listing instead
        # of aborting the whole crawl.
        return None

    ad = json_data.get("ad") if isinstance(json_data, dict) else None
    if not isinstance(ad, dict):
        return None
    if ad.get("type_name") == "Cần mua":
        return None

    # Values supplied by the caller / the "ad" object take precedence over
    # any API parameter that happens to share the same id.
    row = {
        "type_real_estate": type_name,
        "id": id,
        "price": ad.get("price", np.nan),
    }

    for para in json_data.get("parameters") or []:
        if not isinstance(para, dict) or "value" not in para:
            continue
        para_id = para.get("id")
        if not isinstance(para_id, str):
            continue
        # Keep only known columns, and only the first value per column, so a
        # repeated parameter id can never make a column longer than the rest.
        if para_id in data and para_id not in row:
            row[para_id] = para["value"]

    for column, values in data.items():
        values.append(row.get(column, np.nan))
    return data


In [9]:
# Column-name -> list accumulator that parse_data_api fills, one value per
# column for every recorded listing.
# Derived from attribute_map so the column set is declared in a single place
# and cannot drift from the attribute list; each column gets its own list.
data = {attribute: [] for attribute in attribute_map}

# Crawl every (location, listing type) pair page by page. Each listing page
# is fetched with its own page number, and every listing link found on it is
# resolved through the ad-listing API and recorded by parse_data_api.
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the cell

count = 0
# location_name is not used by the crawl itself; only the URL slug is.
for location, location_name in locations.items():
    for type_real_estate, type_name in type_real_estates.items():
        for page in range(1, TOTAL_PAGE + 1):
            url = base_url.format(location, type_real_estate, page)
            try:
                result = requests.get(url, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as error:
                print("[WARN] cannot fetch {}: {}".format(url, error))
                continue
            if result.status_code != 200:
                # Never parse a missing or stale page: skip it explicitly.
                print("[WARN] {} returned HTTP {}".format(url, result.status_code))
                continue

            soup = BeautifulSoup(result.text, "html.parser")
            # NOTE(review): this class name looks build-generated and may
            # change on the site; an empty result here is worth checking.
            items = soup.find_all("a", class_="AdItem_adItem__2O28x", href=True)
            for item in items:
                href = item["href"]
                # The listing id is the run of digits right before ".htm".
                id_extract = re.findall(r"(\d+)\.htm", href)
                if len(id_extract) == 0:
                    continue
                print("[INFO] crawling item")
                try:
                    result_item = requests.get(
                        api.format(id_extract[0]), timeout=REQUEST_TIMEOUT
                    )
                except requests.RequestException as error:
                    print("[WARN] cannot fetch item {}: {}".format(id_extract[0], error))
                    continue
                if result_item.status_code != 200:
                    print(
                        "[WARN] item {} returned HTTP {}".format(
                            id_extract[0], result_item.status_code
                        )
                    )
                    continue
                parse_data_api(type_name, id_extract[0], data, result_item.text)
                count += 1
                print("Crawled {} item".format(count))


[INFO] crawling item
Crawled 1 item
[INFO] crawling item
Crawled 2 item
[INFO] crawling item
Crawled 3 item
[INFO] crawling item
Crawled 4 item
[INFO] crawling item
Crawled 5 item
[INFO] crawling item
Crawled 6 item
[INFO] crawling item
Crawled 7 item
[INFO] crawling item
Crawled 8 item
[INFO] crawling item
Crawled 9 item
[INFO] crawling item
Crawled 10 item
[INFO] crawling item
Crawled 11 item
[INFO] crawling item
Crawled 12 item
[INFO] crawling item
Crawled 13 item
[INFO] crawling item
Crawled 14 item
[INFO] crawling item
Crawled 15 item
[INFO] crawling item
Crawled 16 item
[INFO] crawling item
Crawled 17 item
[INFO] crawling item
Crawled 18 item
[INFO] crawling item
Crawled 19 item
[INFO] crawling item
Crawled 20 item
[INFO] crawling item
Crawled 21 item
[INFO] crawling item
Crawled 22 item
[INFO] crawling item
Crawled 23 item
[INFO] crawling item
Crawled 24 item
[INFO] crawling item
Crawled 25 item
[INFO] crawling item
Crawled 26 item
[INFO] crawling item
Crawled 27 item
[INFO] cra

In [10]:
# Turn the column accumulator into a table and persist it as CSV
# (the row index is not written).
df = pd.DataFrame(data=data)
df.to_csv(path_or_buf="dataset_new.csv", index=False)