In [1]:
import requests
import pandas as pd
import time
import re
import datetime

# ------------------ API ------------------
# Listing endpoint (paged search results) and per-ad detail endpoint.
# DETAIL_URL is a format template: "{}" is filled with the ad's list_id.
LISTING_URL = "https://gateway.chotot.com/v1/public/ad-listing"
DETAIL_URL = "https://gateway.chotot.com/v1/public/ad-listing/{}"

# HTTP headers sent with every listing and detail request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Origin": "https://www.nhatot.com",
    "Referer": "https://www.nhatot.com/",
    "cookie": "__cf_bm=PASTE_YOUR_COOKIE_HERE"  # WARNING: replace with a real cookie before a real run
}

# Query parameters shared by every listing request; the scraping loop adds
# the paging offset ("o") on top of these.
PARAMS_BASE = {
    "region_v2": 3017,  # Da Nang
    "cg": 1050,         # Category: rooms for rent
    "st": "u,h",        # NOTE(review): meaning of "u,h" is not documented in this file
    "limit": 50,        # Page size; the scraping loop advances its offset by this amount
}

# ------------------ HELPER FUNCTIONS ------------------
def detect_features(text):
    """Flag which amenities are mentioned in a listing description.

    The description is lower-cased and searched with one regular expression
    per amenity (accented Vietnamese spellings plus unaccented variants such
    as "may giat", "tu lanh", "wifi", "dieu hoa").

    Args:
        text: Free-text description of the ad; may be empty or None.

    Returns:
        A dict with one entry per amenity label, each 1 if the amenity is
        mentioned and 0 otherwise. A falsy ``text`` yields all zeros.
    """
    # Amenity label -> pattern; insertion order defines the output column order.
    amenity_patterns = {
        "M√°y gi·∫∑t": r"m√°y[\s\-]?gi·∫∑t|may giat|gi·∫∑t",
        "T·ªß l·∫°nh": r"t·ªß[\s\-]?l·∫°nh|tu lanh|tulanh",
        "Wifi": r"wi[-\s]?fi|wifi",
        "ƒêi·ªÅu h√≤a": r"ƒëi·ªÅu[\s\-]?h√≤a|dieu hoa|m√°y[\s\-]?l·∫°nh|ƒëh\b",
    }

    if not text:
        return dict.fromkeys(amenity_patterns, 0)

    lowered = text.lower()
    return {
        label: int(re.search(pattern, lowered) is not None)
        for label, pattern in amenity_patterns.items()
    }

def safe_get_detail(ad_id, headers, max_retries=3):
    """Fetch the description text of a single ad, best-effort.

    Requests ``DETAIL_URL`` for ``ad_id`` up to ``max_retries`` times, waiting
    between attempts with a growing delay (0.5 s, then x1.8 each time).

    Args:
        ad_id: Identifier substituted into ``DETAIL_URL``.
        headers: HTTP headers to send with the request.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The ad's "body" text, or "" when every attempt failed or the response
        carried no body. Never raises for network or JSON decoding errors.
    """
    backoff = 0.5
    for attempt in range(max_retries):
        try:
            r = requests.get(DETAIL_URL.format(ad_id), headers=headers, timeout=10)
            if r.status_code == 200:
                payload = r.json()
                # A non-dict payload is treated like any other failed attempt.
                if isinstance(payload, dict):
                    body = payload.get("body")
                    # NOTE(review): the detail endpoint appears to nest the ad
                    # under an "ad" key; confirm against a live response. The
                    # top-level lookup above is kept for backward compatibility.
                    nested = payload.get("ad")
                    if not body and isinstance(nested, dict):
                        body = nested.get("body")
                    return body or ""
        except (requests.RequestException, ValueError):
            # Network failure or undecodable JSON: fall through and retry.
            pass

        # Only wait when another attempt will follow; sleeping after the last
        # attempt would just delay returning the empty result.
        if attempt < max_retries - 1:
            time.sleep(backoff)
            backoff *= 1.8
    return ""

# ------------------ SCRAPE DATA ------------------
# Walk the listing endpoint page by page (``limit`` ads per page). For every ad
# that has both an id and a price string, fetch its description, derive the
# amenity flags and append one flat record to ``all_data``.
all_data = []
limit = PARAMS_BASE["limit"]
offset, page = 0, 1

while True:
    params = dict(PARAMS_BASE, o=offset)
    try:
        r = requests.get(LISTING_URL, params=params, headers=HEADERS, timeout=15)
        if r.status_code != 200:
            print(f"‚ö†Ô∏è L·ªói {r.status_code} t·∫°i offset={offset}")
            break
        data = r.json()
        ads = data.get("ads", [])
        total_ads = data.get("total_ads")
        print(f"üìÑ Trang {page} | Offset {offset} ‚Üí {len(ads)} tin (Total_ads={total_ads})")
    except Exception as e:
        # Any connection/decoding problem ends the crawl with what was collected.
        print(f"‚ö†Ô∏è L·ªói k·∫øt n·ªëi: {e}")
        break

    # An empty page means there is nothing left to read.
    if not ads:
        break

    for ad in ads:
        ad_id = ad.get("list_id")
        if not ad_id:
            continue
        price_str = ad.get("price_string", "")
        if not price_str:
            continue

        # --- Posting date ---
        # Numeric timestamps above 1e12 are scaled down by 1000 before conversion.
        list_time = ad.get("list_time")
        post_date = None
        if isinstance(list_time, (int, float)) and list_time:
            try:
                list_time = list_time / 1000 if list_time > 1e12 else list_time
                post_date = datetime.datetime.fromtimestamp(list_time).strftime("%d/%m/%Y")
            except Exception:
                post_date = None

        # --- Description ---
        # Prefer the detail endpoint; fall back to the body in the listing entry.
        desc = safe_get_detail(ad_id, HEADERS) or ad.get("body", "") or ""
        features = detect_features(desc)

        size = ad.get("size")
        record = {
            "Location": ad.get("area_name", ""),
            "Price": price_str,
            "Area": f"{size} m¬≤" if size else None,
            "Ng√†y ƒëƒÉng": post_date,
        }
        record.update(features)
        all_data.append(record)

        # Short pause between ads.
        time.sleep(0.3)

    offset += limit
    page += 1

    # Stop once the reported total (when the API provides one) has been covered.
    if total_ads and offset >= total_ads:
        break

    # Longer pause between pages.
    time.sleep(0.8)

# ------------------ DATA PROCESSING ------------------
# One row per scraped ad, built from the record dicts collected in all_data.
df = pd.DataFrame(all_data)

def _parse_vn_number(token):
    """Convert a digit token that may use '.' or ',' separators to a float.

    Handles both decimal marks ("2,5", "2.5") and thousands grouping
    ("800.000", "1.200.000", "1,200.50").
    """
    head, *tail = re.split(r"[.,]", token)
    if not tail:
        return float(head)

    if "." in token and "," in token:
        # Mixed separators: the right-most one is the decimal mark and the
        # other kind groups thousands.
        decimal_mark = token[max(token.rfind("."), token.rfind(","))]
        whole, fraction = token.rsplit(decimal_mark, 1)
        return float(re.sub(r"[.,]", "", whole) + "." + fraction)

    # A single separator kind followed only by 3-digit groups is thousands
    # grouping ("800.000"), unless the leading group is zero or too long.
    if head.strip("0") and len(head) <= 3 and all(len(group) == 3 for group in tail):
        return float(head + "".join(tail))

    # Otherwise the first separator is a decimal mark ("2,5" or "2.5").
    return float(f"{head}.{tail[0]}")


def clean_price(price):
    """Convert a price string into a plain number.

    The first number found in the text is parsed (accepting both '.' and ','
    as decimal mark, and '.'/',' thousands grouping) and scaled by the unit
    word that appears anywhere in the text: million, thousand/"k", or billion,
    checked in that order. Without a unit word the number is returned as is.

    Args:
        price: Raw price value, typically a string; may be None/NaN.

    Returns:
        The price as a float, or None when the input is missing or contains
        no digits.
    """
    if pd.isna(price):
        return None
    text = str(price).lower().strip()
    # Previously only "\d+(\.\d+)?" was accepted, so "2,5" was truncated to 2
    # and "800.000" was read as 800.0.
    match = re.search(r"\d+(?:[.,]\d+)*", text)
    if not match:
        return None
    value = _parse_vn_number(match.group(0))
    if "tri·ªáu" in text:
        value *= 1_000_000
    elif "ngh√¨n" in text or "k" in text:
        value *= 1_000
    elif "t·ª∑" in text:
        value *= 1_000_000_000
    return value

def clean_area(area):
    """Extract the numeric part of an area string.

    Args:
        area: Raw area value such as "65.0 m2"; may be None/NaN.

    Returns:
        The first number found in the text as a float, or None when the input
        is missing or contains no digits.
    """
    if not pd.isna(area):
        found = re.search(r"(\d+(\.\d+)?)", str(area))
        if found:
            return float(found.group(1))
    return None

# Convert the raw price/area strings to numbers. Skipped when nothing was
# scraped: an empty DataFrame has no "Price"/"Area" columns, so indexing them
# would raise KeyError instead of printing an empty result.
if not df.empty:
    df["Price"] = df["Price"].apply(clean_price)
    df["Area"] = df["Area"].apply(clean_area)

# ------------------ PRINT RESULTS (FOR GITHUB ACTIONS) ------------------
# The CSV is written to stdout rather than to a file.
print("\n‚úÖ Ho√†n t·∫•t c√†o d·ªØ li·ªáu!")
print(df.to_csv(index=False, encoding="utf-8-sig"))


üìÑ Trang 1 | Offset 0 ‚Üí 50 tin (Total_ads=None)
üìÑ Trang 2 | Offset 50 ‚Üí 50 tin (Total_ads=None)
üìÑ Trang 3 | Offset 100 ‚Üí 50 tin (Total_ads=None)
üìÑ Trang 4 | Offset 150 ‚Üí 31 tin (Total_ads=None)
üìÑ Trang 5 | Offset 200 ‚Üí 0 tin (Total_ads=None)

‚úÖ Ho√†n t·∫•t c√†o d·ªØ li·ªáu!
Location,Price,Area,Ng√†y ƒëƒÉng,M√°y gi·∫∑t,T·ªß l·∫°nh,Wifi,ƒêi·ªÅu h√≤a
Qu·∫≠n Ng≈© H√†nh S∆°n,5000000.0,65.0,03/11/2025,0,0,0,1
Qu·∫≠n Ng≈© H√†nh S∆°n,3000000.0,35.0,03/11/2025,1,0,0,1
Qu·∫≠n Ng≈© H√†nh S∆°n,4000000.0,30.0,03/11/2025,0,0,0,0
Qu·∫≠n H·∫£i Ch√¢u,2000000.0,18.0,03/11/2025,1,0,1,1
Qu·∫≠n Ng≈© H√†nh S∆°n,5000000.0,103.0,03/11/2025,0,0,0,0
Qu·∫≠n Thanh Kh√™,2000000.0,15.0,03/11/2025,0,1,1,1
Qu·∫≠n Thanh Kh√™,3000000.0,35.0,03/11/2025,1,1,1,1
Qu·∫≠n Ng≈© H√†nh S∆°n,3000000.0,30.0,03/11/2025,0,1,0,1
Qu·∫≠n S∆°n Tr√†,2000000.0,30.0,03/11/2025,0,0,0,0
Qu·∫≠n Ng≈© H√†nh S∆°n,2000000.0,20.0,03/11/2025,0,0,0,0
Qu·∫≠n Ng≈© H√†nh S∆°n,2000000.0,15.0,02/11/2025,1,0,1,1
Qu·∫≠n S∆°n Tr√†