In [3]:
import re
import time
import unicodedata

import pandas as pd
import requests

# Gateway endpoints: the paginated ad listing, and the per-ad detail record
# (the ad id is substituted into the trailing "{}").
LISTING_URL = "https://gateway.chotot.com/v1/public/ad-listing"
DETAIL_URL = LISTING_URL + "/{}"

_SITE = "https://www.nhatot.com"

# Headers sent with every request. The cookie value is a placeholder that has
# to be replaced with a real one before running.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Origin": _SITE,
    "Referer": _SITE + "/",
    "cookie": "__cf_bm=PASTE_YOUR_COOKIE_HERE",
}

# Base query: rooms for rent in Da Nang.
PARAMS_BASE = dict(
    region_v2=3017,  # Da Nang
    cg=1050,         # category: rooms for rent
    st="u,h",        # ad status filter
    limit=50,        # ads per page
)

# ------------------ HELPER FUNCTIONS ------------------
# (column label, compiled pattern) pairs, in output-column order. Patterns are
# matched against lower-cased, NFC-normalised text and cover both the accented
# spelling and a common unaccented spelling of each amenity.
# NOTE: the bare "giặt" alternative also matches any mention of washing, not
# only "máy giặt" (washing machine).
_FEATURE_PATTERNS = (
    ("Máy giặt", re.compile(r"máy[\s\-]?giặt|may giat|giặt")),
    ("Tủ lạnh", re.compile(r"tủ[\s\-]?lạnh|tu lanh|tulanh")),
    ("Wifi", re.compile(r"wi[-\s]?fi|wifi")),
    ("Điều hòa", re.compile(r"điều[\s\-]?hòa|dieu hoa|máy[\s\-]?lạnh|đh\b")),
)


def detect_features(text):
    """Extract amenity flags from an ad description.

    Args:
        text: Free-text description; ``None`` or an empty string is allowed.

    Returns:
        A dict with the keys "Máy giặt" (washing machine), "Tủ lạnh"
        (fridge), "Wifi" and "Điều hòa" (air conditioning), each mapped to
        1 when its pattern occurs in ``text`` and 0 otherwise.
    """
    if not text:
        return {label: 0 for label, _ in _FEATURE_PATTERNS}
    # Vietnamese text may arrive with decomposed diacritics; compose it so it
    # compares equal to the precomposed characters used in the patterns.
    t = unicodedata.normalize("NFC", text.lower())
    return {
        label: int(pattern.search(t) is not None)
        for label, pattern in _FEATURE_PATTERNS
    }

def safe_get_detail(ad_id, headers, max_retries=3):
    """Fetch one ad's detail record (description + parameters), with retries.

    Args:
        ad_id: Ad id substituted into ``DETAIL_URL``.
        headers: HTTP headers sent with each request.
        max_retries: Maximum number of attempts.

    Returns:
        ``(params_dict, desc)``. ``params_dict`` maps the ``"id"`` of each
        dict entry in the payload's ``"params"`` list to its ``"value"``;
        ``desc`` is the payload's ``"body"`` (``""`` when missing or empty).
        ``({}, "")`` is returned when every attempt fails or when a 200
        response does not decode to a JSON object.
    """
    backoff = 0.5
    for attempt in range(max_retries):
        delay = backoff
        try:
            r = requests.get(DETAIL_URL.format(ad_id), headers=headers, timeout=10)
            if r.status_code == 200:
                d = r.json()
                if not isinstance(d, dict):
                    # Valid JSON of the wrong shape will not change on retry.
                    return {}, ""
                raw_params = d.get("params")
                if not isinstance(raw_params, list):
                    # A missing or null "params" must not discard the body.
                    raw_params = []
                params_dict = {
                    p.get("id"): p.get("value")
                    for p in raw_params
                    if isinstance(p, dict)
                }
                desc = d.get("body", "") or ""
                return params_dict, desc
            if r.status_code in (403, 429):
                # Forbidden / too many requests: wait four times longer.
                delay = backoff * 4
        except (requests.RequestException, ValueError):
            # Network failure or undecodable body: retry after the base delay.
            pass
        # Sleeping after the last attempt would only delay the fallback return.
        if attempt < max_retries - 1:
            time.sleep(delay)
        backoff *= 1.8
    return {}, ""

# ------------------ SCRAPE DATA ------------------
all_data = []
offset = 0
limit = PARAMS_BASE["limit"]
page = 1

# Walk the listing page by page until a request fails, a page comes back
# empty, or the total reported by the API has been reached.
while True:
    params = {**PARAMS_BASE, "o": offset}
    try:
        r = requests.get(LISTING_URL, params=params, headers=HEADERS, timeout=15)
        if r.status_code != 200:
            print(f"⚠️ Lỗi {r.status_code} tại offset={offset}")
            break
        data = r.json()
        # A null "ads" value is treated as an empty page instead of failing
        # in len() and being reported as a connection error.
        ads = data.get("ads") or []
        total_ads = data.get("total_ads")
        print(f"📄 Trang {page} | Offset {offset} → {len(ads)} tin (Total_ads={total_ads})")
    except Exception as e:
        # Stop paging but keep whatever has been collected so far.
        print(f"⚠️ Lỗi kết nối: {e}")
        break

    if not ads:
        print("Không còn dữ liệu → Dừng.")
        break

    for ad in ads:
        ad_id = ad.get("list_id")
        if not ad_id:
            continue

        # Ads without a price string are skipped.
        price_str = ad.get("price_string", "")
        if not price_str:
            continue

        # --- Fetch the detail record ---
        # params_dict is fetched alongside the description but is not used yet.
        params_dict, desc = safe_get_detail(ad_id, HEADERS)
        if not desc:
            # Fall back to the body carried by the listing entry itself.
            desc = ad.get("body", "") or ""

        features = detect_features(desc)

        # A null area_name becomes an empty cell rather than the text "None".
        area_name = ad.get("area_name")

        all_data.append({
            "Location": "" if area_name is None else f"{area_name}",
            "Price": price_str,
            "Area": f"{ad.get('size')} m²" if ad.get("size") else None,
            **features,
        })

        # Pause between detail requests.
        time.sleep(0.3)

    offset += limit
    page += 1

    # total_ads may be None; in that case paging ends on the first empty page.
    if total_ads and offset >= total_ads:
        print("🎯 Đã lấy đủ tổng số tin API báo → Dừng.")
        break

    # Pause between listing pages.
    time.sleep(0.8)

# ------------------ SAVE FILE ------------------
df = pd.DataFrame(all_data)

# --- Output format ---
save_as_csv = True  # set to False to write an Excel file (.xlsx) instead

if save_as_csv:
    out_file = "nhatot_phongtro_danang_all.csv"
    # utf-8-sig prepends a BOM (presumably so spreadsheet applications detect
    # the encoding of the Vietnamese text).
    df.to_csv(out_file, index=False, encoding="utf-8-sig")
else:
    out_file = "nhatot_phongtro_danang_all.xlsx"
    df.to_excel(out_file, index=False, engine="openpyxl")

print(f"\n✅ Hoàn tất! Đã lưu {len(df)} tin vào {out_file}")


📄 Trang 1 | Offset 0 → 50 tin (Total_ads=None)
📄 Trang 2 | Offset 50 → 50 tin (Total_ads=None)
📄 Trang 3 | Offset 100 → 50 tin (Total_ads=None)
📄 Trang 4 | Offset 150 → 31 tin (Total_ads=None)
📄 Trang 5 | Offset 200 → 0 tin (Total_ads=None)
Không còn dữ liệu → Dừng.

✅ Hoàn tất! Đã lưu 181 tin vào nhatot_phongtro_danang_all.csv
