In [None]:
import csv
import re
import sys
import time
from datetime import datetime, timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup

base_url = "https://chothuephongtro.me"
headers = {"User-Agent": "Mozilla/5.0"}

# Crawl limits: highest listing page to request, per-request timeout in
# seconds, and the pause between consecutive pages in seconds.
MAX_PAGES = 99
REQUEST_TIMEOUT = 10
PAGE_DELAY = 0.5

records = []


def _text_or_none(tag):
    """Return the whitespace-stripped text of a parsed tag, or None if it is missing."""
    return tag.get_text(strip=True) if tag else None


# --- Crawl the paginated listing ---
for page_num in range(1, MAX_PAGES + 1):
    url = f"{base_url}/da-nang.html?page={page_num}"
    # Without a timeout a stalled connection would block the script forever.
    res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    if not res.ok:
        # Do not parse an error page as if it were a listing; keep what was
        # collected so far. The warning goes to stderr so that stdout stays
        # reserved for the CSV output.
        print(
            f"Stopping crawl: {url} returned HTTP {res.status_code}",
            file=sys.stderr,
        )
        break
    soup = BeautifulSoup(res.text, "html.parser")

    posts = soup.find_all("article", class_="post-item")
    if not posts:
        break  # no more pages

    for post in posts:
        record = {
            "location": _text_or_none(post.find("span", class_="location")),
            "price": _text_or_none(post.find("span", class_="post-price")),
            "area": _text_or_none(post.find("span", class_="acreage")),
            "date_posted": _text_or_none(post.find("span", class_="date")),
            # Amenity flags are not scraped; every row gets the placeholder "0".
            "fridge": "0",
            "washer": "0",
            "air_condition": "0",
            "wifi": "0",
        }
        records.append(record)

    # Pause between pages to avoid getting blocked.
    time.sleep(PAGE_DELAY)

# --- Build the DataFrame ---
df = pd.DataFrame(records)

# --- Data-cleaning helpers ---
def clean_price(price):
    """Parse a scraped price label into an amount in dong (VND).

    The first number in the text is used. Both "." and "," are accepted as
    the decimal mark ("2.5 triệu", "3,5 triệu"). The unit that follows the
    number selects the scale:

    * "đồng" / "đ" / "vnd" / "vnđ": the number is already an absolute
      amount and its separators are thousands groupings ("800.000 đồng").
    * "nghìn" / "ngàn" / "k": thousands.
    * anything else (including "triệu" or no unit): millions, which is the
      scale the label was always assumed to use.

    Args:
        price: Raw label text, or a missing value (None / NaN).

    Returns:
        The price as a float, or None when the value is missing or contains
        no number.
    """
    if pd.isna(price):
        return None

    text = str(price).lower()
    token = re.search(r"\d+(?:[.,]\d+)*", text)
    if token is None:
        return None

    number = token.group(0)
    unit = text[token.end():].lstrip()

    # "đ\b" keeps words that merely start with "đ" (e.g. "đến") from being
    # mistaken for the currency abbreviation.
    if re.match(r"đồng|đ\b|vnd|vnđ", unit):
        return float(re.sub(r"[.,]", "", number))

    # Only the first separator is a decimal mark; any further groups are
    # ignored so that "1.200.000" still reads as 1.2 (million).
    leading = re.match(r"\d+(?:[.,]\d+)?", number).group(0)
    value = float(leading.replace(",", "."))

    multiplier = 1_000 if re.match(r"nghìn|ngàn|k\b", unit) else 1_000_000
    return value * multiplier

def clean_area(area):
    """Extract the first number from a scraped area label.

    Both "." and "," are accepted as the decimal mark, so "20,5 m2" yields
    20.5 rather than being truncated at the comma.

    Args:
        area: Raw label text, or a missing value (None / NaN).

    Returns:
        The number as a float, or None when the value is missing or
        contains no number.
    """
    if pd.isna(area):
        return None

    match = re.search(r"\d+(?:[.,]\d+)?", str(area))
    if match is None:
        return None
    return float(match.group(0).replace(",", "."))

# Relative-age units whose length depends on the calendar, mapped to the
# matching pd.DateOffset keyword.
_CALENDAR_UNITS = {"năm": "years", "tháng": "months"}

# Relative-age units of fixed length, mapped to the matching timedelta keyword.
_FIXED_UNITS = {
    "tuần": "weeks",
    "ngày": "days",
    "giờ": "hours",
    "phút": "minutes",
    "giây": "seconds",
}


def convert_date(text):
    """Convert a scraped posting-date label to an ISO ``YYYY-MM-DD`` string.

    Matching is case-insensitive and ignores surrounding whitespace.
    Recognised forms, all anchored at the start of the label:

    * "hôm nay" (today) and "hôm qua" (yesterday);
    * "<n> <unit> trước" where unit is năm, tháng, tuần, ngày, giờ, phút or
      giây, interpreted relative to the current local time;
    * "dd/mm/yyyy", optionally followed by other text such as a time.

    Args:
        text: Raw label text, or a missing value (None / NaN).

    Returns:
        The date string, or None when the value is missing, is not in a
        recognised form, or is not a valid calendar date.
    """
    if pd.isna(text):
        return None

    text = str(text).strip().lower()
    today = datetime.today()

    if text.startswith("hôm nay"):
        return today.strftime("%Y-%m-%d")
    if text.startswith("hôm qua"):
        return (today - timedelta(days=1)).strftime("%Y-%m-%d")

    relative = re.match(
        r"(\d+)\s*(năm|tháng|tuần|ngày|giờ|phút|giây)\s*trước", text
    )
    if relative:
        amount = int(relative.group(1))
        unit = relative.group(2)
        if unit in _CALENDAR_UNITS:
            # Years and months vary in length, so use calendar arithmetic.
            offset = pd.DateOffset(**{_CALENDAR_UNITS[unit]: amount})
        else:
            offset = timedelta(**{_FIXED_UNITS[unit]: amount})
        return (today - offset).strftime("%Y-%m-%d")

    # Absolute date in dd/mm/yyyy form; parse only the matched part so that
    # trailing text does not make the parse fail.
    absolute = re.match(r"\d{2}/\d{2}/\d{4}", text)
    if absolute:
        try:
            parsed = datetime.strptime(absolute.group(0), "%d/%m/%Y")
        except ValueError:
            # Looks like a date but is not one (e.g. 31/02/2024).
            return None
        return parsed.strftime("%Y-%m-%d")

    return None

# --- Apply the cleaning functions ---
columns = [
    "location",
    "price",
    "area",
    "date_posted",
    "fridge",
    "washer",
    "air_condition",
    "wifi",
]

# A crawl that collected nothing produces a frame with no columns at all;
# reindexing guarantees the column lookups below cannot raise KeyError.
df = df.reindex(columns=columns)

df["price"] = df["price"].apply(clean_price)
df["area"] = df["area"].apply(clean_area)
df["date_posted"] = df["date_posted"].apply(convert_date)

# --- Write CSV to stdout for the workflow to read ---
# csv.writer quotes fields that contain commas or quotes, which a plain
# f-string join does not.
writer = csv.writer(sys.stdout, lineterminator="\n")
writer.writerow(columns)
for row in df.itertuples(index=False, name=None):
    # Missing values (None or NaN) become empty fields; NaN is truthy, so an
    # `or ''` fallback would have printed the literal text "nan".
    writer.writerow(["" if pd.isna(value) else str(value) for value in row])
