In [2]:
!pip install beautifulsoup4 tqdm requests



In [3]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import time
import csv
import os

In [4]:
# Root URL of the site being crawled.
BASE_URL = "https://phongtro123.com"

# Headers sent with every HTTP request (a browser-like User-Agent).
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

In [5]:
def get_post_links(page, timeout=15):
    """Collect the post URLs listed on one page of the site index.

    Args:
        page: Page number, sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of post URLs in document order. Site-relative links
        (starting with ``/``) are made absolute using ``BASE_URL``.
        An empty list is returned when the page cannot be fetched.
    """
    try:
        res = requests.get(
            BASE_URL,
            params={"page": page},
            headers=HEADERS,
            timeout=timeout,  # without a timeout a stalled connection blocks forever
        )
        res.raise_for_status()
    except requests.RequestException as e:
        # Skip this page instead of aborting the whole crawl.
        print(f"Error fetching page {page}: {e}")
        return []

    soup = BeautifulSoup(res.text, 'html.parser')

    post_links = []
    for figure in soup.find_all("figure"):
        classes = figure.get("class", [])
        # Only <figure> elements whose class starts with "post__thumb" wrap a post link.
        if not any(cls.startswith("post__thumb") for cls in classes):
            continue

        a_tag = figure.find("a", href=True)
        if not a_tag:
            continue

        href = a_tag['href']
        if href.startswith('/'):
            href = BASE_URL + href
        post_links.append(href)

    return post_links

In [6]:
def get_post_dates(soup):
    """Read the posting and expiry dates from the page's <time> tags.

    The values come from the ``title`` attribute of the second and third
    <time> elements found in ``soup``.

    Args:
        soup: Parsed post page (anything exposing ``find_all``).

    Returns:
        A ``(post_date, expire_date)`` tuple of stripped strings, or
        ``(None, None)`` when the page has fewer than three <time> tags.
    """
    times = soup.find_all("time")
    # Index 2 is read below, so at least three elements are required.
    if len(times) < 3:
        return None, None

    post_date = times[1].get("title", "").strip()
    expire_date = times[2].get("title", "").strip()
    return post_date, expire_date

In [7]:
def get_post_id(soup):
    """Return the post identifier shown on the page, without its ``#`` prefix.

    Scans the ``<td class="pb-1">`` cells in order and uses the first one
    whose stripped text starts with ``#``. Returns ``None`` when no cell
    matches.
    """
    cell_texts = (
        cell.get_text(strip=True)
        for cell in soup.find_all("td", class_="pb-1")
    )
    tagged = next((text for text in cell_texts if text.startswith("#")), None)
    if tagged is None:
        return None
    # Drop the leading '#' marker(s).
    return tagged.lstrip("#")

In [8]:
def extract_description(soup):
    """Extract the post's free-text description.

    Finds the <h2> whose string contains "Thông tin mô tả", takes its
    nearest enclosing <div>, and joins the text of every <p> inside that
    div with ``|``.

    Returns:
        The joined description, or ``""`` when the heading is absent or
        any error occurs while reading it (the error is printed).
    """
    def _is_description_heading(heading_text):
        return heading_text and "Thông tin mô tả" in heading_text

    try:
        # 1. Locate the section heading.
        heading = soup.find('h2', string=_is_description_heading)
        if not heading:
            return ""

        # 2. The enclosing <div> holds the whole description section.
        container = heading.find_parent('div')

        # 3. Gather every paragraph inside that container.
        parts = [p.get_text(strip=True) for p in container.find_all('p')]
        return '|'.join(parts)
    except Exception as e:
        print(f"Lỗi khi lấy mô tả: {e}")
        return ""

In [9]:
def _required_text(soup, selector, field):
    """Return the stripped text of the first node matching ``selector``.

    Raises ValueError naming ``field`` when nothing matches, so the caller's
    error message says which part of the page was missing.
    """
    node = soup.select_one(selector)
    if node is None:
        raise ValueError(f"missing {field} (selector {selector!r})")
    return node.text.strip()


def crawl_post_data(post_url, timeout=15):
    """Download one post page and extract its fields.

    Args:
        post_url: Absolute URL of the post.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``title``, ``price``, ``address``, ``area``,
        ``description``, ``contact_name``, ``phone``, ``post_date``,
        ``expire_date``, ``post_id`` and ``url``; or ``None`` when the page
        cannot be fetched or a required element is missing (the error is
        printed). ``address`` and ``area`` default to ``""`` when not found.
    """
    try:
        res = requests.get(post_url, headers=HEADERS, timeout=timeout)
        # Fail early on HTTP errors instead of parsing an error page.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')

        title = _required_text(soup, 'h1.fs-4.fw-semibold.lh-sm.mb-25', 'title')
        price = _required_text(soup, 'span.text-green.fs-5.fw-bold', 'price')

        # Address: first link-free <td class="pb-1"> whose text contains one
        # of the Vietnamese address keywords.
        address = ""
        for td in soup.select('td.pb-1'):
            if td.find('a') is not None:
                continue
            text = td.text.strip()
            if ("Phố" in text and "Thành" not in text) or "Phường" in text or "Đường" in text or "Xã" in text:
                address = text
                break

        # Area: the third <span> of the first div.d-flex whose text contains "m".
        area = ""
        for div in soup.select('div.d-flex'):
            spans = div.find_all('span')
            if len(spans) >= 3:
                # separator='' keeps the text of nested tags (e.g. <sup>2</sup>) attached.
                raw_area = spans[2].get_text(separator='', strip=True)
                if "m" in raw_area:
                    area = raw_area
                    break

        description = extract_description(soup)
        contact_name = _required_text(soup, 'div.fs-5.fw-medium.me-2', 'contact name')
        phone = _required_text(
            soup,
            'a.btn.btn-green.text-white.d-flex.justify-content-center.rounded-4',
            'phone',
        )
        post_date, expire_date = get_post_dates(soup)
        post_id = get_post_id(soup)

        return {
            "title": title,
            "price": price,
            "address": address,
            "area": area,
            "description": description,
            "contact_name": contact_name,
            "phone": phone,
            "post_date": post_date,
            "expire_date": expire_date,
            "post_id": post_id,
            "url": post_url
        }
    except Exception as e:
        # Best effort: a post that fails to download or parse is reported and skipped.
        print(f"Error crawling {post_url}: {e}")
        return None

In [10]:
def save_to_csv(data, filename='motel_data.csv'):
    """Write crawled records to a UTF-8 encoded CSV file.

    Args:
        data: List of dicts; the keys of the first record become the header.
        filename: Output path. A relative path is resolved against the
            current working directory.

    When ``data`` is empty nothing is written and a notice is printed,
    instead of failing on ``data[0]``.
    """
    if not data:
        print("No records to save; skipping CSV export.")
        return

    fieldnames = list(data[0].keys())
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
In [11]:
def crawl_many_pages(num_pages, delay=0.5):
    """Crawl every post listed on index pages 1..num_pages.

    Args:
        num_pages: Number of index pages to walk, starting at page 1.
        delay: Seconds to sleep after each post request, to avoid being
            IP-blocked by the site.

    Returns:
        A list with one record (dict) per successfully crawled post. A URL
        that appears more than once across the listing pages is crawled
        only the first time it is seen.
    """
    results = []
    seen_links = set()

    for page in range(1, num_pages + 1):
        print(f"Page {page}")

        # Keep page order but drop URLs already handled, so the same post
        # is not downloaded and stored twice.
        new_links = []
        for link in get_post_links(page):
            if link not in seen_links:
                seen_links.add(link)
                new_links.append(link)

        for link in tqdm(new_links):
            data = crawl_post_data(link)
            if data:
                results.append(data)
            time.sleep(delay)  # throttle requests to avoid an IP block

    return results

In [12]:
# Crawl settings kept in one place so the saved file and the message below
# can never disagree.
NUM_PAGES = 50
OUTPUT_FILENAME = "motel_data.csv"

all_data = crawl_many_pages(num_pages=NUM_PAGES)
if all_data:
    save_to_csv(all_data, OUTPUT_FILENAME)
    print(f"Đã lưu {len(all_data)} bài đăng vào {OUTPUT_FILENAME}")
else:
    # Nothing was collected (e.g. the site was unreachable); skip the export.
    print("Không thu thập được bài đăng nào, không có gì để lưu.")

Page 1


100%|██████████| 20/20 [00:34<00:00,  1.71s/it]


Page 2


100%|██████████| 20/20 [00:33<00:00,  1.66s/it]


Page 3


100%|██████████| 20/20 [00:33<00:00,  1.66s/it]


Page 4


100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


Page 5


100%|██████████| 20/20 [00:34<00:00,  1.70s/it]


Page 6


100%|██████████| 20/20 [00:33<00:00,  1.67s/it]


Page 7


100%|██████████| 20/20 [00:31<00:00,  1.56s/it]


Page 8


100%|██████████| 20/20 [00:31<00:00,  1.59s/it]


Page 9


100%|██████████| 20/20 [00:32<00:00,  1.62s/it]


Page 10


100%|██████████| 20/20 [00:32<00:00,  1.63s/it]


Page 11


100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


Page 12


100%|██████████| 20/20 [00:32<00:00,  1.64s/it]


Page 13


100%|██████████| 20/20 [00:33<00:00,  1.68s/it]


Page 14


100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


Page 15


100%|██████████| 20/20 [00:30<00:00,  1.51s/it]


Page 16


100%|██████████| 20/20 [00:31<00:00,  1.56s/it]


Page 17


100%|██████████| 20/20 [00:31<00:00,  1.60s/it]


Page 18


100%|██████████| 20/20 [00:33<00:00,  1.67s/it]


Page 19


100%|██████████| 20/20 [00:33<00:00,  1.66s/it]


Page 20


100%|██████████| 20/20 [00:28<00:00,  1.43s/it]


Page 21


100%|██████████| 20/20 [00:29<00:00,  1.50s/it]


Page 22


100%|██████████| 20/20 [00:33<00:00,  1.67s/it]


Page 23


100%|██████████| 20/20 [00:33<00:00,  1.67s/it]


Page 24


100%|██████████| 20/20 [00:33<00:00,  1.67s/it]


Page 25


100%|██████████| 20/20 [00:32<00:00,  1.63s/it]


Page 26


100%|██████████| 20/20 [00:30<00:00,  1.54s/it]


Page 27


100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


Page 28


100%|██████████| 20/20 [00:32<00:00,  1.62s/it]


Page 29


100%|██████████| 20/20 [00:31<00:00,  1.60s/it]


Page 30


100%|██████████| 20/20 [00:29<00:00,  1.48s/it]


Page 31


100%|██████████| 20/20 [00:32<00:00,  1.60s/it]


Page 32


100%|██████████| 20/20 [00:31<00:00,  1.57s/it]


Page 33


100%|██████████| 20/20 [00:33<00:00,  1.69s/it]


Page 34


100%|██████████| 20/20 [00:29<00:00,  1.49s/it]


Page 35


100%|██████████| 20/20 [00:31<00:00,  1.57s/it]


Page 36


100%|██████████| 20/20 [00:31<00:00,  1.56s/it]


Page 37


100%|██████████| 20/20 [00:32<00:00,  1.64s/it]


Page 38


100%|██████████| 20/20 [00:33<00:00,  1.69s/it]


Page 39


100%|██████████| 20/20 [00:30<00:00,  1.52s/it]


Page 40


100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


Page 41


100%|██████████| 20/20 [00:30<00:00,  1.50s/it]


Page 42


100%|██████████| 20/20 [00:31<00:00,  1.59s/it]


Page 43


100%|██████████| 20/20 [00:30<00:00,  1.55s/it]


Page 44


100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


Page 45


100%|██████████| 20/20 [00:31<00:00,  1.57s/it]


Page 46


100%|██████████| 20/20 [00:29<00:00,  1.49s/it]


Page 47


100%|██████████| 20/20 [00:31<00:00,  1.58s/it]


Page 48


100%|██████████| 20/20 [00:33<00:00,  1.69s/it]


Page 49


100%|██████████| 20/20 [00:31<00:00,  1.59s/it]


Page 50


100%|██████████| 20/20 [00:31<00:00,  1.57s/it]

Đã lưu 1000 bài đăng vào motel_data.csv



